diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 38a768e7c641..a626a6db3698 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -1120,6 +1120,7 @@ ("wav2vec2-bert", "wav2vec2_bert"), ("vibevoice_acoustic_tokenizer_encoder", "vibevoice_acoustic_tokenizer"), ("vibevoice_acoustic_tokenizer_decoder", "vibevoice_acoustic_tokenizer"), + ("mlcd_vision_model", "mlcd"), ] ) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 5f607e6e7aa5..a3c3d9ae280c 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -228,6 +228,7 @@ ("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")), ("vitmatte", ("VitMatteImageProcessor", "VitMatteImageProcessorFast")), ("vitpose", ("VitPoseImageProcessor", "VitPoseImageProcessorFast")), + ("vivit", ("VivitImageProcessor", None)), ("xclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), ("yolos", ("YolosImageProcessor", "YolosImageProcessorFast")), ("zoedepth", ("ZoeDepthImageProcessor", "ZoeDepthImageProcessorFast")), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 5a0039fb016b..696675b29c78 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -984,7 +984,6 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("paligemma", "PaliGemmaForConditionalGeneration"), ("perception_lm", "PerceptionLMForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), - ("pixtral", "LlavaForConditionalGeneration"), ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), ("qwen2_vl", "Qwen2VLForConditionalGeneration"), ("qwen3_5", 
"Qwen3_5ForConditionalGeneration"), @@ -1617,7 +1617,6 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): # Model for Text-To-Waveform mapping ("bark", "BarkModel"), ("csm", "CsmForConditionalGeneration"), - ("fastspeech2_conformer", "FastSpeech2ConformerWithHifiGan"), ("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGan"), ("higgs_audio_v2", "HiggsAudioV2ForConditionalGeneration"), ("musicgen", "MusicgenForConditionalGeneration"), diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 227c4407f03d..9194218f6d8b 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -114,7 +114,7 @@ class EvollaConfig(PreTrainedConfig): >>> configuration = model.config ```""" - model_type = "EvollaModel" + model_type = "evolla" sub_configs = {"protein_encoder_config": SaProtConfig} default_theta = 500000.0 diff --git a/src/transformers/models/lasr/configuration_lasr.py b/src/transformers/models/lasr/configuration_lasr.py index 07b57ba4282b..50a85e9e8ebf 100644 --- a/src/transformers/models/lasr/configuration_lasr.py +++ b/src/transformers/models/lasr/configuration_lasr.py @@ -59,7 +59,7 @@ class LasrEncoderConfig(PreTrainedConfig): ``` This configuration class is based on the LasrEncoder architecture from Google Health AI. You can find more details - and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO). + and pre-trained models at [google/medasr](https://huggingface.co/google/medasr). """ model_type = "lasr_encoder" @@ -148,7 +148,7 @@ class LasrCTCConfig(PreTrainedConfig): >>> configuration = model.config ``` This configuration class is based on the Lasr CTC architecture from Google Health AI. You can find more details - and pre-trained models at [TODO/TODO](https://huggingface.co/TODO/TODO). + and pre-trained models at [google/medasr](https://huggingface.co/google/medasr). 
""" model_type = "lasr_ctc"