@@ -120,7 +120,7 @@
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
     "Lfm2ForCausalLM": ("lfm2", "Lfm2ForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
-    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),  # noqa: E501
+    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"),
@@ -204,7 +204,7 @@
     "LlavaNextForConditionalGeneration": (
         "llava_next",
         "LlavaNextForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
     # Technically Terratorch models work on images, both in
@@ -240,46 +240,46 @@
     "AyaVisionForConditionalGeneration": (
         "aya_vision",
         "AyaVisionForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
     "ChameleonForConditionalGeneration": (
         "chameleon",
         "ChameleonForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "Cohere2VisionForConditionalGeneration": (
         "cohere2_vision",
         "Cohere2VisionForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
     "DotsOCRForCausalLM": ("dots_ocr", "DotsOCRForCausalLM"),
     "Ernie4_5_VLMoeForConditionalGeneration": (
         "ernie45_vl",
         "Ernie4_5_VLMoeForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
     "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"),  # noqa: E501
     "Gemma3nForConditionalGeneration": (
         "gemma3n_mm",
         "Gemma3nForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
     "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"),  # noqa: E501
     "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"),  # noqa: E501
     "GraniteSpeechForConditionalGeneration": (
         "granite_speech",
         "GraniteSpeechForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
     "NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"),
     "InternS1ForConditionalGeneration": (
         "interns1",
         "InternS1ForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "InternVLForConditionalGeneration": (
         "interns1",
         "InternS1ForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "Idefics3ForConditionalGeneration": (
         "idefics3",
         "Idefics3ForConditionalGeneration",
@@ -289,7 +289,7 @@
     "KeyeVL1_5ForConditionalGeneration": (
         "keye_vl1_5",
         "KeyeVL1_5ForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"),
     "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"),  # noqa: E501
     "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"),
@@ -298,35 +298,35 @@
     "LlavaNextForConditionalGeneration": (
         "llava_next",
         "LlavaNextForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "LlavaNextVideoForConditionalGeneration": (
         "llava_next_video",
         "LlavaNextVideoForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "LlavaOnevisionForConditionalGeneration": (
         "llava_onevision",
         "LlavaOnevisionForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"),  # noqa: E501
     "MiDashengLMModel": ("midashenglm", "MiDashengLMModel"),
     "MiniMaxVL01ForConditionalGeneration": (
         "minimax_vl_01",
         "MiniMaxVL01ForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "MiniCPMO": ("minicpmo", "MiniCPMO"),
     "MiniCPMV": ("minicpmv", "MiniCPMV"),
     "Mistral3ForConditionalGeneration": (
         "mistral3",
         "Mistral3ForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"),
     "NVLM_D": ("nvlm_d", "NVLM_D_Model"),
     "Ovis": ("ovis", "Ovis"),
     "Ovis2_5": ("ovis2_5", "Ovis2_5"),
     "PaliGemmaForConditionalGeneration": (
         "paligemma",
         "PaliGemmaForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
     "Phi4MultimodalForCausalLM": ("phi4_multimodal", "Phi4MultimodalForCausalLM"),  # noqa: E501
@@ -336,31 +336,31 @@
     "Qwen2_5_VLForConditionalGeneration": (
         "qwen2_5_vl",
         "Qwen2_5_VLForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "Qwen2AudioForConditionalGeneration": (
         "qwen2_audio",
         "Qwen2AudioForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "Qwen2_5OmniModel": (
         "qwen2_5_omni_thinker",
         "Qwen2_5OmniThinkerForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "Qwen2_5OmniForConditionalGeneration": (
         "qwen2_5_omni_thinker",
         "Qwen2_5OmniThinkerForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "Qwen3VLForConditionalGeneration": ("qwen3_vl", "Qwen3VLForConditionalGeneration"),  # noqa: E501
     "Qwen3VLMoeForConditionalGeneration": (
         "qwen3_vl_moe",
         "Qwen3VLMoeForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"),
     "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"),  # noqa: E501
     "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"),  # noqa: E501
     "Tarsier2ForConditionalGeneration": (
         "qwen2_vl",
         "Tarsier2ForConditionalGeneration",
-    ),  # noqa: E501
+    ),
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
     # [Encoder-decoder]
@@ -401,23 +401,23 @@
     "TransformersMoEForMultimodalLM": (
         "transformers_moe",
         "TransformersMoEForMultimodalLM",
-    ),  # noqa: E501
+    ),
     "TransformersEmbeddingModel": (
         "transformers_pooling",
         "TransformersEmbeddingModel",
-    ),  # noqa: E501
+    ),
     "TransformersForSequenceClassification": (
         "transformers_pooling",
         "TransformersForSequenceClassification",
-    ),  # noqa: E501
+    ),
     "TransformersMoEForSequenceClassification": (
         "transformers_pooling",
         "TransformersMoEForSequenceClassification",
-    ),  # noqa: E501
+    ),
     "TransformersMoEEmbeddingModel": (
         "transformers_pooling",
         "TransformersMoEEmbeddingModel",
-    ),  # noqa: E501
+    ),
 }

 _VLLM_MODELS = {
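Each entry in these tables maps a Hugging Face architecture name to a (module, class name) pair rather than to the class itself, so the heavy model code is imported only when that architecture is actually requested; the diff itself just drops `# noqa: E501` suppressions that became unnecessary once the entries were wrapped onto multiple lines and fit the line-length limit. The sketch below is a minimal illustration of that lazy-resolution pattern under stated assumptions, not the project's actual loader: the `_MODELS` subset, the `mypkg.models` package prefix, and the `resolve_model_cls` helper are hypothetical stand-ins.

import importlib

# Hypothetical subset of the registry shown in the diff above; the real table
# is much larger and is consumed by the project's own loader, not this helper.
_MODELS = {
    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
}


def resolve_model_cls(architecture: str, package: str = "mypkg.models"):
    """Lazily import `<package>.<module>` and return the registered class."""
    try:
        module_name, class_name = _MODELS[architecture]
    except KeyError:
        raise ValueError(f"Unsupported architecture: {architecture!r}") from None
    module = importlib.import_module(f"{package}.{module_name}")
    return getattr(module, class_name)

Keeping the registry as plain strings means adding or renaming a model only touches this table, and importing the package does not pull in every model's optional dependencies at startup.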