                                         BaseProcessingInfo)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_list_of

 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP, SupportsQuant)
@@ -217,9 +216,6 @@ def wrapper(*args, **kwargs):

 class MultiModalProcessingInfo(BaseProcessingInfo):

-    def get_hf_config(self):
-        return self.ctx.model_config.hf_config
-
     def get_supported_mm_limits(self):
         return {"image": None}

@@ -784,6 +780,7 @@ def _can_concat(x: list[torch.Tensor]):
     },
     enable_if=can_enable_torch_compile)
 class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal):
+    merge_by_field_config = True
     # Backwards compatibility for prev released models. State dicts back then
     # had different formats and cannot be loaded with `AutoModel` mapping as is
     hf_to_vllm_mapper = WeightsMapper(
@@ -828,40 +825,27 @@ def get_language_model(self) -> torch.nn.Module:
         return self.model

     def get_multimodal_embeddings(self, **kwargs):
-        pixel_values = kwargs.pop("pixel_values", None)
-        pixel_values = pixel_values if pixel_values is not None else kwargs.pop(
-            "image_patches", None)
-        image_embeds = kwargs.pop("image_embeds", None)
+        pixel_values: Optional[torch.Tensor] = kwargs.pop("pixel_values", None)
+        image_embeds: Optional[torch.Tensor] = kwargs.pop("image_embeds", None)
+        # Model might use `image_patches` instead of `pixel_values`
+        if pixel_values is None:
+            pixel_values = kwargs.pop("image_patches", None)

         if image_embeds is not None:
             return image_embeds

-        if pixel_values is None and image_embeds is None:
+        if pixel_values is None:
             return None

         num_image_patches = kwargs.pop("num_image_patches")
         if pixel_values is not None:
-            if isinstance(pixel_values, torch.Tensor):
-                pixel_values = flatten_bn(pixel_values).to(self.dtype)
-            elif is_list_of(pixel_values, torch.Tensor):
-                pixel_values = flatten_and_concat(pixel_values).to(self.dtype)
-            else:
-                raise ValueError(
-                    f"Unsupported pixel_values type {type(pixel_values)}. "
-                    "Expected `torch.Tensor` or list of `torch.Tensor`.")
-
-            if isinstance(num_image_patches, list):
-                num_image_patches = torch.cat(num_image_patches)
-
             vision_embeddings = self.model.get_image_features(
-                pixel_values,
-                **{
-                    k: v.flatten(0, 1)
-                    for k, v in kwargs.items()
-                },
-            )
+                pixel_values, **kwargs)

             if isinstance(vision_embeddings, torch.Tensor):
+                if isinstance(num_image_patches, list):
+                    num_image_patches = torch.cat(num_image_patches)
+
                 if vision_embeddings.ndim == 2:
                     vision_embeddings = vision_embeddings.unsqueeze(0)

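For context, a rough sketch of the input layout the new `merge_by_field_config = True` flag relies on: with merging enabled, per-request multimodal fields such as `pixel_values` reach `get_multimodal_embeddings` already concatenated along the first dimension, which is why the manual `flatten_bn` / `flatten_and_concat` branches above could be dropped. The snippet below only illustrates that layout and is not vLLM code; the tensor shapes are made up.

import torch

# Two requests with 3 and 5 image patches each, hidden size 4 (made-up shapes).
per_request_pixel_values = [torch.randn(3, 4), torch.randn(5, 4)]

# With merging enabled, the model sees one flat tensor plus the per-image
# patch counts, instead of a nested list it has to flatten itself.
pixel_values = torch.cat(per_request_pixel_values, dim=0)  # shape (8, 4)
num_image_patches = torch.tensor([3, 5])

assert pixel_values.shape[0] == int(num_image_patches.sum())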