
Commit 40b69e3

Authored by zhangyue, Roger Wang, and Isotr0py
[Model] Add PaddleOCR-VL Model Support (vllm-project#27758)
Signed-off-by: zhangyue <zhangyue66@baidu.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: zhangyue66 <zhangyue66@baidu.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
1 parent 3225729 commit 40b69e3

File tree

7 files changed: +1475, -0 lines changed
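As a quick orientation, here is a minimal offline-inference sketch assembled from the example code this commit adds; the image path and the question are illustrative placeholders, not part of the change.

from PIL import Image

from vllm import LLM, SamplingParams

# Engine arguments and prompt format mirror run_paddleocr_vl() added below;
# "sample_document.png" and the question are placeholders.
llm = LLM(
    model="PaddlePaddle/PaddleOCR-VL",
    trust_remote_code=True,
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
)

placeholder = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
prompt = f"<|begin_of_sentence|>User: OCR this page.{placeholder}\nAssistant: "
image = Image.open("sample_document.png")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, max_tokens=512),
)
print(outputs[0].outputs[0].text)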

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
@@ -675,6 +675,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
 | `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ |
 | `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | |
+| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I<sup>+</sup> | `PaddlePaddle/PaddleOCR-VL`, etc. | | |
 | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ |
 | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ |
 | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ |
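The new row advertises text plus one or more images (T + I<sup>+</sup>). For online serving, the client-side sketch below is a hedged example: it assumes the model is launched with `vllm serve PaddlePaddle/PaddleOCR-VL --trust-remote-code`, that the model repository ships a chat template, and that the endpoint, port, and image URL are placeholders.

from openai import OpenAI

# Placeholder endpoint for a locally running `vllm serve` instance.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="PaddlePaddle/PaddleOCR-VL",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "OCR this page."},
                # Placeholder image URL; any reachable document image works.
                {"type": "image_url", "image_url": {"url": "https://example.com/page.png"}},
            ],
        }
    ],
    max_tokens=512,
)
print(response.choices[0].message.content)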

examples/offline_inference/vision_language.py

Lines changed: 27 additions & 0 deletions
@@ -1242,6 +1242,32 @@ def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# PaddleOCR-VL
+def run_paddleocr_vl(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "PaddlePaddle/PaddleOCR-VL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        limit_mm_per_prompt={modality: 1},
+        trust_remote_code=True,
+    )
+
+    placeholder = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
+    prompts = [
+        (f"<|begin_of_sentence|>User: {question}{placeholder}\nAssistant: ")
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # PaliGemma
 def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1817,6 +1843,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
     "NVLM_D": run_nvlm_d,
     "ovis": run_ovis,
     "ovis2_5": run_ovis2_5,
+    "paddleocr_vl": run_paddleocr_vl,
     "paligemma": run_paligemma,
     "paligemma2": run_paligemma2,
     "phi3_v": run_phi3v,

examples/offline_inference/vision_language_multi_image.py

Lines changed: 22 additions & 0 deletions
@@ -801,6 +801,27 @@ def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_paddleocr_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "PaddlePaddle/PaddleOCR-VL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" * len(image_urls)
+    prompt = f"<|begin_of_sentence|>User: {question}{placeholders}\nAssistant: "
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "mistral-community/pixtral-12b"
 
@@ -1312,6 +1333,7 @@ def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
     "NVLM_D": load_nvlm_d,
     "ovis": load_ovis,
     "ovis2_5": load_ovis2_5,
+    "paddleocr_vl": load_paddleocr_vl,
     "phi3_v": load_phi3v,
     "phi4_mm": load_phi4mm,
    "phi4_multimodal": load_phi4_multimodal,

tests/models/registry.py

Lines changed: 4 additions & 0 deletions
@@ -712,6 +712,10 @@ def check_available_online(
         },
     ),
     "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True),
+    "PaddleOCRVLForConditionalGeneration": _HfExamplesInfo(
+        "PaddlePaddle/PaddleOCR-VL",
+        trust_remote_code=True,
+    ),
     "PaliGemmaForConditionalGeneration": _HfExamplesInfo(
         "google/paligemma-3b-mix-224",
         extras={"v2": "google/paligemma2-3b-ft-docci-448"},

vllm/model_executor/models/ernie45.py

Lines changed: 10 additions & 0 deletions
@@ -23,12 +23,22 @@
 # limitations under the License.
 """Inference-only Erine model compatible with HuggingFace weights."""
 
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 
 from .utils import PPMissingLayer
 
 
+@support_torch_compile(
+    # set dynamic_arg_dims to support mrope
+    dynamic_arg_dims={
+        "input_ids": 0,
+        "positions": -1,
+        "intermediate_tensors": 0,
+        "inputs_embeds": 0,
+    }
+)
 class Ernie4_5ForCausalLM(LlamaForCausalLM):
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
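The decorator tells the compilation machinery which argument dimensions are dynamic; positions uses -1 because with M-RoPE the positions tensor stacks three position streams and keeps the token count in its last dimension. The snippet below is purely illustrative (not vLLM code) and only demonstrates the shape difference.

import torch

num_tokens = 8
rope_positions = torch.arange(num_tokens)                 # standard RoPE: shape (8,)
mrope_positions = torch.arange(num_tokens).expand(3, -1)  # M-RoPE: shape (3, 8)

# In both layouts the token axis is the last dimension, hence dynamic dim -1.
assert rope_positions.shape[-1] == num_tokens
assert mrope_positions.shape[-1] == num_tokens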
