diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py
index 9ff12163b12a..18efa2524425 100644
--- a/src/transformers/models/glm4v/modular_glm4v.py
+++ b/src/transformers/models/glm4v/modular_glm4v.py
@@ -1605,7 +1605,7 @@ def __call__(
         if videos is not None:
             videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
             # If user has not requested video metadata, pop it
-            if "return_metadata" not in kwargs:
+            if not kwargs.get("return_metadata"):
                 video_metadata = videos_inputs.pop("video_metadata")
             else:
                 video_metadata = videos_inputs["video_metadata"]
diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py
index 79935cbde7b4..80b151566e5a 100644
--- a/src/transformers/models/glm4v/processing_glm4v.py
+++ b/src/transformers/models/glm4v/processing_glm4v.py
@@ -129,7 +129,7 @@ def __call__(
         if videos is not None:
             videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
             # If user has not requested video metadata, pop it
-            if "return_metadata" not in kwargs:
+            if not kwargs.get("return_metadata"):
                 video_metadata = videos_inputs.pop("video_metadata")
             else:
                 video_metadata = videos_inputs["video_metadata"]
diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
index 7758a23e2970..77eadc012b13 100644
--- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
@@ -1346,7 +1346,7 @@ def __call__(
             videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
             video_grid_thw = videos_inputs["video_grid_thw"]
             # If user has not requested video metadata, pop it
-            if "return_metadata" not in kwargs:
+            if not kwargs.get("return_metadata"):
                 video_metadata = videos_inputs.pop("video_metadata")
             else:
                 video_metadata = videos_inputs["video_metadata"]
diff --git a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py
index b86e3c282ed4..5137d916d810 100644
--- a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py
@@ -148,7 +148,7 @@ def __call__(
             videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
             video_grid_thw = videos_inputs["video_grid_thw"]
             # If user has not requested video metadata, pop it
-            if "return_metadata" not in kwargs:
+            if not kwargs.get("return_metadata"):
                 video_metadata = videos_inputs.pop("video_metadata")
             else:
                 video_metadata = videos_inputs["video_metadata"]
diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py
index 2ce6465ee971..98828b2d53a8 100644
--- a/src/transformers/models/smolvlm/processing_smolvlm.py
+++ b/src/transformers/models/smolvlm/processing_smolvlm.py
@@ -343,7 +343,7 @@ def __call__(
             # If user has not requested video metadata, pop it. By default metadata
             # is always returned to expand video tokens correctly
-            if "return_metadata" not in kwargs:
+            if not kwargs.get("return_metadata"):
                 vision_inputs.pop("video_metadata")
             inputs.update(vision_inputs)
diff --git a/src/transformers/models/video_llama_3/modular_video_llama_3.py b/src/transformers/models/video_llama_3/modular_video_llama_3.py
index 789248676ab7..cfcaf0dec655 100644
--- a/src/transformers/models/video_llama_3/modular_video_llama_3.py
+++ b/src/transformers/models/video_llama_3/modular_video_llama_3.py
@@ -1134,7 +1134,7 @@ def __call__(
                 for grid_thw, merge_size in zip(videos_inputs["video_grid_thw"], videos_inputs["video_merge_sizes"])
             ]
             video_compression_masks = videos_inputs["video_compression_mask"].split(num_video_tokens)
-            if "return_metadata" not in kwargs:
+            if not kwargs.get("return_metadata"):
                 video_metadata = videos_inputs.pop("video_metadata")
             else:
                 video_metadata = videos_inputs["video_metadata"]
diff --git a/src/transformers/models/video_llama_3/processing_video_llama_3.py b/src/transformers/models/video_llama_3/processing_video_llama_3.py
index d5ea2c75e9d8..7b36b4fd1d12 100644
--- a/src/transformers/models/video_llama_3/processing_video_llama_3.py
+++ b/src/transformers/models/video_llama_3/processing_video_llama_3.py
@@ -134,7 +134,7 @@ def __call__(
                 for grid_thw, merge_size in zip(videos_inputs["video_grid_thw"], videos_inputs["video_merge_sizes"])
             ]
            video_compression_masks = videos_inputs["video_compression_mask"].split(num_video_tokens)
-            if "return_metadata" not in kwargs:
+            if not kwargs.get("return_metadata"):
                 video_metadata = videos_inputs.pop("video_metadata")
            else:
                 video_metadata = videos_inputs["video_metadata"]
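
The same one-line change is applied in every processor above. As a minimal standalone sketch (illustration only, not part of the patch), the difference is that the old membership test treats an explicit `return_metadata=False` as a metadata request because the key is present, while `kwargs.get("return_metadata")` also treats a falsy value (False or None) as "not requested", so the metadata is popped:

# Illustration only; `old_check`/`new_check` are hypothetical helpers, not repo code.
def old_check(**kwargs):
    # Old behavior: metadata is popped only when the key is absent entirely.
    return "return_metadata" not in kwargs

def new_check(**kwargs):
    # New behavior: metadata is popped when the key is absent or set to a falsy value.
    return not kwargs.get("return_metadata")

print(old_check())                       # True  -> metadata popped
print(old_check(return_metadata=False))  # False -> metadata kept despite the explicit opt-out
print(new_check(return_metadata=False))  # True  -> metadata popped
print(new_check(return_metadata=True))   # False -> metadata kept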