diff --git a/fastdeploy/engine/async_llm.py b/fastdeploy/engine/async_llm.py index 43a36585f43..e77c0a02af9 100644 --- a/fastdeploy/engine/async_llm.py +++ b/fastdeploy/engine/async_llm.py @@ -722,7 +722,7 @@ def _setting_environ_variables(self): "FLAGS_use_append_attn": 1, "NCCL_ALGO": "Ring", "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)), - "OMP_NUM_THREADS": int(os.getenv("OMP_NUM_THREADS", 3)), + "OMP_NUM_THREADS": 3, } # environment variables needed by Dy2St variables.update( diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 01f28819fbe..6271d054ab9 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -453,7 +453,7 @@ def _setting_environ_variables(self): "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python", "NCCL_ALGO": "Ring", "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)), - "OMP_NUM_THREADS": int(os.getenv("OMP_NUM_THREADS", 3)), + "OMP_NUM_THREADS": 3, "FD_ENABLE_PDL": envs.FD_ENABLE_PDL, } # environment variables needed by Dy2St diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 226f4e14c1d..6f42e232dbd 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -357,10 +357,6 @@ def __init__( self.output_sizes = output_sizes def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None): - weight_need_transpose = getattr(param, "weight_need_transpose", False) - if weight_need_transpose: - loaded_weight = get_tensor(loaded_weight).transpose([1, 0]) - assert loaded_shard_id in ["q_a", "kv_a"] if not param._is_initialized(): param.initialize() @@ -386,7 +382,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N else: loaded_weight = loaded_weight.cast(param.dtype) # (bukejiyu) After this fix, the early H2D copy for non-GPU devices is no longer needed and can be safely removed. - loaded_weight = get_tensor(loaded_weight) h2d_copy(param, loaded_weight) @@ -453,7 +448,17 @@ def __init__( if self.with_bias: # col parallel _set_var_distributed(self.bias, split_axis=1) - set_weight_attrs(self.bias, {"output_dim": True}) + set_weight_attrs( + self.bias, + { + "output_dim": True, + "weight_loader": ( + self.weight_loader + if hasattr(self, "weight_loader") + else default_weight_loader(self.fd_config) + ), + }, + ) class MergedColumnParallelLinear(ColumnParallelLinear): @@ -962,7 +967,10 @@ def __init__( self.num_heads_per_partition = divide(num_attention_heads, self.nranks) self.local_rank = fd_config.parallel_config.tensor_parallel_rank self.fd_config = fd_config - self.kv_b_proj = kv_b_proj + if self.fd_config.load_config.load_choices == "default_v1": + self.kv_b_proj = kv_b_proj + else: + self.kv_b_proj = None self.weight_dtype = self._helper.get_default_dtype() diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index c0644896e8e..adb13187eeb 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -141,7 +141,10 @@ def get_tensor(input: Union[paddle.Tensor, np.ndarray, str], model_path=None) -> if isinstance(input, paddle.Tensor): if input.place.is_cpu_place(): - return input.to(paddle.device.get_device()) + if current_platform.is_cuda(): + return input.cuda() + else: + return input.to(paddle.device.get_device()) return input elif isinstance(input, np.ndarray): return paddle.to_tensor(input) diff --git a/fastdeploy/model_executor/models/qwen2_5_vl/dfnrope/modeling.py b/fastdeploy/model_executor/models/qwen2_5_vl/dfnrope/modeling.py index 8f7b262432f..4414eb91712 100644 --- a/fastdeploy/model_executor/models/qwen2_5_vl/dfnrope/modeling.py +++ b/fastdeploy/model_executor/models/qwen2_5_vl/dfnrope/modeling.py @@ -32,7 +32,7 @@ from paddleformers.transformers.model_utils import PretrainedModel from fastdeploy.model_executor.layers.utils import divide, get_tensor -from fastdeploy.model_executor.utils import fd_cast, h2d_copy, set_weight_attrs +from fastdeploy.model_executor.utils import fd_cast, set_weight_attrs from .activation import ACT2FN from .configuration import DFNRopeVisionTransformerConfig @@ -151,7 +151,8 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N assert param.shape == shard_weight.shape, ( f" Attempted to load weight ({shard_weight.shape}) " f"into parameter ({param.shape})" ) - h2d_copy(param, shard_weight) + shard_weight = get_tensor(shard_weight) + param.copy_(shard_weight, False) def forward( self, diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index 3b42e0294e6..5e1613fb869 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -281,7 +281,6 @@ def default_weight_loader(fd_config: FDConfig = None) -> None: def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None): """fn""" - output_dim = getattr(param, "output_dim", None) weight_need_transpose = getattr(param, "weight_need_transpose", False) if weight_need_transpose: @@ -306,7 +305,8 @@ def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None): assert param.shape == loaded_weight.shape, ( f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})" ) - h2d_copy(dst=param, src=loaded_weight) + loaded_weight = get_tensor(loaded_weight) + param.copy_(loaded_weight, False) return fn @@ -365,8 +365,9 @@ def h2d_copy(dst, src, blocking=True): if not current_platform.is_cuda() or not is_paddle_support_new_h2d(): # For non-GPU devices, data is transferred to device (H2D) in advance. src = get_tensor(src) - if not dst._is_initialized(): - dst.initialize() + if len(src.shape) == 1: + # TODO (bukejiyu):A recently merged Paddle PR introduced a hang when copying 1-D non-contiguous tensors. This approach serves as a temporary workaround. + src = get_tensor(src) dst.copy_(src, blocking)