fastdeploy/engine/async_llm.py (1 addition & 1 deletion)

@@ -722,7 +722,7 @@ def _setting_environ_variables(self):
             "FLAGS_use_append_attn": 1,
             "NCCL_ALGO": "Ring",
             "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
-            "OMP_NUM_THREADS": int(os.getenv("OMP_NUM_THREADS", 3)),
+            "OMP_NUM_THREADS": 3,
         }
         # environment variables needed by Dy2St
         variables.update(
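
For context, a minimal sketch (illustration only, not FastDeploy code) of the behavioural difference: the os.getenv form honours a user-supplied override, while the literal pins the OpenMP thread count regardless of the environment.

```python
import os

# Hypothetical scenario: the user exported OMP_NUM_THREADS=8 before launching.
os.environ["OMP_NUM_THREADS"] = "8"

configurable = int(os.getenv("OMP_NUM_THREADS", 3))  # 8: honours the override
pinned = 3                                           # 3: ignores the environment
print(configurable, pinned)
```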

fastdeploy/engine/engine.py (1 addition & 1 deletion)

@@ -453,7 +453,7 @@ def _setting_environ_variables(self):
             "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
             "NCCL_ALGO": "Ring",
             "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 1024)),
-            "OMP_NUM_THREADS": int(os.getenv("OMP_NUM_THREADS", 3)),
+            "OMP_NUM_THREADS": 3,
             "FD_ENABLE_PDL": envs.FD_ENABLE_PDL,
         }
         # environment variables needed by Dy2St
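
The variables dict collected by _setting_environ_variables presumably ends up in the process (or worker) environment; a minimal sketch of that step, under that assumption and with an invented helper name, is:

```python
import os

def apply_environ_variables(variables: dict) -> None:
    # Hypothetical helper: os.environ only accepts strings, so values are stringified.
    for key, value in variables.items():
        os.environ[key] = str(value)

apply_environ_variables({"NCCL_ALGO": "Ring", "OMP_NUM_THREADS": 3})
```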

fastdeploy/model_executor/layers/linear.py (15 additions & 7 deletions)

@@ -357,10 +357,6 @@ def __init__(
         self.output_sizes = output_sizes

     def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
-        weight_need_transpose = getattr(param, "weight_need_transpose", False)
-        if weight_need_transpose:
-            loaded_weight = get_tensor(loaded_weight).transpose([1, 0])
-
         assert loaded_shard_id in ["q_a", "kv_a"]
         if not param._is_initialized():
             param.initialize()
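
For reference, a minimal sketch (illustrative shapes, not FastDeploy code) of what the removed weight_need_transpose branch did: a checkpoint weight stored as [out_features, in_features] was flipped into the [in_features, out_features] layout the parameter expects.

```python
import paddle

# Hypothetical shapes; the removed branch simply swapped the two axes.
loaded_weight = paddle.rand([8, 4])           # e.g. checkpoint layout [out, in]
transposed = loaded_weight.transpose([1, 0])  # parameter layout [in, out]
print(transposed.shape)                       # [4, 8]
```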

@@ -386,7 +382,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
         else:
             loaded_weight = loaded_weight.cast(param.dtype)
         # (bukejiyu) After this fix, the early H2D copy for non-GPU devices is no longer needed and can be safely removed.
-        loaded_weight = get_tensor(loaded_weight)
         h2d_copy(param, loaded_weight)

@@ -453,7 +448,17 @@ def __init__(
         if self.with_bias:
             # col parallel
             _set_var_distributed(self.bias, split_axis=1)
-            set_weight_attrs(self.bias, {"output_dim": True})
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": True,
+                    "weight_loader": (
+                        self.weight_loader
+                        if hasattr(self, "weight_loader")
+                        else default_weight_loader(self.fd_config)
+                    ),
+                },
+            )


 class MergedColumnParallelLinear(ColumnParallelLinear):
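
A minimal sketch (assumed, not the actual FastDeploy load loop) of how a per-parameter weight_loader attribute registered this way is typically consumed when a checkpoint shard is applied; load_param and fallback_loader are illustrative names.

```python
# Hypothetical consumer of the attribute set via set_weight_attrs above.
def load_param(param, loaded_weight, fallback_loader):
    loader = getattr(param, "weight_loader", fallback_loader)
    loader(param, loaded_weight)
```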

@@ -962,7 +967,10 @@ def __init__(
         self.num_heads_per_partition = divide(num_attention_heads, self.nranks)
         self.local_rank = fd_config.parallel_config.tensor_parallel_rank
         self.fd_config = fd_config
-        self.kv_b_proj = kv_b_proj
+        if self.fd_config.load_config.load_choices == "default_v1":
+            self.kv_b_proj = kv_b_proj
+        else:
+            self.kv_b_proj = None

         self.weight_dtype = self._helper.get_default_dtype()

fastdeploy/model_executor/layers/utils.py (4 additions & 1 deletion)

@@ -141,7 +141,10 @@ def get_tensor(input: Union[paddle.Tensor, np.ndarray, str], model_path=None) ->

     if isinstance(input, paddle.Tensor):
         if input.place.is_cpu_place():
-            return input.to(paddle.device.get_device())
+            if current_platform.is_cuda():
+                return input.cuda()
+            else:
+                return input.to(paddle.device.get_device())
         return input
     elif isinstance(input, np.ndarray):
         return paddle.to_tensor(input)
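
A hedged usage sketch of the updated helper: get_tensor normalises its input to a device tensor, using Tensor.cuda() for CPU tensors on CUDA platforms and falling back to .to(paddle.device.get_device()) elsewhere. The inputs below are illustrative.

```python
import numpy as np
import paddle

from fastdeploy.model_executor.layers.utils import get_tensor

# Illustrative inputs, not real checkpoint data.
cpu_tensor = paddle.to_tensor(np.ones([2, 2], dtype="float32"), place=paddle.CPUPlace())
on_device = get_tensor(cpu_tensor)   # .cuda() on CUDA builds, .to(default device) otherwise
from_numpy = get_tensor(np.zeros([2], dtype="float32"))  # converted via paddle.to_tensor
print(on_device.place, from_numpy.place)
```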

DFNRope vision transformer module (file path not captured)

@@ -32,7 +32,7 @@
 from paddleformers.transformers.model_utils import PretrainedModel

 from fastdeploy.model_executor.layers.utils import divide, get_tensor
-from fastdeploy.model_executor.utils import fd_cast, h2d_copy, set_weight_attrs
+from fastdeploy.model_executor.utils import fd_cast, set_weight_attrs

 from .activation import ACT2FN
 from .configuration import DFNRopeVisionTransformerConfig

@@ -151,7 +151,8 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
         assert param.shape == shard_weight.shape, (
             f" Attempted to load weight ({shard_weight.shape}) " f"into parameter ({param.shape})"
         )
-        h2d_copy(param, shard_weight)
+        shard_weight = get_tensor(shard_weight)
+        param.copy_(shard_weight, False)

     def forward(
         self,

fastdeploy/model_executor/utils.py (5 additions & 4 deletions)

@@ -281,7 +281,6 @@ def default_weight_loader(fd_config: FDConfig = None) -> None:

     def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
         """fn"""
-
         output_dim = getattr(param, "output_dim", None)
         weight_need_transpose = getattr(param, "weight_need_transpose", False)
         if weight_need_transpose:

@@ -306,7 +305,8 @@ def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
         assert param.shape == loaded_weight.shape, (
             f" Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({param.shape})"
         )
-        h2d_copy(dst=param, src=loaded_weight)
+        loaded_weight = get_tensor(loaded_weight)
+        param.copy_(loaded_weight, False)

     return fn
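
A minimal sketch (assumed shapes, not FastDeploy code) of the pattern that replaces h2d_copy here: the loaded weight is first materialised on the target device via get_tensor, then written into the parameter with Tensor.copy_, non-blocking as in the diff.

```python
import numpy as np
import paddle

from fastdeploy.model_executor.layers.utils import get_tensor

# Hypothetical parameter/weight pair with matching shapes.
param = paddle.zeros([4, 8], dtype="float32")
loaded_weight = np.random.rand(4, 8).astype("float32")  # e.g. a host-side checkpoint shard

assert param.shape == list(loaded_weight.shape)
loaded_weight = get_tensor(loaded_weight)  # numpy array -> tensor on the default device
param.copy_(loaded_weight, False)          # non-blocking copy into the parameter
```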

@@ -365,8 +365,9 @@ def h2d_copy(dst, src, blocking=True):
     if not current_platform.is_cuda() or not is_paddle_support_new_h2d():
         # For non-GPU devices, data is transferred to device (H2D) in advance.
         src = get_tensor(src)
-    if not dst._is_initialized():
-        dst.initialize()
+    if len(src.shape) == 1:
+        # TODO (bukejiyu): A recently merged Paddle PR introduced a hang when copying 1-D non-contiguous tensors. This approach serves as a temporary workaround.
+        src = get_tensor(src)
     dst.copy_(src, blocking)
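
A hedged sketch of the case the workaround targets, under assumed shapes: slicing a single column out of a host-side 2-D tensor yields a 1-D view that may be non-contiguous, and materialising it with get_tensor before Tensor.copy_ sidesteps the hang mentioned in the TODO.

```python
import numpy as np
import paddle

from fastdeploy.model_executor.layers.utils import get_tensor

# Hypothetical host-side checkpoint tensor and a 1-D slice of it.
host_weight = paddle.to_tensor(np.random.rand(4, 8).astype("float32"), place=paddle.CPUPlace())
bias_view = host_weight[:, 0]        # 1-D, possibly non-contiguous view

dst = paddle.zeros([4], dtype="float32")
src = get_tensor(bias_view)          # materialise on the target device first
dst.copy_(src, True)                 # blocking copy, mirroring h2d_copy's default
```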