mindspore-lab
diff --git a/‎.github/workflows/ci_pipeline.yaml‎
Lines changed: 3 additions & 1 deletion b/‎.github/workflows/ci_pipeline.yaml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎mindnlp/quant/smooth_quant/quant.py‎
Lines changed: 4 additions & 4 deletions b/‎mindnlp/quant/smooth_quant/quant.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎mindnlp/transformers/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎mindnlp/transformers/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎mindnlp/transformers/cache_utils.py‎
Lines changed: 29 additions & 0 deletions b/‎mindnlp/transformers/cache_utils.py‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎mindtorch/_C/__init__.py‎
Lines changed: 0 additions & 7 deletions b/‎mindtorch/_C/__init__.py‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎mindtorch/__init__.py‎
Lines changed: 2 additions & 3 deletions b/‎mindtorch/__init__.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎mindtorch/_apis/cpu.py‎
Lines changed: 78 additions & 18 deletions b/‎mindtorch/_apis/cpu.py‎
Lines changed: 78 additions & 18 deletions
diff --git a/‎mindtorch/_apis/gpu.py‎
Lines changed: 77 additions & 1 deletion b/‎mindtorch/_apis/gpu.py‎
Lines changed: 77 additions & 1 deletion
@@ -35,6 +35,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip==24.0
+        pip install torch --index-url https://download.pytorch.org/whl/cpu
         pip install -r requirements/pylint_requirements.txt
     # - name: Install MindSpore
     #   shell: bash
@@ -122,6 +123,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip==24.0
+        pip install torch --index-url https://download.pytorch.org/whl/cpu
         pip install -r requirements/requirements.txt
     - name: Install MindSpore
       shell: bash
@@ -132,7 +134,7 @@ jobs:
         pip install mindspore
     - name: Test with pytest
       run: |
-        pip install transformers==4.56.2
+        pip install transformers==4.57.1
         cd tests
+        # NOTE(review): the test suite is still cloned at tag v4.56.2 while 4.57.1 is installed above; confirm the mismatch is intentional.
         git clone -b v4.56.2 https://github.com/huggingface/transformers
         cd ..
 
@@ -30,7 +30,7 @@ def infer_dtype(self, x_dtype, v_dtype, bias_dtype=None):
 
 
 def quantize_mat(mat: Tensor) -> Tuple[Tensor, Tensor]:
-    max_val = (ops.max(ops.abs(mat), dim=-1)[0] / 127.0).to(dtype=mat.dtype)
+    max_val = (ops.max(ops.abs(mat), -1)[0] / 127.0).to(dtype=mat.dtype)
     mat = (mat / max_val[..., None]).to(dtype=mindspore.int8)
     return mat, max_val
 
@@ -53,7 +53,7 @@ def decomposition(mat: Tensor, unq_idx: Tensor, t: Tensor):
 
 
 def get_unq_idx_topk(mat: Tensor, k: int = 64):
-    idx = ops.topk(ops.max(mat.view(-1, mat.shape[-1]).abs(), dim=-2)[0], k, dim=-1)[1]
+    idx = ops.topk(ops.max(mat.view(-1, mat.shape[-1]).abs(), -2)[0], k, dim=-1)[1]
     t = ops.ones((mat.shape[-1]), dtype=mat.dtype)
     t = t.copy()
     if ON_ORANGE_PI:
@@ -64,7 +64,7 @@ def get_unq_idx_topk(mat: Tensor, k: int = 64):
 
 
 def get_unq_idx_thres(mat: Tensor, threshold: float = 6.0):
-    k = ops.max(mat.view(-1, mat.shape[-1]).abs(), dim=-2)[0] >= threshold
+    k = ops.max(mat.view(-1, mat.shape[-1]).abs(), -2)[0] >= threshold
     return ops.nonzero(k).view(-1), k
 
 
@@ -113,7 +113,7 @@ def __init__(
         self.scales = None
         if act_max is not None:
             self.scales = (
-                (act_max.pow(alpha) / ops.max(ori_w.abs(), dim=0)[0].pow(1 - alpha))
+                (act_max.pow(alpha) / ops.max(ori_w.abs(), 0)[0].pow(1 - alpha))
                 .clamp(min=1e-5)
                 .to(dtype=ori_w.dtype)
             )
 
@@ -10,6 +10,7 @@
 from .masking_utils import create_causal_mask, create_sliding_window_causal_mask, create_masks_for_generate
 from .modeling_utils import construct_pipeline_parallel_model, _load_pretrained_model_wrapper, \
     _get_resolved_checkpoint_files_wrapper
+from .cache_utils import dynamic_layer_update
 from .tokenization_utils import apply_chat_template_wrapper
 from .trainer import training_step
 from ..utils.decorators import dtype_wrapper, patch_dtype_wrapper, patch_wrappers
@@ -68,5 +69,6 @@ def empty_fn(*args, **kwargs):
 
 transformers.trainer.Trainer.training_step = training_step
 
+# Replace transformers' DynamicLayer.update with the mindtorch-based implementation imported from .cache_utils.
+transformers.cache_utils.DynamicLayer.update = dynamic_layer_update
 # add mindnlp.transformers modules/attrs to lazymodule
 # setattr(sys.modules[__name__], 'test_ms_model', test_ms_model)
@@ -0,0 +1,29 @@
+from typing import Any, Optional
+import mindtorch
+
+def dynamic_layer_update(
+    self,
+    key_states: mindtorch.Tensor,
+    value_states: mindtorch.Tensor,
+    cache_kwargs: Optional[dict[str, Any]] = None,
+) -> tuple[mindtorch.Tensor, mindtorch.Tensor]:
+    """
+    Store new key/value states on this cache layer and return the accumulated cache.
+
+    On the first call the layer is lazily initialized and the incoming tensors become
+    the cache as-is. On later calls the incoming tensors are concatenated onto the
+    existing cache along dimension -2 and `self.keys` / `self.values` are rebound to
+    the results.
+
+    Args:
+        key_states (`mindtorch.Tensor`): The new key states to cache.
+        value_states (`mindtorch.Tensor`): The new value states to cache.
+        cache_kwargs (`dict[str, Any]`, *optional*): Accepted for interface
+            compatibility; not read by this implementation.
+
+    Returns:
+        tuple[`mindtorch.Tensor`, `mindtorch.Tensor`]: The full cached keys and values.
+    """
+    if self.is_initialized:
+        # Already holding a cache: grow it with the new states.
+        self.keys = mindtorch.cat([self.keys, key_states], dim=-2)
+        self.values = mindtorch.cat([self.values, value_states], dim=-2)
+        return self.keys, self.values
+
+    # First call: initialize the layer, then adopt the incoming tensors directly
+    # instead of concatenating them onto whatever lazy_initialization set up.
+    self.lazy_initialization(key_states)
+    self.keys, self.values = key_states, value_states
+    return self.keys, self.values
@@ -91,11 +91,6 @@ def __eq__(self, __value):
     def __hash__(self):
         return hash(self.type) ^ hash(self.index)
 
-    def __gt__(self, other):
-        if self.type == 'cpu':
-            return False
-        return True
-
     def __enter__(self):
         # self.prev_idx = torch.cuda._exchange_device(self.idx)
         mindtorch._bind.set_device_in_context(self)
@@ -201,8 +196,6 @@ def _step(self, step):
             Current seed and offset.
         """
         outs = self._generator(STEP, (self._seed, self._offset, step,))[:2]
-        for o in outs:
-            o._device = self.device
         return outs
 
 default_generator = Generator()
 
@@ -169,11 +169,10 @@ def _running_with_deploy():
 
 from .amp import autocast, GradScaler
 from .func import vmap
-from .configs import set_pyboost
 from .storage import UntypedStorage, Storage, TypedStorage
 
-from . import _dynamo
-from . import profiler, cuda, amp, compiler, jit, version, __future__, overrides, \
+from . import _dynamo, library
+from . import profiler, cuda, npu, amp, compiler, jit, version, __future__, overrides, \
     return_types, linalg, fx, backends, nn, fft, _jit_internal, utils, optim, testing, _ops
 from ._lowrank import svd_lowrank
 from .random import get_rng_state, initial_seed, manual_seed, seed, set_rng_state
 
@@ -6,7 +6,7 @@
 from mindspore._c_expression import _empty_instance
 from mindspore.ops.auto_generate.gen_ops_prim import Empty
 import mindtorch
-from .._op_prim.cpu import legacy
+from .._op_prim.cpu import legacy, pyboost
 
 empty_op = Empty().set_device('CPU')
 def empty(size, dtype):
@@ -124,22 +124,7 @@ def transpose_view(input, dim0, dim1):
     return legacy.transpose(input, tuple(ranks))
 
 def matmul(self, other):
-    if self.ndim > 2:
-        if self.ndim == other.ndim:
-            return legacy.batch_mat_mul(self, other, False, False)
-        else:
-            self_shape = self.shape
-            other_shape = other.shape
-            if other.ndim == 2:
-                self = reshape(self, (-1, self_shape[-1]))
-                out = legacy.mat_mul(self, other, False, False)
-                return reshape(out, (*self_shape[:-1], out.shape[-1]))
-            if self.ndim == 2:
-                other = reshape(other, (-1, other_shape[-1]))
-                out = legacy.mat_mul(self, other, False, False)
-                return reshape(out, (*other_shape[:-1], out.shape[-1]))
-    
-    return legacy.mat_mul(self, other, False, False)
+    """
+    Return the matrix product of ``self`` and ``other`` via ``pyboost.matmul_ext_op``.
+
+    NOTE(review): the implementation this replaces reshaped mixed-rank operands by
+    hand; presumably the ext op handles batched / mixed-rank inputs itself - confirm.
+    """
+    return pyboost.matmul_ext_op(self, other)
 
 def div(input, other):
     return legacy.div(input, other)
@@ -592,7 +577,20 @@ def batch_norm(input, weight, bias, running_mean=None, runnning_var=None, traini
 def tanh(input):
     return legacy.tanh(input)
 
-def dropout(input, p, seed, offset):
+def dropout(input, p, training=True):
+    """
+    Apply dropout to ``input`` element-wise.
+
+    Args:
+        input (Tensor): The input tensor.
+        p (float): The dropout probability; ``1 - p`` is what gets forwarded to
+            ``legacy.dropout``.
+        training (bool): When False, ``input`` is returned unchanged.
+
+    Returns:
+        ``input`` itself when ``training`` is False or ``p == 0``; otherwise the
+        result of ``legacy.dropout``.
+        NOTE(review): the legacy op may return more than the output tensor
+        (e.g. an output/mask pair) - confirm what callers expect.
+    """
+    if not training or p==0:
+        return input
+    # The two trailing zeros are fixed here; presumably they are the op's seed arguments - confirm.
     return legacy.dropout(input, 1-p, 0, 0)
 
 def split_tensor(input, split_size_or_sections, dim):
@@ -1259,3 +1257,65 @@ def lerp(input, end, weight):
 
 def smooth_l1_loss(input, target, beta=1.0, reduction='none'):
     return legacy.smooth_l1_loss(input, target, beta, reduction)
+
+def index_select(input, dim, index):
+    """Pick the slices of ``input`` along ``dim`` addressed by ``index`` using ``legacy.gather``."""
+    # The legacy op is called as (input, index, dim, 0).
+    # NOTE(review): the trailing 0 is presumably batch_dims - confirm.
+    gathered = legacy.gather(input, index, dim, 0)
+    return gathered
+
+def custom_circular_pad(x, pad):
+    """
+    Circularly (wrap-around) pad the trailing dimensions of ``x``.
+
+    Args:
+        x (Tensor): The tensor to pad.
+        pad (sequence of int): ``(left, right)`` amounts, listed for the last
+            dimension first and then moving towards earlier dimensions.
+
+    Returns:
+        Tensor: ``x`` with each listed dimension extended to
+        ``left + size + right`` elements, filled by wrapping around.
+
+    Raises:
+        AssertionError: If ``pad`` lists more dimensions than ``x`` has.
+    """
+    ndim = x.ndim
+    n_pad_dims = len(pad) // 2
+    assert n_pad_dims <= ndim, "填充参数超过了张量的维度"
+
+    # Process dimensions from the last one backwards, matching the layout of `pad`.
+    for dim in range(ndim-1, ndim-1-n_pad_dims, -1):
+        idx = 2 * (ndim - 1 - dim)  # start of this dimension's (left, right) pair in `pad`
+        left_pad = pad[idx]
+        right_pad = pad[idx + 1]
+
+        if left_pad == 0 and right_pad == 0:
+            continue  # nothing to pad on this dimension
+
+        size = x.shape[dim]  # original length of this dimension
+        new_size = left_pad + size + right_pad
+
+        # Output position i must read source position (i - left_pad) mod size.
+        # `size` is added before the fmod so the operand stays non-negative whenever
+        # left_pad <= size. (The previous offset, new_size - left_pad, reduced to
+        # (i + right_pad) mod size and selected the wrong elements.)
+        index = fmod_scalar(add(arange(0, new_size, 1, mindspore.int64), size - left_pad), size)
+        # Wrap any negative remainder (possible when left_pad > size) back into [0, size).
+        index = (index + size) % size
+        x = index_select(x, dim, index)
+
+    return x
+
+def pad(input, pad, mode='constant', value=None):
+    """
+    Pad (or, for negative amounts, crop) the trailing dimensions of ``input``.
+
+    Args:
+        input (Tensor): The tensor to pad.
+        pad (sequence): ``(left, right)`` amounts per dimension, last dimension
+            first. Entries may be ints or objects exposing ``.item()``. A negative
+            amount removes that many elements from the corresponding side.
+        mode (str): 'constant', 'reflect', 'replicate' or 'circular'.
+        value: Fill value for constant mode; ``None`` means 0.
+
+    Returns:
+        Tensor: The padded tensor, or the (possibly cropped) input when no
+        positive padding remains.
+    """
+    new_pad = []
+    for idx, pad_v in enumerate(pad):
+        if not isinstance(pad_v, int):
+            pad_v = pad_v.item()
+        if pad_v < 0:
+            dim = input.ndim - 1 - idx // 2
+            # Even positions are the leading (left) side of the dimension, odd
+            # positions the trailing side; crop from the side the entry refers to.
+            start = -pad_v if idx % 2 == 0 else 0
+            input = narrow(input, dim, start, input.shape[dim] + pad_v)
+            pad_v = 0
+        new_pad.append(pad_v)
+    new_pad = tuple(new_pad)
+
+    if sum(new_pad) == 0:
+        return input
+    if mode == 'circular':
+        # Use the normalized amounts: negative entries were already applied as crops above.
+        return custom_circular_pad(input, new_pad)
+    if mode == 'reflect':
+        return pad_v3(input, new_pad, mode)
+    if mode == "replicate":
+        # pad_v3 is called with "edge" for what this API names "replicate".
+        return pad_v3(input, new_pad, "edge")
+
+    if value is None:
+        value = 0
+    # Coerce the fill value to a Python scalar matching the input dtype.
+    if input.dtype.is_floating_point:
+        value = float(value)
+    elif input.dtype == mindtorch.bool:
+        value = bool(value)
+    elif input.dtype in [mindtorch.int32, mindtorch.int64]:
+        value = int(value)
+
+    return pad_v3(input, new_pad, mode, value)
@@ -532,7 +532,20 @@ def batch_norm(input, weight, bias, running_mean=None, runnning_var=None, traini
 def tanh(input):
     return legacy.tanh(input)
 
-def dropout(input, p, seed, offset):
+def dropout(input, p, training=True):
+    """
+    Apply dropout to ``input`` element-wise.
+
+    Args:
+        input (Tensor): The input tensor.
+        p (float): The dropout probability; ``1 - p`` is what gets forwarded to
+            ``legacy.dropout``.
+        training (bool): When False, ``input`` is returned unchanged.
+
+    Returns:
+        ``input`` itself when ``training`` is False or ``p == 0``; otherwise the
+        result of ``legacy.dropout``.
+        NOTE(review): the legacy op may return more than the output tensor
+        (e.g. an output/mask pair) - confirm what callers expect.
+    """
+    if not training or p==0:
+        return input
+    # The two trailing zeros are fixed here; presumably they are the op's seed arguments - confirm.
     return legacy.dropout(input, 1-p, 0, 0)
 
 def split_tensor(input, split_size_or_sections, dim):
@@ -1256,3 +1269,66 @@ def cumprod(input, dim, dtype):
 
 def lerp(input, end, weight):
     return legacy.lerp(input, end, weight)
+
+def custom_circular_pad(x, pad):
+    """
+    Circularly (wrap-around) pad the trailing dimensions of ``x``.
+
+    Args:
+        x (Tensor): The tensor to pad.
+        pad (sequence of int): ``(left, right)`` amounts, listed for the last
+            dimension first and then moving towards earlier dimensions.
+
+    Returns:
+        Tensor: ``x`` with each listed dimension extended to
+        ``left + size + right`` elements, filled by wrapping around.
+
+    Raises:
+        AssertionError: If ``pad`` lists more dimensions than ``x`` has.
+    """
+    ndim = x.ndim
+    n_pad_dims = len(pad) // 2
+    assert n_pad_dims <= ndim, "填充参数超过了张量的维度"
+
+    # Process dimensions from the last one backwards, matching the layout of `pad`.
+    for dim in range(ndim-1, ndim-1-n_pad_dims, -1):
+        idx = 2 * (ndim - 1 - dim)  # start of this dimension's (left, right) pair in `pad`
+        left_pad = pad[idx]
+        right_pad = pad[idx + 1]
+
+        if left_pad == 0 and right_pad == 0:
+            continue  # nothing to pad on this dimension
+
+        size = x.shape[dim]  # original length of this dimension
+        new_size = left_pad + size + right_pad
+
+        # Output position i must read source position (i - left_pad) mod size.
+        # `size` is added before the fmod so the operand stays non-negative whenever
+        # left_pad <= size. (The previous offset, new_size - left_pad, reduced to
+        # (i + right_pad) mod size and selected the wrong elements.)
+        index = fmod_scalar(add(arange(0, new_size, 1, mindspore.int64), size - left_pad), size)
+        # Wrap any negative remainder (possible when left_pad > size) back into [0, size).
+        index = (index + size) % size
+        x = index_select(x, dim, index)
+
+    return x
+
+def pad(input, pad, mode='constant', value=None):
+    """
+    Pad (or, for negative amounts, crop) the trailing dimensions of ``input``.
+
+    Args:
+        input (Tensor): The tensor to pad.
+        pad (sequence): ``(left, right)`` amounts per dimension, last dimension
+            first. Entries may be ints or objects exposing ``.item()``. A negative
+            amount removes that many elements from the corresponding side.
+        mode (str): 'constant', 'reflect', 'replicate' or 'circular'.
+        value: Fill value for constant mode; ``None`` means 0.
+
+    Returns:
+        Tensor: The padded tensor, or the (possibly cropped) input when no
+        positive padding remains.
+    """
+    new_pad = []
+    for idx, pad_v in enumerate(pad):
+        if not isinstance(pad_v, int):
+            pad_v = pad_v.item()
+        if pad_v < 0:
+            dim = input.ndim - 1 - idx // 2
+            # Even positions are the leading (left) side of the dimension, odd
+            # positions the trailing side; crop from the side the entry refers to.
+            start = -pad_v if idx % 2 == 0 else 0
+            input = narrow(input, dim, start, input.shape[dim] + pad_v)
+            pad_v = 0
+        new_pad.append(pad_v)
+    new_pad = tuple(new_pad)
+
+    if sum(new_pad) == 0:
+        return input
+    if mode == 'circular':
+        # Use the normalized amounts: negative entries were already applied as crops above.
+        return custom_circular_pad(input, new_pad)
+    if mode == 'reflect':
+        return pad_v3(input, new_pad, mode)
+    if mode == "replicate":
+        # pad_v3 is called with "edge" for what this API names "replicate".
+        return pad_v3(input, new_pad, "edge")
+
+    if value is None:
+        value = 0
+    # Coerce the fill value to a Python scalar matching the input dtype.
+    if input.dtype.is_floating_point:
+        value = float(value)
+    elif input.dtype == mindtorch.bool:
+        value = bool(value)
+    elif input.dtype in [mindtorch.int32, mindtorch.int64]:
+        value = int(value)
+
+    if mode == 'constant' and value == 0 and len(new_pad) > 6:
+        # More than three padded dimensions: build one (left, right) pair per
+        # dimension, first dimension first; dimensions not covered by `pad` get (0, 0).
+        n_given = len(new_pad) // 2
+        paddings = tuple(
+            (new_pad[2*i], new_pad[2*i+1]) if i < n_given else (0, 0)
+            for i in range(input.ndim-1, -1, -1)
+        )
+        # The parameter `pad` shadows this function's own name, so the previous
+        # `pad(input, paddings)` call tried to call the padding tuple itself.
+        # NOTE(review): legacy.pad (zero-fill Pad primitive) is the presumed target - confirm.
+        return legacy.pad(input, paddings)
+    return pad_v3(input, new_pad, mode, value)