[feat] Hybrid Mamba model with Mamba and discrete Mamba 2 layers #194
Merged

Changes shown are from 8 of 53 commits. Commit history (all by oleksost):
ad2b8d5  wip
5dbc72a  wip
5213e9e  WIP
01d0fe4  mamba1 block
ae8f3ca  removed build from remote
fa6d3bc  removed unneccesary tests
963e674  removed unneccesary files
0842fcb  test
1c81719  test mamba1
37ba0d5  tensor dimentions
11a5db3  meta init with full model run
4af7eb7  training, but having backward issues
be93749  integration into training pipeline
dd469bc  mamba2
ebe1b75  renamed config + skip test
a4400fd  skip tests if mamba not installed
c49148c  pre-commits
5c8d930  cleanup
ef6791b  dependencies
f03dd10  descrete mamba2
2414252  Merge branch 'ssm_mamba2' into ssm
f4d411d  test
ee86c68  llamba checkpoint converter
2561738  cleanup
ad8a48c  test
5243a88  Merge branch 'main' into ssm
075a31f  mamba force build
a788989  mamba force build
2700660  mamba force build
baaf714  causal conv skip build
833b586  Merge branch 'main' into ssm
9e2897d  docs.yaml
b231cb8  MTP hardcoded
8ccaa28  import nvm
864fff2  remove dependency on cartesia
7f2b35f  save llamba
81c71af  addressed comments
7b7ce62  addressed comments
776e67b  Merge branch 'main' into ssm
3456884  nvm
b48f68d  renamed block pattern into block layout
9a35783  renames
32b8aa1  nvm
4f9aad0  wip
68de5d1  addressed comments
ebc516a  Merge branch 'main' into ssm
cb95e52  wip
79c9a4b  batch config
bb3ba66  clean up
a5297be  nvm
2d39857  tests
df032b5  nvm
c8fdbb9  identity activation into MLP
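The PR builds a hybrid model in which transformer, Mamba-1, and discrete Mamba-2 blocks are interleaved according to a per-layer block layout (see the "renamed block pattern into block layout" commit). A minimal sketch of the idea follows; the layout tags, class names, and dispatch are illustrative assumptions, not the PR's actual API.

```python
# Hypothetical sketch of a hybrid block layout; tags and names are illustrative only.
block_layout = ["t", "m", "t", "m2"]  # transformer, Mamba-1, transformer, discrete Mamba-2


def make_block(kind: str) -> str:
    # The real code would construct layer modules from their configs;
    # here we only dispatch on the layout tag.
    return {"t": "TransformerBlock", "m": "MambaBlock", "m2": "DiscreteMamba2Block"}[kind]


layers = [make_block(kind) for kind in block_layout]
print(layers)  # ['TransformerBlock', 'MambaBlock', 'TransformerBlock', 'DiscreteMamba2Block']
```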
oleksost File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
New file (+147 lines): Conv1DBase and Conv1D layer definitions.

```python
import logging
import typing

import torch

from fast_llm.engine.config_utils.tensor_space import TensorDim
from fast_llm.tensor import ParameterMeta, init_zeros_

logger = logging.getLogger(__name__)


class Conv1DBase(torch.nn.Module):
    """
    A base module for 1D convolutional layers holding weights and biases.
    """

    def __init__(
        self,
        in_channels: TensorDim,
        out_channels: TensorDim,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        groups: int = 1,
        *,
        bias=True,
        weight_init_method,
        bias_init_method=init_zeros_,
        auto_bias_grad_accumulation: bool = False,
        lr_scale: float | None | tuple[float | None, ...] = None,
    ):
        super().__init__()
        self._in_channels = in_channels
        self._out_channels = out_channels
        self._kernel_size = kernel_size
        self._stride = stride
        self._padding = padding
        self._dilation = dilation
        self._groups = groups

        self.weight = ParameterMeta.from_dims(
            (
                self._out_channels,
                TensorDim("D_in", self._in_channels.size // groups),
                TensorDim("D_kernel", self._kernel_size),
            ),
            init_method=weight_init_method,
            auto_grad_accumulation=False,
            lr_scale=lr_scale,
        )

        if bias:
            self.bias = ParameterMeta.from_dims(
                (self._out_channels,),
                init_method=bias_init_method,
                weight_decay=False,
                auto_grad_accumulation=auto_bias_grad_accumulation,
                lr_scale=lr_scale,
            )
        else:
            self.bias = None


class Conv1D(Conv1DBase):
    """
    A basic 1D convolutional layer without tensor parallelism.
    """

    def __init__(
        self,
        in_channels: TensorDim,
        out_channels: TensorDim,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        dilation: int = 1,
        groups: int = 1,
        *,
        bias=True,
        weight_init_method,
        bias_init_method=init_zeros_,
        lr_scale: float | None | tuple[float | None, ...] = None,
    ):
        assert in_channels.parallel_dim is None
        assert out_channels.parallel_dim is None
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            dilation,
            groups,
            bias=bias,
            weight_init_method=weight_init_method,
            bias_init_method=bias_init_method,
            lr_scale=lr_scale,
        )

    def forward(self, input_: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.conv1d(
            input_,
            self.weight,
            self.bias,
            stride=self._stride,
            padding=self._padding,
            dilation=self._dilation,
            groups=self._groups,
        )

    def forward_only(
        self, input_: torch.Tensor
    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor, dict]]:
        # Store context for backward pass
        context = {
            "input": input_,
            "weight": self.weight,
            "stride": self._stride,
            "padding": self._padding,
            "dilation": self._dilation,
            "groups": self._groups,
        }

        output = torch.nn.functional.conv1d(
            input_,
            self.weight,
            self.bias,
            stride=self._stride,
            padding=self._padding,
            dilation=self._dilation,
            groups=self._groups,
        )

        return output, (input_, self.weight, context)

    def backward(self, grad_output: torch.Tensor, context: tuple[torch.Tensor, torch.Tensor, dict]) -> torch.Tensor:
        input_, weight, ctx = context

        # Calculate gradients (input gradient only)
        grad_input = torch.nn.grad.conv1d_input(
            input_.shape,
            weight,
            grad_output,
            stride=ctx["stride"],
            padding=ctx["padding"],
            dilation=ctx["dilation"],
            groups=ctx["groups"],
        )

        return grad_input
```
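Conv1D.backward returns only the input gradient, computed with torch.nn.grad.conv1d_input. Below is a self-contained sanity check (not part of the PR) showing that this call reproduces autograd's input gradient for a plain conv1d; shapes and hyperparameters are arbitrary illustrative choices.

```python
import torch

# Random input and kernel: (batch, channels, length) and (out_ch, in_ch, kernel).
x = torch.randn(2, 8, 16, requires_grad=True)
w = torch.randn(8, 8, 4)

y = torch.nn.functional.conv1d(x, w, stride=1, padding=3)
grad_out = torch.randn_like(y)

# Reference: input gradient from autograd.
(grad_autograd,) = torch.autograd.grad(y, x, grad_out)

# Manual: the same gradient via torch.nn.grad.conv1d_input, as used in Conv1D.backward.
grad_manual = torch.nn.grad.conv1d_input(x.shape, w, grad_out, stride=1, padding=3)

assert torch.allclose(grad_autograd, grad_manual, atol=1e-5)
```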
New file (+175 lines): MambaConfig, the configuration for the SSM layers.

```python
import math
from typing import Optional

from fast_llm.config import Field, FieldHint, FieldUpdate, check_field, config_class, skip_valid_if_none
from fast_llm.engine.base_model.config import BaseModelConfig
from fast_llm.layers.common.config import NormalizationConfig
from fast_llm.layers.transformer.config import TransformerArchitectureConfig
from fast_llm.utils import Assert


@config_class()
class MambaConfig(TransformerArchitectureConfig, BaseModelConfig):
    """Configuration for a Structured State Space Model (SSM) layer."""

    # Core architecture parameters
    hidden_size: int = Field(
        default=768,
        desc="Size of the hidden representations",
        hint=FieldHint.core,
        valid=check_field(Assert.gt, 0),
    )
    state_size: int = Field(
        default=64,
        desc="Size of the internal state vector",
        hint=FieldHint.core,
        valid=check_field(Assert.gt, 0),
    )
    expansion_factor: int = Field(
        default=2,
        desc="Factor by which to expand hidden size in SSM computation",
        hint=FieldHint.core,
        valid=check_field(Assert.gt, 0),
    )

    # SSM-specific parameters
    conv_dimension: int = Field(
        default=4,
        desc="Size of the convolutional kernel",
        hint=FieldHint.core,
        valid=check_field(Assert.gt, 0),
    )
    dt_rank: str | int = Field(
        default="auto",
        desc="Rank of the Δ projection matrix. If 'auto', set to ceil(hidden_size/16)",
        hint=FieldHint.core,
    )
    dt_min: float = Field(
        default=0.001,
        desc="Minimum step size for discretization",
        hint=FieldHint.core,
        valid=check_field(Assert.gt, 0),
    )
    dt_max: float = Field(
        default=0.1,
        desc="Maximum step size for discretization",
        hint=FieldHint.core,
        valid=check_field(Assert.gt, 0),
    )
    dt_init_floor: float = Field(
        default=1e-4,
        desc="Minimum value for initializing dt",
        hint=FieldHint.core,
        valid=check_field(Assert.gt, 0),
    )

    # Layer parameters
    add_bias_linear: bool = Field(
        default=False,
        desc="Whether to use bias in linear transformations",
        hint=FieldHint.core,
    )
    conv_bias: bool = Field(
        default=True,
        desc="Whether to use bias in convolution layer",
        hint=FieldHint.core,
    )

    # Normalization
    normalization: NormalizationConfig = FieldUpdate(default_factory=NormalizationConfig)

    # Performance optimization
    use_fast_path: bool = Field(
        default=True,
        desc="Whether to use optimized CUDA kernels when available",
        hint=FieldHint.performance,
    )

    # Initialization parameters
    init_method_std: float = Field(
        default=None,
        desc="Default scale for weight initialization. Default: hidden_size**-0.5",
        hint=FieldHint.optional,
        valid=skip_valid_if_none(check_field(Assert.geq, 0)),
    )

    device: str = Field(
        default="cuda",
        desc="device",
        hint=FieldHint.optional,
    )
    mamba_headdim: int = Field(
        default=64,
        desc="headdim",
        hint=FieldHint.optional,
    )
    mamba_ngroups: int = Field(
        default=1,
        desc="ngroups",
        hint=FieldHint.optional,
    )
    use_low_rank_mamba_proj: bool = Field(
        default=False,
        desc="use_low_rank_mamba_proj",
        hint=FieldHint.optional,
    )
    use_module_layernorm: bool = Field(
        default=False,
        desc="use_module_layernorm",
        hint=FieldHint.optional,
    )
    layernorm_epsilon: float = Field(
        default=1e-5,
        desc="layernorm_epsilon",
        hint=FieldHint.optional,
    )
    rms_norm: bool = Field(
        default=False,
        desc="rms_norm",
        hint=FieldHint.optional,
    )
    fused_add_norm: bool = Field(
        default=False,
        desc="fused_add_norm",
        hint=FieldHint.optional,
    )
    residual_in_fp32: bool = Field(
        default=False,
        desc="residual_in_fp32",
        hint=FieldHint.optional,
    )

    def _validate(self) -> None:
        """Validate configuration parameters."""
        if self.init_method_std is None:
            self.init_method_std = self.hidden_size**-0.5

        super()._validate()

        # Validate SSM-specific parameters
        Assert.gt(self.state_size, 0)
        Assert.gt(self.expansion_factor, 0)
        Assert.gt(self.conv_dimension, 0)
        Assert.gt(self.dt_min, 0)
        Assert.gt(self.dt_max, 0)
        Assert.gt(self.dt_init_floor, 0)
        Assert.geq(self.dt_max, self.dt_min)

        if isinstance(self.dt_rank, int):
            Assert.gt(self.dt_rank, 0)
```
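For reference, a small illustration of how the derived SSM sizes follow from these defaults. The ceil(hidden_size / 16) rule for dt_rank="auto" is stated in the field description above; the inner width d_inner = expansion_factor * hidden_size is the usual Mamba convention and is an assumption here, not something read from this diff.

```python
import math

# Defaults taken from MambaConfig above.
hidden_size = 768
expansion_factor = 2
state_size = 64

d_inner = expansion_factor * hidden_size  # width of the expanded SSM branch (assumed convention)
dt_rank = math.ceil(hidden_size / 16)     # the documented 'auto' rule for dt_rank

print(d_inner, dt_rank)  # 1536 48
```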
Review comment: We'll need a proper architecture/non-architecture split for things to work properly.
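Illustrative only: one way to read this request is to group the MambaConfig fields above into architecture fields (those that define the model's structure) and non-architecture fields (runtime and initialization knobs). The grouping below is a hypothetical sketch, not the PR's final design.

```python
# Hypothetical grouping of the fields shown in this diff; the split is an assumption.
ARCHITECTURE_FIELDS = {
    "hidden_size", "state_size", "expansion_factor", "conv_dimension", "dt_rank",
    "add_bias_linear", "conv_bias", "mamba_headdim", "mamba_ngroups",
    "use_low_rank_mamba_proj", "rms_norm",
}
NON_ARCHITECTURE_FIELDS = {
    "device", "use_fast_path", "init_method_std", "dt_min", "dt_max",
    "dt_init_floor", "fused_add_norm", "residual_in_fp32", "layernorm_epsilon",
}
```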