
Commit 0b469fb

include ar+diff option as a separate style
1 parent 632dc7c commit 0b469fb

File tree: 5 files changed (+124 −65 lines)


fast_llm/data/data/gpt/data.py

Lines changed: 0 additions & 1 deletion

@@ -139,7 +139,6 @@ def gpt_data_collate_fn(batch: list[GPTSample], sampling_parameters: GPTSampling
     token_ids = torch.from_numpy(stacked_ids)

     if sampling_parameters.diffusion.style == DiffusionStyle.masked:
-
         diffusion_config = sampling_parameters.diffusion

         batch_size, seq_len = token_ids.shape

fast_llm/layers/language_model/config.py

Lines changed: 2 additions & 0 deletions

@@ -42,6 +42,8 @@ class LanguageModelKwargs:
     mask_indexes = "mask_indexes"
     mask_probabilities = "mask_probabilities"
     mask_inputs = "mask_inputs"
+    loss_weights = "loss_weights"
+    in_context = "in_context"


 @config_class()
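
The new kwargs above feed a branch keyed off a `DiffusionStyle` value whose definition is not part of this diff. Purely as a hypothetical sketch of the "separate style" named in the commit message (the actual Fast-LLM definition may differ), the option could be an extra enum member:

```python
# Hypothetical sketch only: how the diffusion style selector could look with the
# new ar+diff option added as a separate member. Not Fast-LLM's actual definition.
import enum


class DiffusionStyle(str, enum.Enum):
    masked = "masked"        # pure masked-diffusion training
    ar_masked = "ar_masked"  # autoregressive context + masked-diffusion continuation
```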

fast_llm/layers/language_model/head.py

Lines changed: 60 additions & 37 deletions

@@ -377,7 +377,7 @@ def __init__(
         prediction_distance: int,
     ):
         super().__init__(config, tensor_space, prediction_distance)
-        if config.transformer.diffusion is not None and config.transformer.diffusion == DiffusionStyle.masked:
+        if config.transformer.diffusion == DiffusionStyle.masked:
             self._loss_name = LanguageModelLossNames.mlm_loss

     def _logits_cross_entropy_forward_backward(
@@ -402,44 +402,67 @@ def _logits_cross_entropy_forward_backward(
             sequence_parallel=self._sequence_parallel and self._parallel_embeddings,
         )

-        masked_indices = kwargs[LanguageModelKwargs.mask_indexes]
-        p_mask = kwargs[LanguageModelKwargs.mask_probabilities]
-        # index [0, 1, 2, 3, 4, 5] ->
-        # The labels are already left-shifted: x = [A, B, C, D, E, F] ->
-        # embd  = [A, B, C, D, E]
-        # label = [B, C, D, E, F]
-
-        # Question (Pier): if token 2 is masked, the model needs to predict token 3; but token 3 is already
-        # seen by the model, so can it just learn to copy the next token into the masked position?
-        # Yes. We need to drop those positions from the loss if the next token is not masked.
-        # We could add corruption to strengthen this further, but the effect looks small in other CPT work (DiffuLLaMA).
-
-        last_weight = 0
-        B = logits.shape[0]
-
-        loss_weight = torch.cat(
-            (
-                # ar_weight * in_context[:, 1:] +  # not implemented yet
-                masked_indices[:, 1:] / p_mask[:, None],
-                # + un_weight * ((1 - epsilon) * in_shuffled[:, 1:] + epsilon * in_clean[:, 1:]) / (1 - p_mask[:, None])  # not implemented yet
-                (last_weight * torch.ones(B, device=logits.device)).unsqueeze(1),
-                # This may need some weighting in terms of masking. Use last_weight=0 for now. TODO: decide later.
-            ),
-            dim=1,
-        ).to(logits.dtype)
+        if self.config.transformer.diffusion == DiffusionStyle.masked:
+            masked_indices = kwargs[LanguageModelKwargs.mask_indexes]
+            p_mask = kwargs[LanguageModelKwargs.mask_probabilities]
+            # index [0, 1, 2, 3, 4, 5] ->
+            # The labels are already left-shifted: x = [A, B, C, D, E, F] ->
+            # embd  = [A, B, C, D, E]
+            # label = [B, C, D, E, F]
+
+            # Question (Pier): if token 2 is masked, the model needs to predict token 3; but token 3 is already
+            # seen by the model, so can it just learn to copy the next token into the masked position?
+            # Yes. We need to drop those positions from the loss if the next token is not masked.
+            # We could add corruption to strengthen this further, but the effect looks small in other CPT work (DiffuLLaMA).
+
+            last_weight = 0
+            B = logits.shape[0]
+
+            loss_weight = torch.cat(
+                (
+                    # ar_weight * in_context[:, 1:] +  # not implemented yet
+                    masked_indices[:, 1:] / p_mask[:, None],
+                    # + un_weight * ((1 - epsilon) * in_shuffled[:, 1:] + epsilon * in_clean[:, 1:]) / (1 - p_mask[:, None])  # not implemented yet
+                    (last_weight * torch.ones(B, device=logits.device)).unsqueeze(1),
+                    # This may need some weighting in terms of masking. Use last_weight=0 for now. TODO: decide later.
+                ),
+                dim=1,
+            ).to(logits.dtype)
+
+            # print(f"Loss weight: {loss_weight}")

-        # print(f"Loss weight: {loss_weight}")
+            loss, grad = cross_entropy_forward_backward(
+                logits=logits.flatten(0, -2),
+                target=target,
+                loss_mask=None,
+                grad_output=grad_output,
+                group=self._tensor_space.distributed.tensor_group if self._parallel_embeddings else None,
+                implementation=self._cross_entropy_impl,
+                logits_scale_factor=self._logits_scale_factor,
+                loss_weight=loss_weight,
+            )

-        loss, grad = cross_entropy_forward_backward(
-            logits=logits.flatten(0, -2),
-            target=target,
-            loss_mask=None,
-            grad_output=grad_output,
-            group=self._tensor_space.distributed.tensor_group if self._parallel_embeddings else None,
-            implementation=self._cross_entropy_impl,
-            logits_scale_factor=self._logits_scale_factor,
-            loss_weight=loss_weight,
-        )
+        elif self.config.transformer.diffusion == DiffusionStyle.ar_masked:
+
+            loss_weights = kwargs[LanguageModelKwargs.loss_weights]
+            context_index = kwargs[LanguageModelKwargs.in_context]
+            masked_index = kwargs[LanguageModelKwargs.mask_indexes]
+            B = loss_weights.shape[0]
+            masked_index = torch.cat([masked_index[:, 1:], torch.zeros(B, 1, device=loss_weights.device)], dim=1)
+            context_index = torch.cat([context_index[:, 1:], torch.zeros(B, 1, device=loss_weights.device)], dim=1)
+
+            loss, grad, per_token_loss_b4_weight = cross_entropy_forward_backward(
+                logits.flatten(0, -2),
+                target=target,
+                group=self._tensor_space.distributed.tensor_group if self._parallel_embeddings else None,
+                grad_output=grad_output,
+                implementation=self._cross_entropy_impl,
+                logits_scale_factor=self._logits_scale_factor,
+                loss_weight=loss_weights,
+            )
+
+            losses["loss_mask_tokens"].append((per_token_loss_b4_weight * masked_index).mean())
+            losses["loss_in_context_tokens"].append((per_token_loss_b4_weight * context_index).mean())

         # This happens with the loss_weight.
         # MDM https://github.com/ML-GSAI/SMDM/blob/583aa4716d17728dbb825aec6c24a121164d616a/pretrain/train_mdm.py#L274
fast_llm/layers/transformer/attention.py

Lines changed: 26 additions & 0 deletions

@@ -389,15 +389,41 @@ def forward(self, input_: torch.Tensor, kwargs: dict[str, typing.Any]) -> tuple[
                 softmax_scale=self._softmax_scale,
             )
             input_ = input_.flatten(-2)
+
         else:
             # TODO: Avoid the flattens.
+
             input_ = self._attn_fused(
                 query.flatten(-2),
                 key.flatten(-2),
                 value.flatten(-2),
                 kwargs[TransformerKwargs.attention_mask],
                 kwargs[TransformerKwargs.attention_mask_value],
             )
+            # print(f"Fused: Attention: {input_.shape} {input_} ")
+
+            flash_input_ = _flash_attn_func(
+                query,
+                key,
+                value,
+                window_size=(-1, -1) if window_size is None else (window_size - 1, 0),
+                dropout_p=self._config.attention_dropout if self.training else 0.0,
+                causal=False,
+                softmax_scale=self._softmax_scale,
+            )
+            # print(f"1: Flash : Attention: {flash_input_.shape} {flash_input_} ")
+            flash_input_ = flash_input_.flatten(-2)
+            # print(f"2: Flash: Attention: {flash_input_.shape} {flash_input_} ")
+            diff = input_ - flash_input_
+            # print(f"Element-wise difference: {diff.shape} {diff}")
+            max_diff = diff.abs().max()
+            min_diff = diff.abs().min()
+            print(f"Min element-wise difference: {min_diff.item()}")
+            print(f"Max element-wise difference: {max_diff.item()}")
+            # if max_diff > 1e-3:
+            #     print("Warning: Max difference exceeds 1e-3")
+            #     import sys
+            #     sys.exit(1)

         if self._debug_transformer:
             self._debug_log(query, "query", self._QUERY_DIMS, kwargs)
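
The debug code added above runs the same query/key/value through the fused path and `_flash_attn_func` and prints the element-wise difference. The same cross-check pattern can be reproduced with stock PyTorch (no flash-attn install) by comparing explicit attention against `torch.nn.functional.scaled_dot_product_attention`; this is a generic sketch, not Fast-LLM code:

```python
# Generic sketch: run attention two ways on the same inputs and report the
# element-wise difference, mirroring the fused-vs-flash check in the diff above.
import torch
import torch.nn.functional as F

B, H, S, D = 2, 4, 16, 32
query = torch.randn(B, H, S, D)
key = torch.randn(B, H, S, D)
value = torch.randn(B, H, S, D)
scale = D ** -0.5

# Reference: explicit non-causal attention (analogous to the fused path).
scores = torch.matmul(query, key.transpose(-2, -1)) * scale
reference = torch.softmax(scores, dim=-1) @ value

# Optimized kernel (analogous to the flash path: causal=False, no dropout).
optimized = F.scaled_dot_product_attention(query, key, value, is_causal=False)

diff = (reference - optimized).abs()
print(f"Min element-wise difference: {diff.min().item()}")
print(f"Max element-wise difference: {diff.max().item()}")
assert torch.allclose(reference, optimized, atol=1e-5)
```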

fast_llm/models/gpt/model.py

Lines changed: 36 additions & 27 deletions

@@ -351,7 +351,8 @@ def preprocess(
         )
         # Set up bidirectional attention for masked diffusion.
         # It uses _flash_attn_func, so there is no need to set attention_mask and attention_mask_value.
-        # kwargs[TransformerKwargs.causal] = False
+        kwargs[TransformerKwargs.causal] = False
+
         batch_size, seq_len = batch.token_ids.shape
         seq_len -= 1  # last token is dropped from the inputs
         attention_mask = torch.ones(
@@ -395,40 +396,48 @@
         # seq_len -= 1  # last token is dropped from the input
         # # Compute attention mask for diffusion
         C = batch.in_context_length.to(device=self._tensor_space.distributed.device)
-        # row_idx = torch.arange(seq_len, device=self._tensor_space.distributed.device).view(1, seq_len, 1)
-        # col_idx = torch.arange(seq_len, device=self._tensor_space.distributed.device).view(1, 1, seq_len)
-        # C_exp = C.view(batch_size, 1, 1)
+        row_idx = torch.arange(seq_len, device=self._tensor_space.distributed.device).view(
+            1, seq_len, 1
+        )
+        col_idx = torch.arange(seq_len, device=self._tensor_space.distributed.device).view(
+            1, 1, seq_len
+        )
+        C_exp = C.view(batch_size, 1, 1)

-        # causal_mask = col_idx <= row_idx
-        # row_idx < C_exp
-        # col_idx < C_exp
+        causal_mask = col_idx <= row_idx
+        row_idx < C_exp
+        col_idx < C_exp

-        # attn_mask = torch.zeros(
-        #     batch_size, seq_len, seq_len, dtype=torch.bool, device=self._tensor_space.distributed.device
-        # )
+        attn_mask = torch.zeros(
+            batch_size,
+            seq_len,
+            seq_len,
+            dtype=torch.bool,
+            device=self._tensor_space.distributed.device,
+        )

-        # for b in range(batch_size):
-        #     C_val = C[b].item()
+        for b in range(batch_size):
+            C_val = C[b].item()

-        #     if C_val > 0:
-        #         context_causal = causal_mask[0, :C_val, :C_val]
-        #         attn_mask[b, :C_val, :C_val] = context_causal
+            if C_val > 0:
+                context_causal = causal_mask[0, :C_val, :C_val]
+                attn_mask[b, :C_val, :C_val] = context_causal

-        #     if C_val > 0 and C_val < seq_len:
-        #         attn_mask[b, C_val:, :C_val] = True
+            if C_val > 0 and C_val < seq_len:
+                attn_mask[b, C_val:, :C_val] = True

-        #     if C_val < seq_len:
-        #         attn_mask[b, C_val:, C_val:] = True
+            if C_val < seq_len:
+                attn_mask[b, C_val:, C_val:] = True

         # Handle padding if needed
-        # if batch.sequence_lengths is not None:
-        #     padded = torch.zeros(
-        #         batch_size, seq_len, dtype=torch.bool, device=self._tensor_space.distributed.device
-        #     )
-        #     for b in range(batch_size):
-        #         padded[b, batch.sequence_lengths[b] :] = True
-        #     not_padded = ~padded[:, 1:]
-        #     attn_mask = attn_mask & not_padded.unsqueeze(1) & not_padded.unsqueeze(2)
+        if batch.sequence_lengths is not None:
+            padded = torch.zeros(
+                batch_size, seq_len, dtype=torch.bool, device=self._tensor_space.distributed.device
+            )
+            for b in range(batch_size):
+                padded[b, batch.sequence_lengths[b] :] = True
+            not_padded = ~padded[:, 1:]
+            attn_mask = attn_mask & not_padded.unsqueeze(1) & not_padded.unsqueeze(2)

         # Reshape to match expected attention mask format
         attention_mask = attn_mask.unsqueeze(1).unsqueeze(1)  # Add additional dimension
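
For reference, the uncommented loop above builds, per sample, a mask that is causal over the first `C` in-context tokens, lets the remaining (diffusion) tokens attend to the whole context, and is bidirectional inside the diffusion block. Below is a vectorized sketch of the same mask logic; the function name, shapes, and the boolean convention (True = attention allowed) are assumptions for the example:

```python
# Sketch of the AR + diffusion attention mask: causal inside the first C tokens
# (the autoregressive context), bidirectional inside the remaining tokens, and
# the diffusion block may attend back to the whole context. True = allowed.
import torch


def ar_diffusion_mask(context_lengths: torch.Tensor, seq_len: int) -> torch.Tensor:
    """Return a (batch_size, seq_len, seq_len) boolean attention mask."""
    batch_size = context_lengths.shape[0]
    row_idx = torch.arange(seq_len).view(1, seq_len, 1)
    col_idx = torch.arange(seq_len).view(1, 1, seq_len)
    c = context_lengths.view(batch_size, 1, 1)

    in_context_row = row_idx < c
    in_context_col = col_idx < c
    causal = col_idx <= row_idx

    context_block = in_context_row & in_context_col & causal  # causal within the context
    context_visible = ~in_context_row & in_context_col        # diffusion tokens see the context
    diffusion_block = ~in_context_row & ~in_context_col       # bidirectional among diffusion tokens
    return context_block | context_visible | diffusion_block


mask = ar_diffusion_mask(torch.tensor([3, 5]), seq_len=8)
print(mask[0].int())  # rows 0-2: causal; rows 3-7: attend everywhere
```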
