
Commit 14473ce

axeisghost authored and meta-codesync[bot] committed

Enable fused path on Segment NE (meta-pytorch#3498) (meta-pytorch#3499)
Summary:

# Motivation
T242704386 noted that Segment NE is not compatible with FUSED metrics compute, which can bring an efficiency win, as described in this [post](https://fb.workplace.com/groups/429376538334034/permalink/1474708170467527).

# Solution
Run the group-by-group NE computation on tensors that stack all tasks.

# Compatibility
Thanks to the suggestion from ge0405, the compute mode is passed in a backward-compatible way using a bool, to avoid cyclic dependencies.

Pull Request resolved: meta-pytorch#3499

Reviewed By: iamzainhuda, ge0405

Differential Revision: D85879827

Pulled By: axeisghost

fbshipit-source-id: ada8f0cf2105ea4f5ce7bf3ba4719d73deea1d8d
1 parent a5dba57 commit 14473ce
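
To make the solution concrete, here is a standalone sketch (not torchrec code) of the group-by-group NE computation over tensors that stack all tasks. The function name, shapes, and the simplified eta handling are illustrative assumptions, not the implementation in this commit:

```python
# Standalone sketch: per-group NE over stacked per-task tensors.
import torch


def segmented_ne_sketch(
    labels: torch.Tensor,         # (n_tasks, batch)
    predictions: torch.Tensor,    # (n_tasks, batch), values in (0, 1)
    weights: torch.Tensor,        # (n_tasks, batch)
    grouping_keys: torch.Tensor,  # (batch,), integer group ids in [0, num_groups)
    num_groups: int,
    eta: float = 1e-12,
) -> torch.Tensor:
    n_tasks = labels.shape[0]
    # One (n_tasks, num_groups) buffer per state, mirroring the fused layout.
    ce_sum = torch.zeros(n_tasks, num_groups)
    weighted = torch.zeros(n_tasks, num_groups)
    pos = torch.zeros(n_tasks, num_groups)
    neg = torch.zeros(n_tasks, num_groups)
    for g in torch.unique(grouping_keys):
        mask = grouping_keys == g
        y = labels[:, mask]
        p = predictions[:, mask].clamp(eta, 1 - eta)
        w = weights[:, mask]
        # Weighted cross entropy summed within the group, per task.
        ce_sum[:, g] = (-w * (y * torch.log(p) + (1 - y) * torch.log(1 - p))).sum(dim=-1)
        weighted[:, g] = w.sum(dim=-1)
        pos[:, g] = (w * y).sum(dim=-1)
        neg[:, g] = (w * (1 - y)).sum(dim=-1)
    # NE = cross entropy normalized by the entropy of always predicting the mean label.
    mean_label = (pos / weighted).clamp(eta, 1 - eta)
    ce_norm = -(pos * torch.log(mean_label) + neg * torch.log(1 - mean_label))
    return ce_sum / ce_norm  # (n_tasks, num_groups)
```

For a single task and two groups this returns a (1, 2) tensor of per-group NE values, i.e. one NE per (task, group) pair rather than one NE per task.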

File tree

3 files changed: +195 −26 lines changed

torchrec/metrics/rec_metric.py

Lines changed: 10 additions & 5 deletions
@@ -135,7 +135,7 @@ def __init__(
         window_size: int,
         compute_on_all_ranks: bool = False,
         should_validate_update: bool = False,
-        fuse_state_tensors: bool = False,
+        compute_mode: RecComputeMode = RecComputeMode.UNFUSED_TASKS_COMPUTATION,
         process_group: Optional[dist.ProcessGroup] = None,
         fused_update_limit: int = 0,
         allow_missing_label_with_zero_weight: bool = False,
@@ -144,7 +144,13 @@ def __init__(
     ) -> None:
         metric_init_signature = inspect.signature(Metric.__init__)
         if "fuse_state_tensors" in metric_init_signature.parameters:
-            kwargs["fuse_state_tensors"] = fuse_state_tensors
+            kwargs["fuse_state_tensors"] = (
+                True
+                if compute_mode == RecComputeMode.FUSED_TASKS_AND_STATES_COMPUTATION
+                else False
+            )
+        if "compute_mode" in metric_init_signature.parameters:
+            kwargs["compute_mode"] = compute_mode
         super().__init__(
             process_group=process_group,
             *args,
@@ -169,6 +175,7 @@ def __init__(
                 dist_reduce_fx=lambda x: torch.any(x, dim=0).byte(),
                 persistent=True,
             )
+        self._compute_mode: RecComputeMode = compute_mode

     @staticmethod
     def get_window_state_name(state_name: str) -> str:
@@ -428,9 +435,7 @@ def __init__(
                     window_size=self._window_size,
                     compute_on_all_ranks=compute_on_all_ranks,
                     should_validate_update=self._should_validate_update,
-                    fuse_state_tensors=(
-                        compute_mode == RecComputeMode.FUSED_TASKS_AND_STATES_COMPUTATION
-                    ),
+                    compute_mode=compute_mode,
                     process_group=process_group,
                     **{**kwargs, **self._get_task_kwargs(task_config)},
                 )
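
The signature check above is the backward-compatibility mechanism mentioned in the summary: a keyword is forwarded to the underlying `Metric.__init__` only if that `__init__` actually declares it. A minimal, generic sketch of the same pattern, with hypothetical names (`OldBase`, `legacy_flag`, `new_mode`) rather than torchrec APIs:

```python
import inspect
from typing import Any, Callable, Dict


def forward_if_supported(
    base_init: Callable[..., None],
    kwargs: Dict[str, Any],
    **candidates: Any,
) -> Dict[str, Any]:
    # Forward each candidate keyword only when the base __init__ declares it,
    # so the call keeps working against older versions of the base class.
    params = inspect.signature(base_init).parameters
    for name, value in candidates.items():
        if name in params:
            kwargs[name] = value
    return kwargs


class OldBase:
    def __init__(self, legacy_flag: bool = False) -> None:  # no "new_mode" yet
        self.legacy_flag = legacy_flag


kwargs = forward_if_supported(OldBase.__init__, {}, legacy_flag=True, new_mode="fused")
print(kwargs)  # {'legacy_flag': True} -- "new_mode" is silently dropped
```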

torchrec/metrics/segmented_ne.py

Lines changed: 150 additions & 19 deletions
@@ -102,6 +102,29 @@ def compute_ne(
     return result_ne


+def compute_ne_fused(
+    ce_sum: torch.Tensor,
+    weighted_num_samples: torch.Tensor,
+    pos_labels: torch.Tensor,
+    neg_labels: torch.Tensor,
+    num_groups: int,
+    n_tasks: int,
+    eta: float,
+) -> torch.Tensor:
+    # size should be (n_tasks, num_groups)
+    result_ne = torch.zeros([n_tasks, num_groups])
+    for group in range(num_groups):
+        mean_label = pos_labels[:, group] / weighted_num_samples[:, group]
+        ce_norm = _compute_cross_entropy_norm(
+            mean_label, pos_labels[:, group], neg_labels[:, group], eta
+        )
+        ne = ce_sum[:, group] / ce_norm
+        result_ne[:, group] = ne
+
+    # ne indexed by group - tensor size (n_tasks, num_groups)
+    return result_ne
+
+
 def get_segemented_ne_states(
     labels: torch.Tensor,
     predictions: torch.Tensor,
@@ -111,12 +134,8 @@ def get_segemented_ne_states(
     num_groups: int,
 ) -> Dict[str, torch.Tensor]:
     groups = torch.unique(grouping_keys)
-    cross_entropy, weighted_num_samples, pos_labels, neg_labels = (
-        torch.zeros(num_groups).to(labels.device),
-        torch.zeros(num_groups).to(labels.device),
-        torch.zeros(num_groups).to(labels.device),
-        torch.zeros(num_groups).to(labels.device),
-    )
+    buffer = torch.zeros((4, num_groups), device=labels.device)
+    cross_entropy, weighted_num_samples, pos_labels, neg_labels = buffer.unbind(0)
     for group in groups:
         group_mask = grouping_keys == group

@@ -152,6 +171,53 @@ def get_segemented_ne_states(
     }


+def get_segemented_ne_states_fused(
+    labels: torch.Tensor,
+    predictions: torch.Tensor,
+    weights: torch.Tensor,
+    grouping_keys: torch.Tensor,
+    eta: float,
+    num_groups: int,
+    n_tasks: int,
+) -> Dict[str, torch.Tensor]:
+    groups = torch.unique(grouping_keys)
+    buffer = torch.zeros((4, n_tasks, num_groups), device=labels.device)
+    cross_entropy, weighted_num_samples, pos_labels, neg_labels = buffer.unbind(0)
+    for group in groups:
+        group_mask = grouping_keys == group
+
+        group_labels = labels[:, group_mask]
+        group_predictions = predictions[:, group_mask]
+        group_weights = weights[:, group_mask]
+
+        ce_sum_group = torch.sum(
+            compute_cross_entropy(
+                labels=group_labels,
+                predictions=group_predictions,
+                weights=group_weights,
+                eta=eta,
+            ),
+            dim=-1,
+        )
+
+        weighted_num_samples_group = torch.sum(group_weights, dim=-1)
+        pos_labels_group = torch.sum(group_weights * group_labels, dim=-1)
+        neg_labels_group = torch.sum(group_weights * (1.0 - group_labels), dim=-1)
+
+        cross_entropy[:, group] = ce_sum_group
+        weighted_num_samples[:, group] = weighted_num_samples_group
+        pos_labels[:, group] = pos_labels_group
+        neg_labels[:, group] = neg_labels_group
+
+    # tensor size for each value is (n_tasks, num_groups)
+    return {
+        "cross_entropy_sum": cross_entropy,
+        "weighted_num_samples": weighted_num_samples,
+        "pos_labels": pos_labels,
+        "neg_labels": neg_labels,
+    }
+
+
 def _state_reduction_sum(state: torch.Tensor) -> torch.Tensor:
     return state.sum(dim=0)

@@ -251,21 +317,91 @@ def update(
             )

         grouping_keys = kwargs["required_inputs"][self._grouping_keys]
-        states = get_segemented_ne_states(
-            labels,
-            predictions,
-            weights,
-            grouping_keys,
-            eta=self.eta,
-            num_groups=self._num_groups,
-        )
+        # When labels is 2D, we're in a fused mode (either FUSED_TASKS_COMPUTATION or FUSED_TASKS_AND_STATES_COMPUTATION).
+        # The states update and NE computation need to be done differently.
+        # On the fused path, all tasks are grouped together so NE is computed and states are updated in one tensor per state.
+        if (
+            self._compute_mode == RecComputeMode.FUSED_TASKS_COMPUTATION
+            or self._compute_mode == RecComputeMode.FUSED_TASKS_AND_STATES_COMPUTATION
+        ):
+            states = get_segemented_ne_states_fused(
+                labels,
+                predictions,
+                weights,
+                grouping_keys,
+                eta=self.eta,
+                num_groups=self._num_groups,
+                n_tasks=self._n_tasks,
+            )
+        else:
+            states = get_segemented_ne_states(
+                labels,
+                predictions,
+                weights,
+                grouping_keys,
+                eta=self.eta,
+                num_groups=self._num_groups,
+            )

         for state_name, state_value in states.items():
             state = getattr(self, state_name)
             state += state_value

+    def _compute_fused(self) -> List[MetricComputationReport]:
+        reports = []
+        computed_ne = compute_ne_fused(
+            # pyre-fixme[6]: In call `compute_ne_fused`, for 1st positional argument, expected `Tensor` but got `Union[Tensor, Module]`
+            self.cross_entropy_sum,
+            # pyre-fixme[6]: In call `compute_ne_fused`, for 2nd positional argument, expected `Tensor` but got `Union[Tensor, Module]`
+            self.weighted_num_samples,
+            # pyre-fixme[6]: In call `compute_ne_fused`, for 3rd positional argument, expected `Tensor` but got `Union[Tensor, Module]`
+            self.pos_labels,
+            # pyre-fixme[6]: In call `compute_ne_fused`, for 4th positional argument, expected `Tensor` but got `Union[Tensor, Module]`
+            self.neg_labels,
+            num_groups=self._num_groups,
+            n_tasks=self._n_tasks,
+            eta=self.eta,
+        )
+        for group in range(self._num_groups):
+            reports.append(
+                MetricComputationReport(
+                    name=MetricName.SEGMENTED_NE,
+                    metric_prefix=MetricPrefix.LIFETIME,
+                    value=computed_ne[:, group],
+                    description="_" + str(group),
+                ),
+            )
+
+        if self._include_logloss:
+            log_loss_groups = compute_logloss(
+                # pyre-fixme[6]: In call `compute_logloss`, for 1st positional argument, expected `Tensor` but got `Union[Tensor, Module]`
+                self.cross_entropy_sum,
+                # pyre-fixme[6]: In call `compute_logloss`, for 2nd positional argument, expected `Tensor` but got `Union[Tensor, Module]`
+                self.pos_labels,
+                # pyre-fixme[6]: In call `compute_logloss`, for 3rd positional argument, expected `Tensor` but got `Union[Tensor, Module]`
+                self.neg_labels,
+                eta=self.eta,
+            )
+            for group in range(self._num_groups):
+                reports.append(
+                    MetricComputationReport(
+                        name=MetricName.LOG_LOSS,
+                        metric_prefix=MetricPrefix.LIFETIME,
+                        value=log_loss_groups[:, group],
+                        description="_" + str(group),
+                    )
+                )
+
+        return reports
+
     def _compute(self) -> List[MetricComputationReport]:
         reports = []
+        if (
+            self._compute_mode == RecComputeMode.FUSED_TASKS_COMPUTATION
+            or self._compute_mode == RecComputeMode.FUSED_TASKS_AND_STATES_COMPUTATION
+        ):
+            return self._compute_fused()
+
         computed_ne = compute_ne(
             # pyre-fixme[29]: `Union[(self: TensorBase, indices: Union[None, _NestedS...
             self.cross_entropy_sum[0],
@@ -349,8 +485,3 @@ def __init__(
         else:
             # pyre-ignore[6]
             self._required_inputs.add(kwargs["grouping_keys"])
-        if self._compute_mode == RecComputeMode.FUSED_TASKS_AND_STATES_COMPUTATION:
-            logging.warning(
-                f"compute_mode FUSED_TASKS_AND_STATES_COMPUTATION can't support {self._namespace} yet "
-                "because its states are not 1D Tensors. Only FUSED_TASKS_COMPUTATION will take effect."
-            )
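
A quick shape check for the two fused helpers added above. This assumes torchrec with this change installed; the toy labels, predictions, and grouping keys are illustrative only:

```python
import torch
from torchrec.metrics.segmented_ne import (
    compute_ne_fused,
    get_segemented_ne_states_fused,
)

n_tasks, num_groups = 2, 2
labels = torch.tensor([[0.0, 1.0, 1.0, 0.0], [1.0, 0.0, 1.0, 0.0]])
predictions = torch.tensor([[0.2, 0.7, 0.8, 0.4], [0.9, 0.3, 0.6, 0.7]])
weights = torch.ones(n_tasks, 4)
grouping_keys = torch.tensor([0, 0, 1, 1])

# Each state tensor comes back as (n_tasks, num_groups) on the fused path.
states = get_segemented_ne_states_fused(
    labels, predictions, weights, grouping_keys,
    eta=1e-12, num_groups=num_groups, n_tasks=n_tasks,
)
ne = compute_ne_fused(
    states["cross_entropy_sum"],
    states["weighted_num_samples"],
    states["pos_labels"],
    states["neg_labels"],
    num_groups=num_groups,
    n_tasks=n_tasks,
    eta=1e-12,
)
print(ne.shape)  # expected: torch.Size([2, 2]) -- one NE per (task, group)
```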

torchrec/metrics/tests/test_segmented_ne.py

Lines changed: 35 additions & 2 deletions
@@ -12,6 +12,7 @@

 import torch
 from torch import no_grad
+from torchrec.metrics.metrics_config import RecComputeMode
 from torchrec.metrics.rec_metric import RecTaskInfo
 from torchrec.metrics.segmented_ne import SegmentedNEMetric

@@ -33,6 +34,7 @@ def _test_segemented_ne_helper(
         grouping_keys: torch.Tensor,
         grouping_key_tensor_name: str = "grouping_keys",
         cast_keys_to_int: bool = False,
+        compute_mode: RecComputeMode = RecComputeMode.UNFUSED_TASKS_COMPUTATION,
     ) -> None:
         num_task = labels.shape[0]
         batch_size = labels.shape[0]
@@ -70,6 +72,7 @@ def _test_segemented_ne_helper(
             grouping_keys=grouping_key_tensor_name,
             # pyre-ignore
             cast_keys_to_int=cast_keys_to_int,
+            compute_mode=compute_mode,
         )
         ne.update(**inputs)
         actual_ne = ne.compute()
@@ -95,9 +98,39 @@ def test_grouped_ne(self) -> None:
         test_data = generate_model_outputs_cases()
         for inputs in test_data:
             try:
-                self._test_segemented_ne_helper(**inputs)
+                self._test_segemented_ne_helper(
+                    **inputs,
+                    compute_mode=RecComputeMode.UNFUSED_TASKS_COMPUTATION,
+                )
+            except AssertionError:
+                print(
+                    "Assertion error caught with data set in UNFUSED_TASKS_COMPUTATION mode",
+                    inputs,
+                )
+                raise
+
+            try:
+                self._test_segemented_ne_helper(
+                    **inputs,
+                    compute_mode=RecComputeMode.FUSED_TASKS_COMPUTATION,
+                )
             except AssertionError:
-                print("Assertion error caught with data set ", inputs)
+                print(
+                    "Assertion error caught with data set in FUSED_TASKS_COMPUTATION mode",
+                    inputs,
+                )
+                raise
+
+            try:
+                self._test_segemented_ne_helper(
+                    **inputs,
+                    compute_mode=RecComputeMode.FUSED_TASKS_AND_STATES_COMPUTATION,
+                )
+            except AssertionError:
+                print(
+                    "Assertion error caught with data set in FUSED_TASKS_AND_STATES_COMPUTATION mode",
+                    inputs,
+                )
                 raise
