Commit 370a221

Merge: [DLRM/PyT] Stop using apex AMP and DDP
2 parents: 9becdf8 + 5bc69ca

File tree

3 files changed, +10 -11 lines

PyTorch/Recommendation/DLRM/dlrm/cuda_ext/fused_gather_embedding.py

Lines changed: 4 additions & 2 deletions
@@ -17,7 +17,7 @@
 """
 
 from absl import logging
-from apex import amp
+import torch
 from torch.autograd import Function
 
 from dlrm.cuda_ext import fused_embedding
@@ -26,12 +26,14 @@
 class BuckleEmbeddingFusedGatherFunction(Function):
     """Customized embedding gather """
     @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
     def forward(ctx, embedding, indices, offsets, amp_train):
         output = fused_embedding.gather_gpu_fused_fwd(embedding, indices, offsets, amp_train)
         ctx.save_for_backward(embedding, indices, offsets)
         return output
 
     @staticmethod
+    @torch.cuda.amp.custom_bwd
     def backward(ctx, grad_output):
         embedding, indices, offsets = ctx.saved_tensors
 
@@ -40,4 +42,4 @@ def backward(ctx, grad_output):
         return grad_weights, None, None, None
 
 
-buckle_embedding_fused_gather = amp.float_function(BuckleEmbeddingFusedGatherFunction.apply)
+buckle_embedding_fused_gather = BuckleEmbeddingFusedGatherFunction.apply
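
The removed apex wrapper (amp.float_function) kept this custom op in FP32 under apex AMP; the native replacement is to decorate forward with torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) and backward with torch.cuda.amp.custom_bwd, then expose Function.apply directly. A minimal, self-contained sketch of the same pattern using a toy gather instead of the DLRM CUDA kernel (Float32Gather and float32_gather are illustrative names, not part of the repo):

# Toy stand-in for an FP32-only custom op, decorated the same way as
# BuckleEmbeddingFusedGatherFunction after this commit.
import torch
from torch.autograd import Function

class Float32Gather(Function):
    @staticmethod
    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, weight, indices):
        # Under torch.cuda.amp.autocast, `weight` arrives here as float32
        # because of cast_inputs, even if the surrounding region runs in half.
        ctx.save_for_backward(indices)
        ctx.num_rows = weight.size(0)
        return weight[indices]

    @staticmethod
    @torch.cuda.amp.custom_bwd
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        grad_weight = torch.zeros(ctx.num_rows, grad_output.size(-1),
                                  dtype=grad_output.dtype, device=grad_output.device)
        grad_weight.index_add_(0, indices, grad_output)
        return grad_weight, None

# Exposed the same way as buckle_embedding_fused_gather above.
float32_gather = Float32Gather.apply

if torch.cuda.is_available():
    weight = torch.randn(10, 4, device="cuda", requires_grad=True)
    idx = torch.tensor([0, 3, 7], device="cuda")
    with torch.cuda.amp.autocast():
        out = float32_gather(weight, idx)   # custom op computed in float32
    out.sum().backward()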

PyTorch/Recommendation/DLRM/dlrm/cuda_ext/sparse_embedding.py

Lines changed: 4 additions & 4 deletions
@@ -15,7 +15,7 @@
 import copy
 
 import torch
-from apex import amp
+from torch.cuda import amp
 from dlrm.cuda_ext import sparse_gather
 from torch import nn
 from torch.autograd import Function
@@ -24,18 +24,18 @@
 class EmbeddingGatherFunction(Function):
     """Customized embedding gather with fused plain SGD"""
     @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
     def forward(ctx, embedding, indices):
         output = sparse_gather.gather_gpu_fwd(embedding, indices)
         ctx.save_for_backward(indices)
         ctx.num_features = embedding.size(0)
         return output
 
     @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
     def backward(ctx, grad_output):
         indices = ctx.saved_tensors[0]
-
         grad_embedding = sparse_gather.gather_gpu_bwd(grad_output, indices, ctx.num_features)
-
         return grad_embedding, None
 
 
@@ -66,4 +66,4 @@ def forward(self, categorical_inputs):
         return embedding_out
 
 
-embedding_gather = amp.float_function(EmbeddingGatherFunction.apply)
+embedding_gather = EmbeddingGatherFunction.apply
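
The new import, from torch.cuda import amp, brings in the native AMP namespace (amp.autocast, amp.GradScaler) in place of apex's amp. A generic native-AMP training step under that API, not taken from the repo's training loop (the linear model, SGD optimizer, and random data are placeholders, and a CUDA device is assumed):

# Generic native-AMP training step; model, optimizer, and data are placeholders.
import torch
from torch.cuda import amp

model = torch.nn.Linear(16, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = amp.GradScaler()                 # replaces apex amp.initialize / amp.scale_loss

for _ in range(3):
    x = torch.randn(32, 16, device="cuda")
    y = torch.randn(32, 1, device="cuda")
    optimizer.zero_grad()
    with amp.autocast():                  # mixed-precision forward
        loss = torch.nn.functional.mse_loss(model(x), y)
    scaler.scale(loss).backward()         # scaled backward pass
    scaler.step(optimizer)                # unscales grads, then optimizer.step()
    scaler.update()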

PyTorch/Recommendation/DLRM/dlrm/scripts/main.py

Lines changed: 2 additions & 5 deletions
@@ -17,7 +17,7 @@
 import os
 import sys
 from absl import app, flags, logging
-from apex import amp, parallel, optimizers as apex_optim
+from apex import optimizers as apex_optim
 
 from dlrm.data.feature_spec import FeatureSpec
 from dlrm.model.distributed import DistributedDlrm
@@ -500,10 +500,7 @@ def parallelize(model):
         if world_size <= 1:
             return model
 
-        if use_gpu:
-            model.top_model = parallel.DistributedDataParallel(model.top_model)
-        else: # Use other backend for CPU
-            model.top_model = torch.nn.parallel.DistributedDataParallel(model.top_model)
+        model.top_model = torch.nn.parallel.DistributedDataParallel(model.top_model)
         return model
 
     if FLAGS.mode == 'test':
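
With apex parallel.DistributedDataParallel removed, both GPU and CPU runs go through torch.nn.parallel.DistributedDataParallel; the difference is only the process-group backend (nccl vs. gloo) chosen at initialization. A minimal sketch of that setup, not the repo's parallelize() helper (it assumes a torchrun-style launch that sets RANK, WORLD_SIZE, and LOCAL_RANK):

# Minimal native-DDP setup; assumes torchrun sets RANK/WORLD_SIZE/LOCAL_RANK.
import os
import torch
import torch.distributed as dist

use_gpu = torch.cuda.is_available()
dist.init_process_group(backend="nccl" if use_gpu else "gloo")

if use_gpu:
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
else:
    device = torch.device("cpu")

model = torch.nn.Linear(16, 1).to(device)

# One DDP class for both backends, as in the simplified parallelize() above.
model = torch.nn.parallel.DistributedDataParallel(
    model,
    device_ids=[device.index] if use_gpu else None,
)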
