Skip to content

Commit 84f697b

Browse files
author
Mark-ZhouWX
committed
add point finetune: fix all_finite bug for distributed training
1 parent 37588ed commit 84f697b

File tree

3 files changed

+5
-2
lines changed

3 files changed

+5
-2
lines changed

official/cv/segment-anything/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ omegaconf==2.0.0
66
# Optional. for preprocess medical ct and mr dataset
77
# connected-components-3d
88
# SimpleITK
9+
# scikit-image

official/cv/segment-anything/scripts/preprocess_CT_MR_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@
7878
# remove label ids
7979
for remove_label_id in remove_label_ids:
8080
gt_data_ori[gt_data_ori == remove_label_id] = 0
81-
all_labels = np.unique(gt_data_ori).sort()[1:]
81+
all_labels = np.sort(np.unique(gt_data_ori))[1:]
8282

8383
# remove obj with more than one connected area
8484
# for l in all_labels:

official/cv/segment-anything/segment_anything/utils/model_wrapper.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,9 @@ def _train_fn(*data_element):
284284
# print(f'loss list', loss_list)
285285
t0 = time.time()
286286
grad_accum = grad_reducer_wrapper(ms.mutable(grad_accum)) # mutable tuple to prevent duplicate graph compiling
287-
if np.all(grad_finite_list):
287+
# all finite should be after grad reduce for multi node
288+
grad_accum_finite = all_finite(grad_accum)
289+
if grad_accum_finite:
288290
optimizer_wrapper(ms.mutable(grad_accum)) # mutable tuple to prevent duplicate graph compiling
289291
else:
290292
print(f'gradient overflow')

0 commit comments

Comments (0)