
Commit 70484dd

Merge pull request #720 from Labelbox/ms/remove-subclass-from-metrics
remove subclass from metric calculations
2 parents 4f6d873 + 686a0d2

9 files changed: +118 -83 lines changed

CHANGELOG.md

Lines changed: 9 additions & 2 deletions
@@ -1,5 +1,12 @@
 # Changelog
 
+# In progress
+## Changed
+* Default behavior for metrics to not include subclasses in the calculation.
+
+## Fixed
+* Polygon extraction from masks creating invalid polygons. This would cause issues in the coco converter.
+
 # Version 3.28.0 (2022-10-14)
 
 ### Added
@@ -45,7 +52,7 @@
 * Increase scalar metric value limit to 100m
 * Added deprecation warnings when updating project `queue_mode`
 ### Fixed
-* Fix bug in `feature_confusion_matrix` and `confusion_matrix` causing FPs and FNs to be capped at 1 when there were no matching annotations
+* Fix bug in `feature_confusion_matrix` and `confusion_matrix` causing FPs and FNs to be capped at 1 when there were no matching annotations
 
 # Version 3.26.2 (2022-09-06)
 ### Added
@@ -65,7 +72,7 @@
 * Resets model run training metadata
 * `ModelRun.get_config()`
 * Fetches model run training metadata
-
+
 ### Changed
 * `Model.create_model_run()`
 * Add training metadata config as a model run creation param
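Note on the "Changed" entry above: the metric entry points modified below keep the `include_subclasses` keyword, so callers who relied on the old default can opt back in explicitly. A minimal sketch, assuming the usual annotation types from `labelbox.data.annotation_types`; the box and subclass values are illustrative, not taken from this commit:

```python
# Sketch: keeping the pre-change behavior after upgrading. Import paths mirror
# the files touched in this commit; the sample annotations are hypothetical.
from labelbox.data.annotation_types import (ClassificationAnnotation,
                                            ClassificationAnswer,
                                            ObjectAnnotation, Point, Radio,
                                            Rectangle)
from labelbox.data.metrics.confusion_matrix.confusion_matrix import (
    feature_confusion_matrix_metric)

box = Rectangle(start=Point(x=0, y=0), end=Point(x=10, y=10))
subclass = ClassificationAnnotation(
    name="is_animal", value=Radio(answer=ClassificationAnswer(name="yes")))

ground_truths = [
    ObjectAnnotation(name="cat", value=box, classifications=[subclass])
]
predictions = [ObjectAnnotation(name="cat", value=box)]  # subclass omitted

# New default: subclasses are ignored, so the matching boxes count as a TP.
print(feature_confusion_matrix_metric(ground_truths, predictions))

# Old behavior: opt back in and the subclass mismatch is penalized.
print(feature_confusion_matrix_metric(ground_truths,
                                      predictions,
                                      include_subclasses=True))
```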

labelbox/data/metrics/confusion_matrix/calculation.py

Lines changed: 4 additions & 23 deletions
@@ -268,29 +268,10 @@ def mask_confusion_matrix(ground_truths: List[ObjectAnnotation],
     elif has_no_annotations(ground_truths, predictions):
         return None
 
-    if include_subclasses:
-        # This results in a faily drastically different value than without subclasses.
-        # If we have subclasses set to True, then this is object detection with masks
-        # Otherwise this will compute metrics on each pixel.
-        pairs = _get_mask_pairs(ground_truths, predictions)
-        return object_pair_confusion_matrix(
-            pairs, include_subclasses=include_subclasses, iou=iou)
-
-    prediction_np = np.max([pred.value.draw(color=1) for pred in predictions],
-                           axis=0)
-    ground_truth_np = np.max(
-        [ground_truth.value.draw(color=1) for ground_truth in ground_truths],
-        axis=0)
-    if prediction_np.shape != ground_truth_np.shape:
-        raise ValueError(
-            "Prediction and mask must have the same shape."
-            f" Found {prediction_np.shape}/{ground_truth_np.shape}.")
-
-    tp_mask = prediction_np == ground_truth_np == 1
-    fp_mask = (prediction_np == 1) & (ground_truth_np == 0)
-    fn_mask = (prediction_np == 0) & (ground_truth_np == 1)
-    tn_mask = prediction_np == ground_truth_np == 0
-    return [np.sum(tp_mask), np.sum(fp_mask), np.sum(fn_mask), np.sum(tn_mask)]
+    pairs = _get_mask_pairs(ground_truths, predictions)
+    return object_pair_confusion_matrix(pairs,
+                                        include_subclasses=include_subclasses,
+                                        iou=iou)
 
 
 def ner_confusion_matrix(ground_truths: List[ObjectAnnotation],
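The net effect of this hunk: `mask_confusion_matrix` no longer has a pixel-level branch; with or without subclasses it now pairs masks as whole objects via `_get_mask_pairs` and delegates to `object_pair_confusion_matrix`. For reference, a standalone NumPy sketch of the per-pixel counting the removed branch performed; the two arrays stand in for the drawn prediction and ground-truth masks:

```python
import numpy as np

# Illustrative binary masks; in the removed code these came from
# annotation.value.draw(color=1) reduced with np.max over each list.
prediction_np = np.array([[1, 1], [0, 0]])
ground_truth_np = np.array([[1, 0], [0, 1]])

tp = int(np.sum((prediction_np == 1) & (ground_truth_np == 1)))
fp = int(np.sum((prediction_np == 1) & (ground_truth_np == 0)))
fn = int(np.sum((prediction_np == 0) & (ground_truth_np == 1)))
tn = int(np.sum((prediction_np == 0) & (ground_truth_np == 0)))
print([tp, fp, fn, tn])  # [1, 1, 1, 1]
```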

labelbox/data/metrics/confusion_matrix/confusion_matrix.py

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@ def confusion_matrix_metric(ground_truths: List[Union[
     ObjectAnnotation, ClassificationAnnotation]],
                             predictions: List[Union[ObjectAnnotation,
                                                     ClassificationAnnotation]],
-                            include_subclasses=True,
+                            include_subclasses=False,
                             iou=0.5) -> List[ConfusionMatrixMetric]:
     """
     Computes confusion matrix metrics between two sets of annotations.
@@ -47,7 +47,7 @@ def confusion_matrix_metric(ground_truths: List[Union[
 def feature_confusion_matrix_metric(
         ground_truths: List[Union[ObjectAnnotation, ClassificationAnnotation]],
         predictions: List[Union[ObjectAnnotation, ClassificationAnnotation]],
-        include_subclasses=True,
+        include_subclasses=False,
         iou: float = 0.5,
 ) -> List[ConfusionMatrixMetric]:
     """

labelbox/data/metrics/iou/iou.py

Lines changed: 2 additions & 2 deletions
@@ -13,7 +13,7 @@ def miou_metric(ground_truths: List[Union[ObjectAnnotation,
                                           ClassificationAnnotation]],
                 predictions: List[Union[ObjectAnnotation,
                                         ClassificationAnnotation]],
-                include_subclasses=True) -> List[ScalarMetric]:
+                include_subclasses=False) -> List[ScalarMetric]:
     """
     Computes miou between two sets of annotations.
     These annotations should relate to the same data (image/video).
@@ -68,7 +68,7 @@ def feature_miou_metric(ground_truths: List[Union[ObjectAnnotation,
 
 def data_row_miou(ground_truth: Label,
                   prediction: Label,
-                  include_subclasses=True) -> Optional[float]:
+                  include_subclasses=False) -> Optional[float]:
     """
 
     This function is no longer supported. Use miou() for raw values or miou_metric() for the metric
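The docstring shown in the second hunk marks `data_row_miou` as no longer supported; the tests in this commit call `miou_metric` on `label.annotations` directly, which suggests the migration below. This is a sketch with placeholder annotation lists, not a verbatim recipe from the SDK docs:

```python
from labelbox.data.metrics.iou.iou import miou_metric

ground_truth_annotations = []  # placeholder; normally label.annotations
prediction_annotations = []    # placeholder; normally prediction.annotations

# Roughly equivalent to the retired
# data_row_miou(label, prediction, include_subclasses=True):
metrics = miou_metric(ground_truth_annotations,
                      prediction_annotations,
                      include_subclasses=True)
value = metrics[0].value if metrics else None
print(value)
```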

tests/data/metrics/confusion_matrix/conftest.py

Lines changed: 25 additions & 9 deletions
@@ -11,10 +11,16 @@
 
 class NameSpace(SimpleNamespace):
 
-    def __init__(self, predictions, ground_truths, expected):
-        super(NameSpace, self).__init__(predictions=predictions,
-                                        ground_truths=ground_truths,
-                                        expected=expected)
+    def __init__(self,
+                 predictions,
+                 ground_truths,
+                 expected,
+                 expected_without_subclasses=None):
+        super(NameSpace, self).__init__(
+            predictions=predictions,
+            ground_truths=ground_truths,
+            expected=expected,
+            expected_without_subclasses=expected_without_subclasses or expected)
 
 
 def get_radio(name, answer_name):
@@ -109,7 +115,8 @@ def get_object_pairs(tool_fn, **kwargs):
                 **kwargs,
                 subclasses=[get_radio("is_animal", answer_name="yes")])
         ],
-                  expected={'cat': [1, 0, 0, 0]}),
+                  expected={'cat': [1, 0, 0, 0]},
+                  expected_without_subclasses={'cat': [1, 0, 0, 0]}),
         NameSpace(predictions=[
             tool_fn("cat",
                     **kwargs,
@@ -121,7 +128,8 @@ def get_object_pairs(tool_fn, **kwargs):
                 **kwargs,
                 subclasses=[get_radio("is_animal", answer_name="no")])
         ],
-                  expected={'cat': [0, 1, 0, 1]}),
+                  expected={'cat': [0, 1, 0, 1]},
+                  expected_without_subclasses={'cat': [1, 0, 0, 0]}),
         NameSpace(predictions=[
             tool_fn("cat",
                     **kwargs,
@@ -136,7 +144,8 @@ def get_object_pairs(tool_fn, **kwargs):
                 **kwargs,
                 subclasses=[get_radio("is_animal", answer_name="no")])
         ],
-                  expected={'cat': [1, 1, 0, 0]}),
+                  expected={'cat': [1, 1, 0, 0]},
+                  expected_without_subclasses={'cat': [1, 1, 0, 0]}),
         NameSpace(predictions=[
             tool_fn("cat",
                     **kwargs,
@@ -154,6 +163,10 @@ def get_object_pairs(tool_fn, **kwargs):
                   expected={
                       'cat': [0, 1, 0, 1],
                       'dog': [0, 1, 0, 0]
+                  },
+                  expected_without_subclasses={
+                      'cat': [1, 0, 0, 0],
+                      'dog': [0, 1, 0, 0]
                   }),
         NameSpace(
             predictions=[tool_fn("cat", **kwargs),
@@ -171,7 +184,10 @@ def get_object_pairs(tool_fn, **kwargs):
                   ground_truths=[tool_fn("cat", **kwargs),
                                  tool_fn("cat", **kwargs)],
                   expected={'cat': [1, 0, 0, 1]}),
-        NameSpace(predictions=[], ground_truths=[], expected=[]),
+        NameSpace(predictions=[],
+                  ground_truths=[],
+                  expected=[],
+                  expected_without_subclasses=[]),
         NameSpace(predictions=[],
                   ground_truths=[tool_fn("cat", **kwargs)],
                   expected={'cat': [0, 0, 0, 1]}),
@@ -183,7 +199,7 @@ def get_object_pairs(tool_fn, **kwargs):
                   expected={
                       'cat': [0, 1, 0, 0],
                       'dog': [0, 0, 0, 1]
-                  }),
+                  })
     ]
 
 
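The updated fixture defaults `expected_without_subclasses` to `expected`, so only the cases where the two modes diverge need to spell out both values. A self-contained sketch of that fallback:

```python
from types import SimpleNamespace


class NameSpace(SimpleNamespace):

    def __init__(self,
                 predictions,
                 ground_truths,
                 expected,
                 expected_without_subclasses=None):
        super().__init__(predictions=predictions,
                         ground_truths=ground_truths,
                         expected=expected,
                         expected_without_subclasses=(
                             expected_without_subclasses or expected))


ns = NameSpace(predictions=[], ground_truths=[], expected={'cat': [1, 0, 0, 0]})
print(ns.expected_without_subclasses)  # falls back to expected: {'cat': [1, 0, 0, 0]}
```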

tests/data/metrics/confusion_matrix/test_confusion_matrix_data_row.py

Lines changed: 18 additions & 11 deletions
@@ -14,18 +14,25 @@
 ])
 def test_overlapping_objects(tool_examples):
     for example in tool_examples:
-        score = confusion_matrix_metric(example.ground_truths,
-                                        example.predictions)
 
-        if len(example.expected) == 0:
-            assert len(score) == 0
-        else:
-            expected = [0, 0, 0, 0]
-            for expected_values in example.expected.values():
-                for idx in range(4):
-                    expected[idx] += expected_values[idx]
-            assert score[0].value == tuple(
-                expected), f"{example.predictions},{example.ground_truths}"
+        for include_subclasses, expected_attr_name in [[
+                True, 'expected'
+        ], [False, 'expected_without_subclasses']]:
+            score = confusion_matrix_metric(
+                example.ground_truths,
+                example.predictions,
+                include_subclasses=include_subclasses)
+
+            if len(getattr(example, expected_attr_name)) == 0:
+                assert len(score) == 0
+            else:
+                expected = [0, 0, 0, 0]
+                for expected_values in getattr(example,
+                                               expected_attr_name).values():
+                    for idx in range(4):
+                        expected[idx] += expected_values[idx]
+                assert score[0].value == tuple(
+                    expected), f"{example.predictions},{example.ground_truths}"
 
 
 @parametrize("tool_examples",

tests/data/metrics/confusion_matrix/test_confusion_matrix_feature.py

Lines changed: 15 additions & 8 deletions
@@ -14,14 +14,21 @@
 ])
 def test_overlapping_objects(tool_examples):
     for example in tool_examples:
-        metrics = feature_confusion_matrix_metric(example.ground_truths,
-                                                  example.predictions)
-
-        metrics = {r.feature_name: list(r.value) for r in metrics}
-        if len(example.expected) == 0:
-            assert len(metrics) == 0
-        else:
-            assert metrics == example.expected, f"{example.predictions},{example.ground_truths}"
+        for include_subclasses, expected_attr_name in [[
+                True, 'expected'
+        ], [False, 'expected_without_subclasses']]:
+            metrics = feature_confusion_matrix_metric(
+                example.ground_truths,
+                example.predictions,
+                include_subclasses=include_subclasses)
+
+            metrics = {r.feature_name: list(r.value) for r in metrics}
+            if len(getattr(example, expected_attr_name)) == 0:
+                assert len(metrics) == 0
+            else:
+                assert metrics == getattr(
+                    example, expected_attr_name
+                ), f"{example.predictions},{example.ground_truths}"
 
 
 @parametrize("tool_examples",

tests/data/metrics/iou/data_row/conftest.py

Lines changed: 19 additions & 16 deletions
@@ -12,24 +12,26 @@ def __init__(self,
                  predictions,
                  labels,
                  expected,
+                 expected_without_subclasses=None,
                  data_row_expected=None,
                  media_attributes=None,
                  metadata=None,
                  classifications=None):
-        super(NameSpace,
-              self).__init__(predictions=predictions,
-                             labels={
-                                 'DataRow ID': 'ckppihxc10005aeyjen11h7jh',
-                                 'Labeled Data': "https://.jpg",
-                                 'Media Attributes': media_attributes or {},
-                                 'DataRow Metadata': metadata or [],
-                                 'Label': {
-                                     'objects': labels,
-                                     'classifications': classifications or []
-                                 }
-                             },
-                             expected=expected,
-                             data_row_expected=data_row_expected)
+        super(NameSpace, self).__init__(
+            predictions=predictions,
+            labels={
+                'DataRow ID': 'ckppihxc10005aeyjen11h7jh',
+                'Labeled Data': "https://.jpg",
+                'Media Attributes': media_attributes or {},
+                'DataRow Metadata': metadata or [],
+                'Label': {
+                    'objects': labels,
+                    'classifications': classifications or []
+                }
+            },
+            expected=expected,
+            expected_without_subclasses=expected_without_subclasses or expected,
+            data_row_expected=data_row_expected)
 
 
 @pytest.fixture
@@ -645,7 +647,8 @@ def test_box_with_wrong_subclass():
                           'answer': 'not_test'
                       }]
                   }],
-                  expected=0.5)
+                  expected=0.5,
+                  expected_without_subclasses=1.0)
 
 
 @pytest.fixture
@@ -780,4 +783,4 @@ def partial_matching_ner():
                       "end": 5
                   }
               }],
-              expected=0.2857142857142857)
+              expected=0.2857142857142857)
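The `test_box_with_wrong_subclass` fixture above shows the gap between the two modes: a geometrically identical box whose radio subclass answer disagrees now expects 0.5 with subclasses included and 1.0 without. The worked numbers below assume the subclass-aware score averages geometric IoU with subclass agreement, which is consistent with these expectations but is an assumption about the calculation module, not something this diff shows:

```python
# Assumed averaging scheme behind the fixture's expected values.
geometric_iou = 1.0       # identical boxes
subclass_agreement = 0.0  # the radio answers disagree

with_subclasses = (geometric_iou + subclass_agreement) / 2
without_subclasses = geometric_iou
print(with_subclasses, without_subclasses)  # 0.5 1.0
```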

tests/data/metrics/iou/data_row/test_data_row_iou.py

Lines changed: 24 additions & 10 deletions
@@ -21,15 +21,29 @@ def check_iou(pair, mask=None):
             annotation.value.mask.arr = np.frombuffer(
                 base64.b64decode(annotation.value.mask.url.encode('utf-8')),
                 dtype=np.uint8).reshape((32, 32, 3))
-    assert math.isclose(data_row_miou(label, prediction), pair.expected)
-    assert math.isclose(
-        miou_metric(label.annotations, prediction.annotations)[0].value,
-        pair.expected)
-    feature_ious = feature_miou_metric(label.annotations,
-                                       prediction.annotations)
-    assert len(feature_ious
-              ) == 1  # The tests run here should only have one class present.
-    assert math.isclose(feature_ious[0].value, pair.expected)
+
+    for include_subclasses, expected_attr_name in [[
+            True, 'expected'
+    ], [False, 'expected_without_subclasses']]:
+        assert math.isclose(
+            data_row_miou(label,
+                          prediction,
+                          include_subclasses=include_subclasses),
+            getattr(pair, expected_attr_name))
+        assert math.isclose(
+            miou_metric(label.annotations,
+                        prediction.annotations,
+                        include_subclasses=include_subclasses)[0].value,
+            getattr(pair, expected_attr_name))
+        feature_ious = feature_miou_metric(
+            label.annotations,
+            prediction.annotations,
+            include_subclasses=include_subclasses)
+        assert len(
+            feature_ious
+        ) == 1  # The tests run here should only have one class present.
+        assert math.isclose(feature_ious[0].value,
+                            getattr(pair, expected_attr_name))
 
 
 def check_iou_checklist(pair, mask=None):
@@ -122,4 +136,4 @@ def test_others(pair):
     strings_to_fixtures(
         ["matching_ner", "no_matching_ner", "partial_matching_ner"]))
 def test_ner(pair):
-    check_iou(pair)
+    check_iou(pair)
