Merge pull request #635 from Labelbox/kkim/AL-2896

kkim-labelbox · web-flow · commit e5770fd98ee0 · 2022-07-19T15:26:07.000-07:00
[AL-2896] Video annotation serialization/deserialization using segment_index
diff --git a/labelbox/data/annotation_types/annotation.py b/labelbox/data/annotation_types/annotation.py
@@ -1,5 +1,5 @@
 import abc
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Union
 
 from .classification import Checklist, Dropdown, Radio, Text
 from .feature import FeatureSchema
@@ -72,11 +72,13 @@ class VideoObjectAnnotation(ObjectAnnotation):
         value (Geometry)
         frame (Int): The frame index that this annotation corresponds to
         keyframe (bool): Whether or not this annotation was a human generated or interpolated annotation
+        segment_index (Optional[Int]): Index of video segment this annotation belongs to
         classifications (List[ClassificationAnnotation]) = []
         extra (Dict[str, Any])
     """
     frame: int
     keyframe: bool
+    segment_index: Optional[int] = None
 
 
 class VideoClassificationAnnotation(ClassificationAnnotation):
@@ -87,6 +89,8 @@ class VideoClassificationAnnotation(ClassificationAnnotation):
         feature_schema_id (Optional[Cuid])
         value (Union[Text, Checklist, Radio, Dropdown])
         frame (int): The frame index that this annotation corresponds to
+        segment_index (Optional[int]): Index of video segment this annotation belongs to
         extra (Dict[str, Any])
     """
     frame: int
+    segment_index: Optional[int] = None
diff --git a/labelbox/data/serialization/ndjson/label.py b/labelbox/data/serialization/ndjson/label.py
@@ -88,6 +88,39 @@ def _get_consecutive_frames(
             consecutive.append((group[0], group[-1]))
         return consecutive
 
+    @classmethod
+    def _get_segment_frame_ranges(
+        cls, annotation_group: List[Union[VideoClassificationAnnotation,
+                                          VideoObjectAnnotation]]
+    ) -> List[Tuple[int, int]]:
+        sorted_frame_segment_indices = sorted([
+            (annotation.frame, annotation.segment_index)
+            for annotation in annotation_group
+            if annotation.segment_index is not None
+        ])
+        if len(sorted_frame_segment_indices) == 0:
+            # Group segment by consecutive frames, since `segment_index` is not present
+            return cls._get_consecutive_frames(
+                sorted([annotation.frame for annotation in annotation_group]))
+        elif len(sorted_frame_segment_indices) == len(annotation_group):
+            # Group segment by segment_index
+            last_segment_id = 0
+            segment_groups = defaultdict(list)
+            for frame, segment_index in sorted_frame_segment_indices:
+                if segment_index < last_segment_id:
+                    raise ValueError(
+                        f"`segment_index` must be in ascending order. Please investigate video annotation at frame, '{frame}'"
+                    )
+                segment_groups[segment_index].append(frame)
+                last_segment_id = segment_index
+            frame_ranges = []
+            for group in segment_groups.values():
+                frame_ranges.append((group[0], group[-1]))
+            return frame_ranges
+        else:
+            raise ValueError(
+                f"Video annotations cannot partially have `segment_index` set")
+
     @classmethod
     def _create_video_annotations(
         cls, label: Label
@@ -102,12 +135,12 @@ def _create_video_annotations(
                                   annot.name].append(annot)
 
         for annotation_group in video_annotations.values():
-            consecutive_frames = cls._get_consecutive_frames(
-                sorted([annotation.frame for annotation in annotation_group]))
+            segment_frame_ranges = cls._get_segment_frame_ranges(
+                annotation_group)
             if isinstance(annotation_group[0], VideoClassificationAnnotation):
                 annotation = annotation_group[0]
                 frames_data = []
-                for frames in consecutive_frames:
+                for frames in segment_frame_ranges:
                     frames_data.append({'start': frames[0], 'end': frames[-1]})
                 annotation.extra.update({'frames': frames_data})
                 yield NDClassification.from_common(annotation, label.data)
@@ -118,7 +151,7 @@ def _create_video_annotations(
                     for video object annotations
                     and will not import alongside the object annotations.""")
                 segments = []
-                for start_frame, end_frame in consecutive_frames:
+                for start_frame, end_frame in segment_frame_ranges:
                     segment = []
                     for annotation in annotation_group:
                         if annotation.keyframe and start_frame <= annotation.frame <= end_frame:
diff --git a/labelbox/data/serialization/ndjson/objects.py b/labelbox/data/serialization/ndjson/objects.py
@@ -65,9 +65,10 @@ def from_common(cls, point: Point,
 class NDFramePoint(VideoSupported):
     point: _Point
 
-    def to_common(self, name: str,
-                  feature_schema_id: Cuid) -> VideoObjectAnnotation:
+    def to_common(self, name: str, feature_schema_id: Cuid,
+                  segment_index: int) -> VideoObjectAnnotation:
         return VideoObjectAnnotation(frame=self.frame,
+                                     segment_index=segment_index,
                                      keyframe=True,
                                      name=name,
                                      feature_schema_id=feature_schema_id,
@@ -104,10 +105,11 @@ def from_common(cls, line: Line,
 class NDFrameLine(VideoSupported):
     line: List[_Point]
 
-    def to_common(self, name: str,
-                  feature_schema_id: Cuid) -> VideoObjectAnnotation:
+    def to_common(self, name: str, feature_schema_id: Cuid,
+                  segment_index: int) -> VideoObjectAnnotation:
         return VideoObjectAnnotation(
             frame=self.frame,
+            segment_index=segment_index,
             keyframe=True,
             name=name,
             feature_schema_id=feature_schema_id,
@@ -171,10 +173,11 @@ def from_common(cls, rectangle: Rectangle,
 class NDFrameRectangle(VideoSupported):
     bbox: Bbox
 
-    def to_common(self, name: str,
-                  feature_schema_id: Cuid) -> VideoObjectAnnotation:
+    def to_common(self, name: str, feature_schema_id: Cuid,
+                  segment_index: int) -> VideoObjectAnnotation:
         return VideoObjectAnnotation(
             frame=self.frame,
+            segment_index=segment_index,
             keyframe=True,
             name=name,
             feature_schema_id=feature_schema_id,
@@ -211,11 +214,13 @@ def segment_with_uuid(keyframe: Union[NDFrameRectangle, NDFramePoint,
         keyframe.extra = {'uuid': uuid}
         return keyframe
 
-    def to_common(self, name: str, feature_schema_id: Cuid, uuid: str):
+    def to_common(self, name: str, feature_schema_id: Cuid, uuid: str,
+                  segment_index: int):
         return [
             self.segment_with_uuid(
                 keyframe.to_common(name=name,
-                                   feature_schema_id=feature_schema_id), uuid)
+                                   feature_schema_id=feature_schema_id,
+                                   segment_index=segment_index), uuid)
             for keyframe in self.keyframes
         ]
 
@@ -235,11 +240,12 @@ class NDSegments(NDBaseObject):
 
     def to_common(self, name: str, feature_schema_id: Cuid):
         result = []
-        for segment in self.segments:
+        for idx, segment in enumerate(self.segments):
             result.extend(
                 NDSegment.to_common(segment,
                                     name=name,
                                     feature_schema_id=feature_schema_id,
+                                    segment_index=idx,
                                     uuid=self.uuid))
         return result
 
diff --git a/tests/data/assets/ndjson/video_import.json b/tests/data/assets/ndjson/video_import.json
@@ -30,13 +30,17 @@
                     {
                         "frame": 1,
                         "line": [{"x": 10.0, "y": 10.0}, {"x": 100.0, "y": 100.0}, {"x": 50.0, "y": 30.0}]
+                    },
+                    {
+                        "frame": 5,
+                        "line": [{"x": 15.0, "y": 10.0}, {"x": 50.0, "y": 100.0}, {"x": 50.0, "y": 30.0}]
                     }
                 ]
             },
             {
                 "keyframes": [
                     {
-                        "frame": 5,
+                        "frame": 8,
                         "line": [{"x": 100.0, "y": 10.0}, {"x": 50.0, "y": 100.0}, {"x": 50.0, "y": 30.0}]
                     }
                 ]
@@ -62,6 +66,10 @@
                     {
                         "frame": 5,
                         "point": {"x": 50.0, "y": 50.0}
+                    },
+                    {
+                        "frame": 10,
+                        "point": {"x": 10.0, "y": 50.0}
                     }
                 ]
             }
@@ -78,13 +86,17 @@
                     {
                         "frame": 1,
                         "bbox": {"top": 10.0, "left": 5.0, "height": 100.0, "width": 150.0}
+                    },
+                    {
+                        "frame": 5,
+                        "bbox": {"top": 30.0, "left": 5.0, "height": 50.0, "width": 150.0}
                     }
                 ]
             },
             {
                 "keyframes": [
                     {
-                        "frame": 5,
+                        "frame": 10,
                         "bbox": {"top": 300.0, "left": 200.0, "height": 400.0, "width": 150.0}
                     }
                 ]
diff --git a/tests/data/assets/ndjson/video_import_name_only.json b/tests/data/assets/ndjson/video_import_name_only.json
@@ -30,13 +30,17 @@
                     {
                         "frame": 1,
                         "line": [{"x": 10.0, "y": 10.0}, {"x": 100.0, "y": 100.0}, {"x": 50.0, "y": 30.0}]
+                    },
+                    {
+                        "frame": 5,
+                        "line": [{"x": 15.0, "y": 10.0}, {"x": 50.0, "y": 100.0}, {"x": 50.0, "y": 30.0}]
                     }
                 ]
             },
             {
                 "keyframes": [
                     {
-                        "frame": 5,
+                        "frame": 8,
                         "line": [{"x": 100.0, "y": 10.0}, {"x": 50.0, "y": 100.0}, {"x": 50.0, "y": 30.0}]
                     }
                 ]
@@ -62,6 +66,10 @@
                     {
                         "frame": 5,
                         "point": {"x": 50.0, "y": 50.0}
+                    },
+                    {
+                        "frame": 10,
+                        "point": {"x": 10.0, "y": 50.0}
                     }
                 ]
             }
@@ -78,13 +86,17 @@
                     {
                         "frame": 1,
                         "bbox": {"top": 10.0, "left": 5.0, "height": 100.0, "width": 150.0}
+                    },
+                    {
+                        "frame": 5,
+                        "bbox": {"top": 30.0, "left": 5.0, "height": 50.0, "width": 150.0}
                     }
                 ]
             },
             {
                 "keyframes": [
                     {
-                        "frame": 5,
+                        "frame": 10,
                         "bbox": {"top": 300.0, "left": 200.0, "height": 400.0, "width": 150.0}
                     }
                 ]

Original file line number	Diff line number	Diff line change
`@@ -30,13 +30,17 @@`
`30`	`30`	`{`
`31`	`31`	`"frame": 1,`
`32`	`32`	`"line": [{"x": 10.0, "y": 10.0}, {"x": 100.0, "y": 100.0}, {"x": 50.0, "y": 30.0}]`
	`33`	`+ },`
	`34`	`+ {`
	`35`	`+ "frame": 5,`
	`36`	`+ "line": [{"x": 15.0, "y": 10.0}, {"x": 50.0, "y": 100.0}, {"x": 50.0, "y": 30.0}]`
`33`	`37`	`}`
`34`	`38`	`]`
`35`	`39`	`},`
`36`	`40`	`{`
`37`	`41`	`"keyframes": [`
`38`	`42`	`{`
`39`		`- "frame": 5,`
	`43`	`+ "frame": 8,`
`40`	`44`	`"line": [{"x": 100.0, "y": 10.0}, {"x": 50.0, "y": 100.0}, {"x": 50.0, "y": 30.0}]`
`41`	`45`	`}`
`42`	`46`	`]`
`@@ -62,6 +66,10 @@`
`62`	`66`	`{`
`63`	`67`	`"frame": 5,`
`64`	`68`	`"point": {"x": 50.0, "y": 50.0}`
	`69`	`+ },`
	`70`	`+ {`
	`71`	`+ "frame": 10,`
	`72`	`+ "point": {"x": 10.0, "y": 50.0}`
`65`	`73`	`}`
`66`	`74`	`]`
`67`	`75`	`}`
`@@ -78,13 +86,17 @@`
`78`	`86`	`{`
`79`	`87`	`"frame": 1,`
`80`	`88`	`"bbox": {"top": 10.0, "left": 5.0, "height": 100.0, "width": 150.0}`
	`89`	`+ },`
	`90`	`+ {`
	`91`	`+ "frame": 5,`
	`92`	`+ "bbox": {"top": 30.0, "left": 5.0, "height": 50.0, "width": 150.0}`
`81`	`93`	`}`
`82`	`94`	`]`
`83`	`95`	`},`
`84`	`96`	`{`
`85`	`97`	`"keyframes": [`
`86`	`98`	`{`
`87`		`- "frame": 5,`
	`99`	`+ "frame": 10,`
`88`	`100`	`"bbox": {"top": 300.0, "left": 200.0, "height": 400.0, "width": 150.0}`
`89`	`101`	`}`
`90`	`102`	`]`