Skip to content

Commit 16896fd

Browse files
chore: add support for temporal text/radio/checklist classifications
1 parent ff298d4 commit 16896fd

File tree

4 files changed

+60
-69
lines changed

4 files changed

+60
-69
lines changed

examples/annotation_import/audio_temporal.ipynb

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,11 @@
4949
"\n",
5050
"## Key Features\n",
5151
"\n",
52-
"- **Time-based API**: Use seconds for user-friendly input\n",
53-
"- **Frame-based storage**: Internally uses milliseconds (1 frame = 1ms)\n",
52+
"- **Millisecond-based API**: Direct millisecond input for precise timing control\n",
53+
"- **Video-compatible structure**: Matches video temporal annotation pattern exactly\n",
54+
"- **Keyframe serialization**: Proper NDJSON structure for frontend timeline display\n",
5455
"- **MAL compatible**: Works with existing Model-Assisted Labeling pipeline\n",
55-
"- **UI compatible**: Uses existing video timeline components\n",
56+
"- **UI compatible**: Uses existing video timeline components seamlessly\n",
5657
"\n",
5758
"## Import Methods\n",
5859
"\n",

libs/labelbox/src/labelbox/data/annotation_types/audio.py

Lines changed: 4 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -17,42 +17,14 @@ class AudioClassificationAnnotation(ClassificationAnnotation):
1717
feature_schema_id (Optional[Cuid]): Feature schema identifier
1818
value (Union[Text, Checklist, Radio]): Classification value
1919
frame (int): The frame index in milliseconds (e.g., 2500 = 2.5 seconds)
20+
end_frame (Optional[int]): End frame in milliseconds (for time ranges)
2021
segment_index (Optional[int]): Index of audio segment this annotation belongs to
2122
extra (Dict[str, Any]): Additional metadata
2223
"""
2324

2425
frame: int
26+
end_frame: Optional[int] = None
2527
segment_index: Optional[int] = None
26-
27-
@classmethod
28-
def from_time_range(cls, start_ms: int, end_ms: int, **kwargs):
29-
"""Create from milliseconds (user-friendly) to frames (internal)
30-
31-
Args:
32-
start_ms (int): Start time in milliseconds
33-
end_ms (int): End time in milliseconds
34-
**kwargs: Additional arguments for the annotation
35-
36-
Returns:
37-
AudioClassificationAnnotation: Annotation with frame set to start_ms
38-
39-
Example:
40-
>>> AudioClassificationAnnotation.from_time_range(
41-
... start_ms=2500, end_ms=4100,
42-
... name="speaker_id",
43-
... value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name="john"))
44-
... )
45-
"""
46-
return cls(frame=start_ms, **kwargs)
47-
48-
@property
49-
def start_time(self) -> float:
50-
"""Convert frame to seconds for user-facing APIs
51-
52-
Returns:
53-
float: Time in seconds (e.g., 2500 -> 2.5)
54-
"""
55-
return self.frame / 1000.0
5628

5729

5830
class AudioObjectAnnotation(ObjectAnnotation, ConfidenceNotSupportedMixin, CustomMetricsNotSupportedMixin):
@@ -68,42 +40,14 @@ class AudioObjectAnnotation(ObjectAnnotation, ConfidenceNotSupportedMixin, Custo
6840
feature_schema_id (Optional[Cuid]): Feature schema identifier
6941
value (Union[TextEntity, Geometry]): Localization or text content
7042
frame (int): The frame index in milliseconds (e.g., 10000 = 10.0 seconds)
43+
end_frame (Optional[int]): End frame in milliseconds (for time ranges)
7144
keyframe (bool): Whether this is a keyframe annotation (default: True)
7245
segment_index (Optional[int]): Index of audio segment this annotation belongs to
7346
classifications (Optional[List[ClassificationAnnotation]]): Optional sub-classifications
7447
extra (Dict[str, Any]): Additional metadata
7548
"""
7649

7750
frame: int
51+
end_frame: Optional[int] = None
7852
keyframe: bool = True
7953
segment_index: Optional[int] = None
80-
81-
@classmethod
82-
def from_time_range(cls, start_ms: int, end_ms: int, **kwargs):
83-
"""Create from milliseconds (user-friendly) to frames (internal)
84-
85-
Args:
86-
start_ms (int): Start time in milliseconds
87-
end_ms (int): End time in milliseconds
88-
**kwargs: Additional arguments for the annotation
89-
90-
Returns:
91-
AudioObjectAnnotation: Annotation with frame set to start_ms
92-
93-
Example:
94-
>>> AudioObjectAnnotation.from_time_range(
95-
... start_ms=10000, end_ms=12500,
96-
... name="transcription",
97-
... value=lb_types.TextEntity(text="Hello world")
98-
... )
99-
"""
100-
return cls(frame=start_ms, **kwargs)
101-
102-
@property
103-
def start_time(self) -> float:
104-
"""Convert frame to seconds for user-facing APIs
105-
106-
Returns:
107-
float: Time in seconds (e.g., 10000 -> 10.0)
108-
"""
109-
return self.frame / 1000.0

libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ def from_common(
224224
# ====== End of subclasses
225225

226226

227-
class NDText(NDAnnotation, NDTextSubclass):
227+
class NDText(NDAnnotation, NDTextSubclass, VideoSupported):
228228
@classmethod
229229
def from_common(
230230
cls,
@@ -243,6 +243,7 @@ def from_common(
243243
name=name,
244244
schema_id=feature_schema_id,
245245
uuid=uuid,
246+
frames=extra.get("frames"),
246247
message_id=message_id,
247248
confidence=text.confidence,
248249
custom_metrics=text.custom_metrics,

libs/labelbox/src/labelbox/data/serialization/ndjson/label.py

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -186,12 +186,57 @@ def _create_audio_annotations(
186186
)
187187

188188
for annotation_group in audio_annotations.values():
189-
# For audio, treat each annotation as a single frame (no segments needed)
190189
if isinstance(annotation_group[0], AudioClassificationAnnotation):
191-
annotation = annotation_group[0]
192-
# Add frame information to extra (milliseconds)
193-
annotation.extra.update({"frame": annotation.frame})
194-
yield NDClassification.from_common(annotation, label.data)
190+
# For TEXT classifications, group them into one feature with multiple keyframes
191+
from ...annotation_types.classification.classification import Text
192+
if isinstance(annotation_group[0].value, Text):
193+
194+
# Group all annotations into one feature with multiple keyframes
195+
# Use first annotation as template but create combined content
196+
annotation = annotation_group[0]
197+
frames_data = []
198+
all_tokens = []
199+
200+
for individual_annotation in annotation_group:
201+
frame = individual_annotation.frame
202+
end_frame = individual_annotation.end_frame if hasattr(individual_annotation, 'end_frame') and individual_annotation.end_frame is not None else frame
203+
frames_data.append({"start": frame, "end": end_frame})
204+
all_tokens.append(individual_annotation.value.answer)
205+
206+
# For per-token annotations, embed token mapping in the content
207+
# Create a JSON structure that includes both the default text and token mapping
208+
import json
209+
token_mapping = {}
210+
for individual_annotation in annotation_group:
211+
frame = individual_annotation.frame
212+
token_mapping[str(frame)] = individual_annotation.value.answer
213+
214+
# Embed token mapping in the answer field as JSON
215+
content_with_mapping = {
216+
"default_text": " ".join(all_tokens), # Fallback text
217+
"token_mapping": token_mapping # Per-keyframe content
218+
}
219+
from ...annotation_types.classification.classification import Text
220+
annotation.value = Text(answer=json.dumps(content_with_mapping))
221+
222+
# Update the annotation with frames data
223+
annotation.extra = {"frames": frames_data}
224+
yield NDClassification.from_common(annotation, label.data)
225+
else:
226+
# For non-TEXT classifications, process each individually
227+
for annotation in annotation_group:
228+
229+
# Ensure frame data is properly formatted in extra field
230+
if hasattr(annotation, 'frame') and annotation.frame is not None:
231+
if not annotation.extra:
232+
annotation.extra = {}
233+
234+
if 'frames' not in annotation.extra:
235+
end_frame = annotation.end_frame if hasattr(annotation, 'end_frame') and annotation.end_frame is not None else annotation.frame
236+
frames_data = [{"start": annotation.frame, "end": end_frame}]
237+
annotation.extra.update({"frames": frames_data})
238+
239+
yield NDClassification.from_common(annotation, label.data)
195240

196241
elif isinstance(annotation_group[0], AudioObjectAnnotation):
197242
# For audio objects, treat like single video frame

0 commit comments

Comments (0)