
Commit 478fb23

chore: final nail; interface is simple and works with frame arg
1 parent 471c618 commit 478fb23

3 files changed: 14 additions & 12 deletions

examples/annotation_import/audio.ipynb

Lines changed: 2 additions & 2 deletions
@@ -259,14 +259,14 @@
 },
 {
 "metadata": {},
-"source": "# Define tokens with precise timing (from demo script)\ntokens_data = [\n (\"Hello\", 586, 770), # Hello: frames 586-770\n (\"AI\", 771, 955), # AI: frames 771-955\n (\"how\", 956, 1140), # how: frames 956-1140\n (\"are\", 1141, 1325), # are: frames 1141-1325\n (\"you\", 1326, 1510), # you: frames 1326-1510\n (\"doing\", 1511, 1695), # doing: frames 1511-1695\n (\"today\", 1696, 1880), # today: frames 1696-1880\n]\n\n# Create temporal annotations for each token using NEW frames interface\ntemporal_annotations = []\nfor token, start_frame, end_frame in tokens_data:\n token_annotation = lb_types.AudioClassificationAnnotation(\n frames=[lb_types.FrameLocation(start=start_frame, end=end_frame)],\n name=\"User Speaker\",\n value=lb_types.Text(answer=token),\n )\n temporal_annotations.append(token_annotation)\n\nprint(f\"Created {len(temporal_annotations)} temporal token annotations\")",
+"source": "# Define tokens with precise timing\ntokens_data = [\n (\"Hello\", 586, 770), # Hello: frames 586-770\n (\"AI\", 771, 955), # AI: frames 771-955\n (\"how\", 956, 1140), # how: frames 956-1140\n (\"are\", 1141, 1325), # are: frames 1141-1325\n (\"you\", 1326, 1510), # you: frames 1326-1510\n (\"doing\", 1511, 1695), # doing: frames 1511-1695\n (\"today\", 1696, 1880), # today: frames 1696-1880\n]\n\n# Create temporal annotations for each token\ntemporal_annotations = []\nfor token, start_frame, end_frame in tokens_data:\n token_annotation = lb_types.AudioClassificationAnnotation(\n start_frame=start_frame,\n end_frame=end_frame,\n name=\"User Speaker\",\n value=lb_types.Text(answer=token),\n )\n temporal_annotations.append(token_annotation)\n\nprint(f\"Created {len(temporal_annotations)} temporal token annotations\")",
 "cell_type": "code",
 "outputs": [],
 "execution_count": null
 },
 {
 "metadata": {},
-"source": "# Create label with regular and temporal annotations\nlabel_with_temporal = []\nlabel_with_temporal.append(\n lb_types.Label(\n data={\"global_key\": global_key},\n annotations=[text_annotation, checklist_annotation, radio_annotation] +\n temporal_annotations,\n ))\n\nprint(\n f\"Created label with {len(label_with_temporal[0].annotations)} total annotations\"\n)\nprint(f\" - Regular annotations: 3\")\nprint(f\" - Temporal annotations: {len(temporal_annotations)}\")\n\n# Example: Nested temporal annotation with explicit frame matching\n# Structure: Speaker -> Transcription -> Emotion -> Intensity\n# Each level can have different frame ranges (subsets of parent)\nnested_temporal_annotation = lb_types.AudioClassificationAnnotation(\n frames=[lb_types.FrameLocation(start=100, end=500)],\n name=\"Speaker Analysis\",\n value=lb_types.Radio(\n answer=lb_types.ClassificationAnswer(\n name=\"User\",\n classifications=[\n lb_types.ClassificationAnnotation(\n name=\"Transcription\",\n value=lb_types.Text(answer=\"Hello there\"),\n frames=[lb_types.FrameLocation(start=100, end=500)],\n classifications=[\n lb_types.ClassificationAnnotation(\n name=\"Emotion\",\n value=lb_types.Radio(\n answer=lb_types.ClassificationAnswer(\n name=\"happy\",\n frames=[lb_types.FrameLocation(start=150, end=450)],\n classifications=[\n lb_types.ClassificationAnnotation(\n name=\"Intensity\",\n value=lb_types.Radio(\n answer=lb_types.ClassificationAnswer(\n name=\"high\",\n frames=[lb_types.FrameLocation(start=200, end=400)]\n )\n )\n )\n ]\n )\n ),\n frames=[lb_types.FrameLocation(start=150, end=450)]\n )\n ]\n )\n ]\n )\n )\n)\n\nprint(\"\\nNested temporal annotation created:\")\nprint(\" - Speaker: 100-500ms\")\nprint(\" → Transcription: 100-500ms\")\nprint(\" → Emotion: 150-450ms (subset)\")\nprint(\" → Intensity: 200-400ms (subset)\")\n",
+"source": "# Create label with regular and temporal annotations\nlabel_with_temporal = []\nlabel_with_temporal.append(\n lb_types.Label(\n data={\"global_key\": global_key},\n annotations=[text_annotation, checklist_annotation, radio_annotation] +\n temporal_annotations,\n ))\n\nprint(\n f\"Created label with {len(label_with_temporal[0].annotations)} total annotations\"\n)\nprint(f\" - Regular annotations: 3\")\nprint(f\" - Temporal annotations: {len(temporal_annotations)}\")\n\n# Example: Nested temporal annotation with hierarchical classifications\n# Structure: Speaker -> Transcription -> Emotion -> Intensity\n# Parent uses start_frame/end_frame, nested items use frames for discontinuous ranges\nnested_temporal_annotation = lb_types.AudioClassificationAnnotation(\n start_frame=100,\n end_frame=500,\n name=\"Speaker Analysis\",\n value=lb_types.Radio(\n answer=lb_types.ClassificationAnswer(\n name=\"User\",\n classifications=[\n lb_types.ClassificationAnnotation(\n name=\"Transcription\",\n value=lb_types.Text(answer=\"Hello there\"),\n frames=[lb_types.FrameLocation(start=100, end=500)],\n classifications=[\n lb_types.ClassificationAnnotation(\n name=\"Emotion\",\n value=lb_types.Radio(\n answer=lb_types.ClassificationAnswer(\n name=\"happy\",\n frames=[lb_types.FrameLocation(start=150, end=450)],\n classifications=[\n lb_types.ClassificationAnnotation(\n name=\"Intensity\",\n value=lb_types.Radio(\n answer=lb_types.ClassificationAnswer(\n name=\"high\",\n frames=[lb_types.FrameLocation(start=200, end=400)]\n )\n )\n )\n ]\n )\n ),\n frames=[lb_types.FrameLocation(start=150, end=450)]\n )\n ]\n )\n ]\n )\n )\n)\n\nprint(\"\\nNested temporal annotation created:\")\nprint(\" - Speaker: 100-500ms (parent range)\")\nprint(\" → Transcription: 100-500ms\")\nprint(\" → Emotion: 150-450ms (nested subset)\")\nprint(\" → Intensity: 200-400ms (nested subset)\")\n",
 "cell_type": "code",
 "outputs": [],
 "execution_count": null

libs/labelbox/src/labelbox/data/annotation_types/audio.py

Lines changed: 12 additions & 3 deletions
@@ -8,7 +8,7 @@
 
 
 class AudioClassificationAnnotation(ClassificationAnnotation):
-    """Audio classification for specific time range(s)
+    """Audio classification for specific time range
 
     Examples:
     - Speaker identification from 2500ms to 4100ms
@@ -19,10 +19,19 @@ class AudioClassificationAnnotation(ClassificationAnnotation):
         name (Optional[str]): Name of the classification
         feature_schema_id (Optional[Cuid]): Feature schema identifier
         value (Union[Text, Checklist, Radio]): Classification value
-        frames (Optional[List[FrameLocation]]): List of frame ranges (in milliseconds)
+        start_frame (Optional[int]): Start frame in milliseconds
+        end_frame (Optional[int]): End frame in milliseconds
         segment_index (Optional[int]): Index of audio segment this annotation belongs to
         extra (Dict[str, Any]): Additional metadata
+
+    Note:
+        Parent AudioClassificationAnnotation uses start_frame/end_frame (single range).
+        Nested classifications/answers use frames: List[FrameLocation] for discontinuous ranges.
+        Multiple time ranges for same classification = multiple separate annotation objects.
     """
 
-    frames: Optional[List[FrameLocation]] = None
+    start_frame: Optional[int] = Field(
+        default=None, validation_alias=AliasChoices("start_frame", "frame")
+    )
+    end_frame: Optional[int] = None
     segment_index: Optional[int] = None
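
The docstring Note above has two practical consequences worth spelling out: a parent annotation carries exactly one contiguous range, so discontinuous occurrences of the same classification become separate objects, and the AliasChoices("start_frame", "frame") validation alias lets callers spell the keyword as frame (the "works with frame arg" part of the commit message). A hedged sketch with illustrative values, not taken from the repo's tests:

import labelbox.types as lb_types

# Two occurrences of the same classification at different times are expressed
# as two separate parent annotations rather than one annotation with a list of ranges.
first_mention = lb_types.AudioClassificationAnnotation(
    start_frame=1000,
    end_frame=2000,
    name="User Speaker",
    value=lb_types.Text(answer="Hello"),
)
second_mention = lb_types.AudioClassificationAnnotation(
    start_frame=5000,
    end_frame=5400,
    name="User Speaker",
    value=lb_types.Text(answer="again"),
)

# Because of the validation alias, "frame" is accepted as a synonym for "start_frame".
aliased = lb_types.AudioClassificationAnnotation(
    frame=1000,
    end_frame=2000,
    name="User Speaker",
    value=lb_types.Text(answer="Hello"),
)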

libs/labelbox/src/labelbox/data/annotation_types/classification/classification.py

Lines changed: 0 additions & 7 deletions
@@ -34,10 +34,6 @@ class ClassificationAnswer(FeatureSchema, ConfidenceMixin, CustomMetricsMixin):
     classifications: Optional[List["ClassificationAnnotation"]] = None
     frames: Optional[List[FrameLocation]] = None
 
-    # Deprecated: use frames instead
-    start_frame: Optional[int] = None
-    end_frame: Optional[int] = None
-
 
 class Radio(ConfidenceMixin, CustomMetricsMixin, BaseModel):
     """A classification with only one selected option allowed
@@ -92,6 +88,3 @@ class ClassificationAnnotation(
     message_id: Optional[str] = None
     frames: Optional[List[FrameLocation]] = None
 
-    # Deprecated: use frames instead
-    start_frame: Optional[int] = None
-    end_frame: Optional[int] = None
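
With the deprecated fields removed, nested ClassificationAnnotation and ClassificationAnswer objects carry timing only through frames: List[FrameLocation], which is what allows a child to cover several discontinuous ranges inside its parent's start_frame/end_frame window. A small sketch using the same type names as the notebook example above (the split ranges are illustrative assumptions):

import labelbox.types as lb_types

# A nested child classification may list more than one FrameLocation,
# e.g. an emotion present in only two stretches of the parent range.
emotion = lb_types.ClassificationAnnotation(
    name="Emotion",
    value=lb_types.Radio(
        answer=lb_types.ClassificationAnswer(
            name="happy",
            frames=[
                lb_types.FrameLocation(start=150, end=300),
                lb_types.FrameLocation(start=350, end=450),
            ],
        )
    ),
    frames=[lb_types.FrameLocation(start=150, end=450)],
)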
