
Commit 0d32b92

PTDT-3807: Add temporal audio annotation support (#2013)
2 parents 2fab8c9 + f49a1d8 commit 0d32b92

File tree

8 files changed (+1394 −90 lines)


examples/README.md

Lines changed: 84 additions & 84 deletions
Large diffs are not rendered by default.

examples/annotation_import/audio.ipynb

Lines changed: 42 additions & 1 deletion
@@ -170,7 +170,7 @@
 },
 {
 "metadata": {},
-"source": "ontology_builder = lb.OntologyBuilder(classifications=[\n lb.Classification(class_type=lb.Classification.Type.TEXT,\n name=\"text_audio\"),\n lb.Classification(\n class_type=lb.Classification.Type.CHECKLIST,\n name=\"checklist_audio\",\n options=[\n lb.Option(value=\"first_checklist_answer\"),\n lb.Option(value=\"second_checklist_answer\"),\n ],\n ),\n lb.Classification(\n class_type=lb.Classification.Type.RADIO,\n name=\"radio_audio\",\n options=[\n lb.Option(value=\"first_radio_answer\"),\n lb.Option(value=\"second_radio_answer\"),\n ],\n ),\n])\n\nontology = client.create_ontology(\n \"Ontology Audio Annotations\",\n ontology_builder.asdict(),\n media_type=lb.MediaType.Audio,\n)",
+"source": "ontology_builder = lb.OntologyBuilder(classifications=[\n lb.Classification(class_type=lb.Classification.Type.TEXT,\n name=\"text_audio\"),\n lb.Classification(\n class_type=lb.Classification.Type.CHECKLIST,\n name=\"checklist_audio\",\n options=[\n lb.Option(value=\"first_checklist_answer\"),\n lb.Option(value=\"second_checklist_answer\"),\n ],\n ),\n lb.Classification(\n class_type=lb.Classification.Type.RADIO,\n name=\"radio_audio\",\n options=[\n lb.Option(value=\"first_radio_answer\"),\n lb.Option(value=\"second_radio_answer\"),\n ],\n ),\n # Temporal classification for token-level annotations\n lb.Classification(\n class_type=lb.Classification.Type.TEXT,\n name=\"User Speaker\",\n scope=lb.Classification.Scope.INDEX, # INDEX scope for temporal\n ),\n])\n\nontology = client.create_ontology(\n \"Ontology Audio Annotations\",\n ontology_builder.asdict(),\n media_type=lb.MediaType.Audio,\n)",
 "cell_type": "code",
 "outputs": [],
 "execution_count": null
@@ -252,6 +252,40 @@
 ],
 "cell_type": "markdown"
 },
+{
+"metadata": {},
+"source": [
+"## Temporal Audio Annotations\n",
+"\n",
+"Labelbox supports temporal annotations for audio/video with frame-level precision using the new temporal classification API.\n",
+"\n",
+"### Key Features:\n",
+"- **Frame-based timing**: All annotations use millisecond precision\n",
+"- **Deep nesting**: Support for multi-level nested classifications (Text > Text > Text, Radio > Radio > Radio, etc.)\n",
+"- **Inductive structures**: Multiple parent values can share nested classifications that are automatically split based on frame overlap\n",
+"- **Frame validation**: Frames start at 1 (not 0) and must be non-overlapping for Text and Radio siblings\n",
+"\n",
+"### Important Constraints:\n",
+"1. **Frame indexing**: Frames are 1-based (frame 0 is invalid)\n",
+"2. **Non-overlapping siblings**: Text and Radio classifications at the same level cannot have overlapping frame ranges\n",
+"3. **Overlapping checklists**: Only Checklist answers can have overlapping frame ranges with their siblings"
+],
+"cell_type": "markdown"
+},
+{
+"metadata": {},
+"source": "# Define tokens with precise timing (from demo script)\ntokens_data = [\n (\"Hello\", 586, 770), # Hello: frames 586-770\n (\"AI\", 771, 955), # AI: frames 771-955\n (\"how\", 956, 1140), # how: frames 956-1140\n (\"are\", 1141, 1325), # are: frames 1141-1325\n (\"you\", 1326, 1510), # you: frames 1326-1510\n (\"doing\", 1511, 1695), # doing: frames 1511-1695\n (\"today\", 1696, 1880), # today: frames 1696-1880\n]\n\n# Create temporal annotations for each token\ntemporal_annotations = []\nfor token, start_frame, end_frame in tokens_data:\n token_annotation = lb_types.AudioClassificationAnnotation(\n frame=start_frame,\n end_frame=end_frame,\n name=\"User Speaker\",\n value=lb_types.Text(answer=token),\n )\n temporal_annotations.append(token_annotation)\n\nprint(f\"Created {len(temporal_annotations)} temporal token annotations\")",
+"cell_type": "code",
+"outputs": [],
+"execution_count": null
+},
+{
+"metadata": {},
+"source": "# Create label with both regular and temporal annotations\nlabel_with_temporal = []\nlabel_with_temporal.append(\n lb_types.Label(\n data={\"global_key\": global_key},\n annotations=[text_annotation, checklist_annotation, radio_annotation] +\n temporal_annotations,\n ))\n\nprint(\n f\"Created label with {len(label_with_temporal[0].annotations)} total annotations\"\n)\nprint(\" - Regular annotations: 3\")\nprint(f\" - Temporal annotations: {len(temporal_annotations)}\")",
+"cell_type": "code",
+"outputs": [],
+"execution_count": null
+},
 {
 "metadata": {},
 "source": [
@@ -260,6 +294,13 @@
 ],
 "cell_type": "markdown"
 },
+{
+"metadata": {},
+"source": "# Upload temporal annotations via MAL\ntemporal_upload_job = lb.MALPredictionImport.create_from_objects(\n client=client,\n project_id=project.uid,\n name=f\"temporal_mal_job-{str(uuid.uuid4())}\",\n predictions=label_with_temporal,\n)\n\ntemporal_upload_job.wait_until_done()\nprint(\"Temporal upload completed!\")\nprint(\"Errors:\", temporal_upload_job.errors)\nprint(\"Status:\", temporal_upload_job.statuses)",
+"cell_type": "code",
+"outputs": [],
+"execution_count": null
+},
 {
 "metadata": {},
 "source": "# Upload our label using Model-Assisted Labeling\nupload_job = lb.MALPredictionImport.create_from_objects(\n client=client,\n project_id=project.uid,\n name=f\"mal_job-{str(uuid.uuid4())}\",\n predictions=label,\n)\n\nupload_job.wait_until_done()\nprint(\"Errors:\", upload_job.errors)\nprint(\"Status of uploads: \", upload_job.statuses)",

libs/labelbox/src/labelbox/data/annotation_types/__init__.py

Lines changed: 71 additions & 0 deletions
@@ -19,6 +19,10 @@
 from .video import MaskInstance
 from .video import VideoMaskAnnotation

+from .temporal import TemporalClassificationText
+from .temporal import TemporalClassificationQuestion
+from .temporal import TemporalClassificationAnswer
+
 from .ner import ConversationEntity
 from .ner import DocumentEntity
 from .ner import DocumentTextSelection
@@ -59,3 +63,70 @@
     MessageRankingTask,
     MessageEvaluationTaskAnnotation,
 )
+
+__all__ = [
+    # Geometry
+    "Line",
+    "Point",
+    "Mask",
+    "Polygon",
+    "Rectangle",
+    "Geometry",
+    "DocumentRectangle",
+    "RectangleUnit",
+    # Annotation
+    "ClassificationAnnotation",
+    "ObjectAnnotation",
+    # Relationship
+    "RelationshipAnnotation",
+    "Relationship",
+    # Video
+    "VideoClassificationAnnotation",
+    "VideoObjectAnnotation",
+    "MaskFrame",
+    "MaskInstance",
+    "VideoMaskAnnotation",
+    # Temporal
+    "TemporalClassificationText",
+    "TemporalClassificationQuestion",
+    "TemporalClassificationAnswer",
+    # NER
+    "ConversationEntity",
+    "DocumentEntity",
+    "DocumentTextSelection",
+    "TextEntity",
+    # Classification
+    "Checklist",
+    "ClassificationAnswer",
+    "Radio",
+    "Text",
+    # Data
+    "GenericDataRowData",
+    "MaskData",
+    # Label
+    "Label",
+    "LabelGenerator",
+    # Metrics
+    "ScalarMetric",
+    "ScalarMetricAggregation",
+    "ConfusionMatrixMetric",
+    "ConfusionMatrixAggregation",
+    "ScalarMetricValue",
+    "ConfusionMatrixMetricValue",
+    # Tiled Image
+    "EPSG",
+    "EPSGTransformer",
+    "TiledBounds",
+    "TiledImageData",
+    "TileLayer",
+    # LLM Prompt Response
+    "PromptText",
+    "PromptClassificationAnnotation",
+    # MMC
+    "MessageInfo",
+    "OrderedMessageInfo",
+    "MessageSingleSelectionTask",
+    "MessageMultiSelectionTask",
+    "MessageRankingTask",
+    "MessageEvaluationTaskAnnotation",
+]
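With these re-exports in place, downstream code can pull the temporal types straight from the package root. A minimal usage sketch based on the __all__ additions above (the module path follows from the file's location under libs/labelbox/src):

from labelbox.data.annotation_types import (
    TemporalClassificationText,
    TemporalClassificationQuestion,
    TemporalClassificationAnswer,
)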

libs/labelbox/src/labelbox/data/annotation_types/label.py

Lines changed: 67 additions & 4 deletions
@@ -13,6 +13,10 @@
 from .metrics import ScalarMetric, ConfusionMatrixMetric
 from .video import VideoClassificationAnnotation
 from .video import VideoObjectAnnotation, VideoMaskAnnotation
+from .temporal import (
+    TemporalClassificationText,
+    TemporalClassificationQuestion,
+)
 from .mmc import MessageEvaluationTaskAnnotation
 from pydantic import BaseModel, field_validator

@@ -44,6 +48,8 @@ class Label(BaseModel):
             ClassificationAnnotation,
             ObjectAnnotation,
             VideoMaskAnnotation,
+            TemporalClassificationText,
+            TemporalClassificationQuestion,
             ScalarMetric,
             ConfusionMatrixMetric,
             RelationshipAnnotation,
@@ -63,8 +69,22 @@ def validate_data(cls, data):
     def object_annotations(self) -> List[ObjectAnnotation]:
         return self._get_annotations_by_type(ObjectAnnotation)

-    def classification_annotations(self) -> List[ClassificationAnnotation]:
-        return self._get_annotations_by_type(ClassificationAnnotation)
+    def classification_annotations(
+        self,
+    ) -> List[
+        Union[
+            ClassificationAnnotation,
+            TemporalClassificationText,
+            TemporalClassificationQuestion,
+        ]
+    ]:
+        return self._get_annotations_by_type(
+            (
+                ClassificationAnnotation,
+                TemporalClassificationText,
+                TemporalClassificationQuestion,
+            )
+        )

     def _get_annotations_by_type(self, annotation_type):
         return [
@@ -75,15 +95,58 @@ def _get_annotations_by_type(self, annotation_type):

     def frame_annotations(
         self,
-    ) -> Dict[str, Union[VideoObjectAnnotation, VideoClassificationAnnotation]]:
+    ) -> Dict[
+        Union[int, None],
+        List[
+            Union[
+                VideoObjectAnnotation,
+                VideoClassificationAnnotation,
+                TemporalClassificationText,
+                TemporalClassificationQuestion,
+            ]
+        ],
+    ]:
+        """Get temporal annotations organized by frame
+
+        Returns:
+            Dict[int, List]: Dictionary mapping frame (milliseconds) to list of temporal annotations
+
+        Example:
+            >>> label.frame_annotations()
+            {2500: [VideoClassificationAnnotation(...), TemporalClassificationText(...)]}
+
+        Note:
+            For TemporalClassificationText/Question, returns dictionary mapping to start of first frame range.
+            These annotations may have multiple discontinuous frame ranges.
+        """
         frame_dict = defaultdict(list)
         for annotation in self.annotations:
             if isinstance(
                 annotation,
                 (VideoObjectAnnotation, VideoClassificationAnnotation),
             ):
                 frame_dict[annotation.frame].append(annotation)
-        return frame_dict
+            elif isinstance(
+                annotation,
+                (TemporalClassificationText, TemporalClassificationQuestion),
+            ):
+                # For temporal annotations with multiple values/answers, use first frame
+                if (
+                    isinstance(annotation, TemporalClassificationText)
+                    and annotation.value
+                ):
+                    frame_dict[annotation.value[0][0]].append(
+                        annotation
+                    )  # value[0][0] is start_frame
+                elif (
+                    isinstance(annotation, TemporalClassificationQuestion)
+                    and annotation.value
+                ):
+                    if annotation.value[0].frames:
+                        frame_dict[annotation.value[0].frames[0][0]].append(
+                            annotation
+                        )  # frames[0][0] is start_frame
+        return dict(frame_dict)

     def add_url_to_masks(self, signer) -> "Label":
         """
