Skip to content

Commit e4fd630

Browse files
chore: PoC + ipynb
1 parent 2fab8c9 commit e4fd630

File tree

10 files changed

+1618
-4
lines changed

10 files changed

+1618
-4
lines changed

examples/annotation_import/audio_temporal.ipynb

Lines changed: 786 additions & 0 deletions
Large diffs are not rendered by default.

libs/labelbox/src/labelbox/data/annotation_types/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
from .video import MaskInstance
2020
from .video import VideoMaskAnnotation
2121

22+
from .audio import AudioClassificationAnnotation
23+
from .audio import AudioObjectAnnotation
24+
2225
from .ner import ConversationEntity
2326
from .ner import DocumentEntity
2427
from .ner import DocumentTextSelection
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
from typing import Optional
2+
3+
from labelbox.data.annotation_types.annotation import ClassificationAnnotation, ObjectAnnotation
4+
from labelbox.data.mixins import ConfidenceNotSupportedMixin, CustomMetricsNotSupportedMixin
5+
6+
7+
class AudioClassificationAnnotation(ClassificationAnnotation):
    """Audio classification for a specific time range.

    Examples:
        - Speaker identification from 2.5s to 4.1s
        - Audio quality assessment for a segment
        - Language detection for audio segments

    Args:
        name (Optional[str]): Name of the classification
        feature_schema_id (Optional[Cuid]): Feature schema identifier
        value (Union[Text, Checklist, Radio]): Classification value
        frame (int): The frame index in milliseconds (e.g., 2500 = 2.5 seconds)
        segment_index (Optional[int]): Index of audio segment this annotation belongs to
        extra (Dict[str, Any]): Additional metadata
    """

    # Frame position in milliseconds from the start of the audio.
    frame: int
    # Optional index of the audio segment this annotation belongs to.
    segment_index: Optional[int] = None

    @classmethod
    def from_time_range(cls, start_sec: float, end_sec: float, **kwargs):
        """Create an annotation from seconds (user-friendly) to frames (internal).

        Args:
            start_sec (float): Start time in seconds
            end_sec (float): End time in seconds. NOTE: currently accepted for
                API symmetry but NOT stored on the annotation (PoC limitation);
                only ``start_sec`` determines ``frame``.
            **kwargs: Additional arguments for the annotation

        Returns:
            AudioClassificationAnnotation: Annotation with frame set to
            ``start_sec`` converted to milliseconds.

        Example:
            >>> AudioClassificationAnnotation.from_time_range(
            ...     start_sec=2.5, end_sec=4.1,
            ...     name="speaker_id",
            ...     value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name="john"))
            ... )
        """
        # round() instead of int(): int() truncates toward zero, so IEEE-754
        # representation error (e.g. 2.3 * 1000 == 2299.9999999999995) would
        # silently produce frame 2299 instead of the intended 2300.
        return cls(frame=round(start_sec * 1000), **kwargs)

    @property
    def start_time(self) -> float:
        """Convert frame to seconds for user-facing APIs.

        Returns:
            float: Time in seconds (e.g., 2500 -> 2.5)
        """
        return self.frame / 1000.0
56+
57+
58+
class AudioObjectAnnotation(ObjectAnnotation, ConfidenceNotSupportedMixin, CustomMetricsNotSupportedMixin):
    """Audio object annotation for a specific time range.

    Examples:
        - Transcription: "Hello world" from 2.5s to 4.1s
        - Sound events: "Dog barking" from 10s to 12s
        - Audio segments with metadata

    Args:
        name (Optional[str]): Name of the annotation
        feature_schema_id (Optional[Cuid]): Feature schema identifier
        value (Union[TextEntity, Geometry]): Localization or text content
        frame (int): The frame index in milliseconds (e.g., 10000 = 10.0 seconds)
        keyframe (bool): Whether this is a keyframe annotation (default: True)
        segment_index (Optional[int]): Index of audio segment this annotation belongs to
        classifications (Optional[List[ClassificationAnnotation]]): Optional sub-classifications
        extra (Dict[str, Any]): Additional metadata
    """

    # Frame position in milliseconds from the start of the audio.
    frame: int
    # Audio annotations are currently single-frame, so default to keyframe.
    keyframe: bool = True
    # Optional index of the audio segment this annotation belongs to.
    segment_index: Optional[int] = None

    @classmethod
    def from_time_range(cls, start_sec: float, end_sec: float, **kwargs):
        """Create an annotation from seconds (user-friendly) to frames (internal).

        Args:
            start_sec (float): Start time in seconds
            end_sec (float): End time in seconds. NOTE: currently accepted for
                API symmetry but NOT stored on the annotation (PoC limitation);
                only ``start_sec`` determines ``frame``.
            **kwargs: Additional arguments for the annotation

        Returns:
            AudioObjectAnnotation: Annotation with frame set to ``start_sec``
            converted to milliseconds.

        Example:
            >>> AudioObjectAnnotation.from_time_range(
            ...     start_sec=10.0, end_sec=12.5,
            ...     name="transcription",
            ...     value=lb_types.TextEntity(text="Hello world")
            ... )
        """
        # round() instead of int(): int() truncates toward zero, so IEEE-754
        # representation error (e.g. 2.3 * 1000 == 2299.9999999999995) would
        # silently produce frame 2299 instead of the intended 2300.
        return cls(frame=round(start_sec * 1000), **kwargs)

    @property
    def start_time(self) -> float:
        """Convert frame to seconds for user-facing APIs.

        Returns:
            float: Time in seconds (e.g., 10000 -> 10.0)
        """
        return self.frame / 1000.0

libs/labelbox/src/labelbox/data/annotation_types/label.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from .metrics import ScalarMetric, ConfusionMatrixMetric
1414
from .video import VideoClassificationAnnotation
1515
from .video import VideoObjectAnnotation, VideoMaskAnnotation
16+
from .audio import AudioClassificationAnnotation, AudioObjectAnnotation
1617
from .mmc import MessageEvaluationTaskAnnotation
1718
from pydantic import BaseModel, field_validator
1819

@@ -44,6 +45,8 @@ class Label(BaseModel):
4445
ClassificationAnnotation,
4546
ObjectAnnotation,
4647
VideoMaskAnnotation,
48+
AudioClassificationAnnotation,
49+
AudioObjectAnnotation,
4750
ScalarMetric,
4851
ConfusionMatrixMetric,
4952
RelationshipAnnotation,
@@ -85,6 +88,27 @@ def frame_annotations(
8588
frame_dict[annotation.frame].append(annotation)
8689
return frame_dict
8790

91+
def audio_annotations_by_frame(
    self,
) -> Dict[int, List[Union[AudioObjectAnnotation, AudioClassificationAnnotation]]]:
    """Group this label's audio annotations by their frame (millisecond offset).

    Returns:
        Dict[int, List]: Dictionary mapping frame (milliseconds) to the list
        of audio annotations occurring at that frame.

    Example:
        >>> label.audio_annotations_by_frame()
        {2500: [AudioClassificationAnnotation(...)], 10000: [AudioObjectAnnotation(...)]}
    """
    audio_types = (AudioObjectAnnotation, AudioClassificationAnnotation)
    grouped = defaultdict(list)
    for annot in self.annotations:
        if isinstance(annot, audio_types):
            grouped[annot.frame].append(annot)
    return dict(grouped)
111+
88112
def add_url_to_masks(self, signer) -> "Label":
89113
"""
90114
Creates signed urls for all masks in the Label.

libs/labelbox/src/labelbox/data/serialization/ndjson/classification.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from ...annotation_types.annotation import ClassificationAnnotation
1414
from ...annotation_types.video import VideoClassificationAnnotation
15+
from ...annotation_types.audio import AudioClassificationAnnotation
1516
from ...annotation_types.llm_prompt_response.prompt import (
1617
PromptClassificationAnnotation,
1718
PromptText,
@@ -425,7 +426,7 @@ def to_common(
425426
def from_common(
426427
cls,
427428
annotation: Union[
428-
ClassificationAnnotation, VideoClassificationAnnotation
429+
ClassificationAnnotation, VideoClassificationAnnotation, AudioClassificationAnnotation
429430
],
430431
data: GenericDataRowData,
431432
) -> Union[NDTextSubclass, NDChecklistSubclass, NDRadioSubclass]:
@@ -448,7 +449,7 @@ def from_common(
448449
@staticmethod
449450
def lookup_classification(
450451
annotation: Union[
451-
ClassificationAnnotation, VideoClassificationAnnotation
452+
ClassificationAnnotation, VideoClassificationAnnotation, AudioClassificationAnnotation
452453
],
453454
) -> Union[NDText, NDChecklist, NDRadio]:
454455
return {Text: NDText, Checklist: NDChecklist, Radio: NDRadio}.get(

libs/labelbox/src/labelbox/data/serialization/ndjson/label.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
VideoMaskAnnotation,
2525
VideoObjectAnnotation,
2626
)
27+
from ...annotation_types.audio import (
28+
AudioClassificationAnnotation,
29+
AudioObjectAnnotation,
30+
)
2731
from labelbox.types import DocumentRectangle, DocumentEntity
2832
from .classification import (
2933
NDChecklistSubclass,
@@ -69,6 +73,7 @@ def from_common(
6973
yield from cls._create_relationship_annotations(label)
7074
yield from cls._create_non_video_annotations(label)
7175
yield from cls._create_video_annotations(label)
76+
yield from cls._create_audio_annotations(label)
7277

7378
@staticmethod
7479
def _get_consecutive_frames(
@@ -159,6 +164,40 @@ def _create_video_annotations(
159164
segments.append(segment)
160165
yield NDObject.from_common(segments, label.data)
161166

167+
@classmethod
def _create_audio_annotations(
    cls, label: Label
) -> Generator[Union[NDChecklistSubclass, NDRadioSubclass], None, None]:
    """Create audio annotations in NDJSON format.

    Audio annotations are grouped by feature (schema id, falling back to
    name); each annotation is then serialized individually, since audio has
    no multi-frame segments the way video does.

    Args:
        label: Label containing audio annotations to be processed

    Yields:
        NDClassification or NDObject: Audio annotations in NDJSON format
    """
    audio_annotations = defaultdict(list)
    for annot in label.annotations:
        if isinstance(
            annot, (AudioClassificationAnnotation, AudioObjectAnnotation)
        ):
            audio_annotations[annot.feature_schema_id or annot.name].append(
                annot
            )

    for annotation_group in audio_annotations.values():
        # Serialize EVERY annotation in the group. The previous PoC emitted
        # only annotation_group[0], silently dropping any additional
        # annotations that shared the same feature schema id / name.
        for annotation in annotation_group:
            if isinstance(annotation, AudioClassificationAnnotation):
                # Record the frame (milliseconds) in extra so it survives
                # classification serialization.
                annotation.extra.update({"frame": annotation.frame})
                yield NDClassification.from_common(annotation, label.data)
            elif isinstance(annotation, AudioObjectAnnotation):
                # Audio objects carry their frame via NDObject handling.
                yield NDObject.from_common(annotation, label.data)
200+
162201
@classmethod
163202
def _create_non_video_annotations(cls, label: Label):
164203
non_video_annotations = [
@@ -170,6 +209,8 @@ def _create_non_video_annotations(cls, label: Label):
170209
VideoClassificationAnnotation,
171210
VideoObjectAnnotation,
172211
VideoMaskAnnotation,
212+
AudioClassificationAnnotation,
213+
AudioObjectAnnotation,
173214
RelationshipAnnotation,
174215
),
175216
)

libs/labelbox/src/labelbox/data/serialization/ndjson/objects.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
from labelbox.data.annotation_types.video import (
1515
VideoObjectAnnotation,
1616
)
17+
from labelbox.data.annotation_types.audio import (
18+
AudioObjectAnnotation,
19+
)
1720
from labelbox.data.mixins import (
1821
ConfidenceMixin,
1922
CustomMetric,
@@ -715,6 +718,7 @@ def from_common(
715718
ObjectAnnotation,
716719
List[List[VideoObjectAnnotation]],
717720
VideoMaskAnnotation,
721+
AudioObjectAnnotation,
718722
],
719723
data: GenericDataRowData,
720724
) -> Union[
@@ -742,6 +746,9 @@ def from_common(
742746
return obj.from_common(**args)
743747
elif obj == NDVideoMasks:
744748
return obj.from_common(annotation, data)
749+
elif isinstance(annotation, AudioObjectAnnotation):
750+
# Handle audio object annotation like single video frame
751+
return cls._handle_single_audio_annotation(annotation, data)
745752

746753
subclasses = [
747754
NDSubclassification.from_common(annot)
@@ -765,6 +772,41 @@ def from_common(
765772
**optional_kwargs,
766773
)
767774

775+
@classmethod
def _handle_single_audio_annotation(cls, annotation: AudioObjectAnnotation, data: GenericDataRowData):
    """Serialize one audio object annotation as if it were a lone video frame.

    Args:
        annotation: Audio object annotation to process
        data: Data row data

    Returns:
        NDObject: Serialized audio object annotation
    """
    # Resolve the NDObject subclass that matches the annotation's value type.
    nd_type = cls.lookup_object(annotation)

    # Serialize any nested sub-classifications first.
    nested = [
        NDSubclassification.from_common(sub)
        for sub in annotation.classifications
    ]

    # Work on a copy of `extra` so the source annotation is left untouched;
    # the frame offset (milliseconds) rides along in the extra payload.
    merged_extra = dict(annotation.extra) if annotation.extra else {}
    merged_extra["frame"] = annotation.frame

    return nd_type.from_common(
        str(annotation._uuid),
        annotation.value,
        nested,
        annotation.name,
        annotation.feature_schema_id,
        merged_extra,
        data,
    )
809+
768810
@staticmethod
769811
def lookup_object(
770812
annotation: Union[ObjectAnnotation, List],

0 commit comments

Comments
 (0)