Skip to content

Commit dbcc7bf

Browse files
chore: use ms instead of s in sdk interface
1 parent e4fd630 commit dbcc7bf

File tree

3 files changed

+80

-79

lines changed

examples/annotation_import/audio_temporal.ipynb

Lines changed: 34 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@
111111
"\n",
112112
"### Audio Classification Annotations\n",
113113
"\n",
114-
"Use `AudioClassificationAnnotation` for classifications tied to specific time ranges.\n"
114+
"Use `AudioClassificationAnnotation` for classifications tied to specific time ranges. The interface now accepts milliseconds directly for precise timing control.\n"
115115
]
116116
},
117117
{
@@ -122,8 +122,8 @@
122122
"source": [
123123
"# Speaker identification for a time range\n",
124124
"speaker_annotation = lb_types.AudioClassificationAnnotation.from_time_range(\n",
125-
" start_sec=2.5, # Start at 2.5 seconds\n",
126-
" end_sec=4.1, # End at 4.1 seconds\n",
125+
" start_ms=2500, # Start at 2500 milliseconds (2.5 seconds)\n",
126+
" end_ms=4100, # End at 4100 milliseconds (4.1 seconds)\n",
127127
" name=\"speaker_id\",\n",
128128
" value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"john\"))\n",
129129
")\n",
@@ -140,8 +140,8 @@
140140
"source": [
141141
"# Audio quality assessment for a segment\n",
142142
"quality_annotation = lb_types.AudioClassificationAnnotation.from_time_range(\n",
143-
" start_sec=0.0,\n",
144-
" end_sec=10.0,\n",
143+
" start_ms=0,\n",
144+
" end_ms=10000,\n",
145145
" name=\"audio_quality\",\n",
146146
" value=lb_types.Checklist(answer=[\n",
147147
" lb_types.ClassificationAnswer(name=\"clear_audio\"),\n",
@@ -151,8 +151,8 @@
151151
"\n",
152152
"# Emotion detection for a segment\n",
153153
"emotion_annotation = lb_types.AudioClassificationAnnotation.from_time_range(\n",
154-
" start_sec=5.2,\n",
155-
" end_sec=8.7,\n",
154+
" start_ms=5200,\n",
155+
" end_ms=8700,\n",
156156
" name=\"emotion\",\n",
157157
" value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"happy\"))\n",
158158
")\n"
@@ -164,7 +164,7 @@
164164
"source": [
165165
"### Audio Object Annotations\n",
166166
"\n",
167-
"Use `AudioObjectAnnotation` for text entities like transcriptions tied to specific time ranges.\n"
167+
"Use `AudioObjectAnnotation` for text entities like transcriptions tied to specific time ranges. The interface now accepts milliseconds directly for precise timing control.\n"
168168
]
169169
},
170170
{
@@ -175,8 +175,8 @@
175175
"source": [
176176
"# Transcription with precise timestamps\n",
177177
"transcription_annotation = lb_types.AudioObjectAnnotation.from_time_range(\n",
178-
" start_sec=2.5,\n",
179-
" end_sec=4.1,\n",
178+
" start_ms=2500,\n",
179+
" end_ms=4100,\n",
180180
" name=\"transcription\",\n",
181181
" value=lb_types.TextEntity(text=\"Hello, how are you doing today?\")\n",
182182
")\n",
@@ -193,26 +193,26 @@
193193
"source": [
194194
"# Sound event detection\n",
195195
"sound_event_annotation = lb_types.AudioObjectAnnotation.from_time_range(\n",
196-
" start_sec=10.0,\n",
197-
" end_sec=12.5,\n",
196+
" start_ms=10000,\n",
197+
" end_ms=12500,\n",
198198
" name=\"sound_event\",\n",
199199
" value=lb_types.TextEntity(text=\"Dog barking in background\")\n",
200200
")\n",
201201
"\n",
202202
"# Multiple transcription segments\n",
203203
"transcription_segments = [\n",
204204
" lb_types.AudioObjectAnnotation.from_time_range(\n",
205-
" start_sec=0.0, end_sec=2.3,\n",
205+
" start_ms=0, end_ms=2300,\n",
206206
" name=\"transcription\",\n",
207207
" value=lb_types.TextEntity(text=\"Welcome to our podcast.\")\n",
208208
" ),\n",
209209
" lb_types.AudioObjectAnnotation.from_time_range(\n",
210-
" start_sec=2.5, end_sec=5.8,\n",
210+
" start_ms=2500, end_ms=5800,\n",
211211
" name=\"transcription\", \n",
212212
" value=lb_types.TextEntity(text=\"Today we're discussing AI advancements.\")\n",
213213
" ),\n",
214214
" lb_types.AudioObjectAnnotation.from_time_range(\n",
215-
" start_sec=6.0, end_sec=9.2,\n",
215+
" start_ms=6000, end_ms=9200,\n",
216216
" name=\"transcription\",\n",
217217
" value=lb_types.TextEntity(text=\"Let's start with machine learning basics.\")\n",
218218
" )\n",
@@ -238,31 +238,31 @@
238238
"podcast_annotations = [\n",
239239
" # Host introduction\n",
240240
" lb_types.AudioClassificationAnnotation.from_time_range(\n",
241-
" start_sec=0.0, end_sec=5.0,\n",
241+
" start_ms=0, end_ms=5000,\n",
242242
" name=\"speaker_id\",\n",
243243
" value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"host\"))\n",
244244
" ),\n",
245245
" lb_types.AudioObjectAnnotation.from_time_range(\n",
246-
" start_sec=0.0, end_sec=5.0,\n",
246+
" start_ms=0, end_ms=5000,\n",
247247
" name=\"transcription\",\n",
248248
" value=lb_types.TextEntity(text=\"Welcome to Tech Talk, I'm your host Sarah.\")\n",
249249
" ),\n",
250250
" \n",
251251
" # Guest response\n",
252252
" lb_types.AudioClassificationAnnotation.from_time_range(\n",
253-
" start_sec=5.2, end_sec=8.5,\n",
253+
" start_ms=5200, end_ms=8500,\n",
254254
" name=\"speaker_id\",\n",
255255
" value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"guest\"))\n",
256256
" ),\n",
257257
" lb_types.AudioObjectAnnotation.from_time_range(\n",
258-
" start_sec=5.2, end_sec=8.5,\n",
258+
" start_ms=5200, end_ms=8500,\n",
259259
" name=\"transcription\",\n",
260260
" value=lb_types.TextEntity(text=\"Thanks for having me, Sarah!\")\n",
261261
" ),\n",
262262
" \n",
263263
" # Audio quality assessment\n",
264264
" lb_types.AudioClassificationAnnotation.from_time_range(\n",
265-
" start_sec=0.0, end_sec=10.0,\n",
265+
" start_ms=0, end_ms=10000,\n",
266266
" name=\"audio_quality\",\n",
267267
" value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"excellent\"))\n",
268268
" )\n",
@@ -288,14 +288,14 @@
288288
"call_center_annotations = [\n",
289289
" # Customer sentiment analysis\n",
290290
" lb_types.AudioClassificationAnnotation.from_time_range(\n",
291-
" start_sec=0.0, end_sec=30.0,\n",
291+
" start_ms=0, end_ms=30000,\n",
292292
" name=\"customer_sentiment\",\n",
293293
" value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"frustrated\"))\n",
294294
" ),\n",
295295
" \n",
296296
" # Agent performance\n",
297297
" lb_types.AudioClassificationAnnotation.from_time_range(\n",
298-
" start_sec=30.0, end_sec=60.0,\n",
298+
" start_ms=30000, end_ms=60000,\n",
299299
" name=\"agent_performance\",\n",
300300
" value=lb_types.Checklist(answer=[\n",
301301
" lb_types.ClassificationAnswer(name=\"professional_tone\"),\n",
@@ -306,13 +306,13 @@
306306
" \n",
307307
" # Key phrases extraction\n",
308308
" lb_types.AudioObjectAnnotation.from_time_range(\n",
309-
" start_sec=15.0, end_sec=18.0,\n",
309+
" start_ms=15000, end_ms=18000,\n",
310310
" name=\"key_phrase\",\n",
311311
" value=lb_types.TextEntity(text=\"I want to speak to your manager\")\n",
312312
" ),\n",
313313
" \n",
314314
" lb_types.AudioObjectAnnotation.from_time_range(\n",
315-
" start_sec=45.0, end_sec=48.0,\n",
315+
" start_ms=45000, end_ms=48000,\n",
316316
" name=\"key_phrase\",\n",
317317
" value=lb_types.TextEntity(text=\"Thank you for your patience\")\n",
318318
" )\n",
@@ -338,7 +338,7 @@
338338
"music_annotations = [\n",
339339
" # Musical instruments\n",
340340
" lb_types.AudioClassificationAnnotation.from_time_range(\n",
341-
" start_sec=0.0, end_sec=30.0,\n",
341+
" start_ms=0, end_ms=30000,\n",
342342
" name=\"instruments\",\n",
343343
" value=lb_types.Checklist(answer=[\n",
344344
" lb_types.ClassificationAnswer(name=\"piano\"),\n",
@@ -349,20 +349,20 @@
349349
" \n",
350350
" # Genre classification\n",
351351
" lb_types.AudioClassificationAnnotation.from_time_range(\n",
352-
" start_sec=0.0, end_sec=60.0,\n",
352+
" start_ms=0, end_ms=60000,\n",
353353
" name=\"genre\",\n",
354354
" value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"classical\"))\n",
355355
" ),\n",
356356
" \n",
357357
" # Sound events\n",
358358
" lb_types.AudioObjectAnnotation.from_time_range(\n",
359-
" start_sec=25.0, end_sec=27.0,\n",
359+
" start_ms=25000, end_ms=27000,\n",
360360
" name=\"sound_event\",\n",
361361
" value=lb_types.TextEntity(text=\"Applause from audience\")\n",
362362
" ),\n",
363363
" \n",
364364
" lb_types.AudioObjectAnnotation.from_time_range(\n",
365-
" start_sec=45.0, end_sec=46.5,\n",
365+
" start_ms=45000, end_ms=46500,\n",
366366
" name=\"sound_event\",\n",
367367
" value=lb_types.TextEntity(text=\"Door closing in background\")\n",
368368
" )\n",
@@ -681,12 +681,12 @@
681681
"\n",
682682
"# Audio: 1 frame = 1 millisecond\n",
683683
"audio_annotation = lb_types.AudioClassificationAnnotation.from_time_range(\n",
684-
" start_sec=2.5, end_sec=4.1,\n",
684+
" start_ms=2500, end_ms=4100,\n",
685685
" name=\"test\", value=lb_types.Text(answer=\"test\")\n",
686686
")\n",
687687
"\n",
688688
"print(f\"Audio Annotation:\")\n",
689-
"print(f\" Time: 2.5s → Frame: {audio_annotation.frame} (milliseconds)\")\n",
689+
"print(f\" Time: 2500ms → Frame: {audio_annotation.frame} (milliseconds)\")\n",
690690
"print(f\" Frame rate: 1000 frames/second (1 frame = 1ms)\")\n",
691691
"\n",
692692
"print(f\"\\nVideo Annotation (for comparison):\")\n",
@@ -704,8 +704,8 @@
704704
"\n",
705705
"### 1. Time Precision\n",
706706
"- Audio temporal annotations use millisecond precision (1 frame = 1ms)\n",
707-
"- Always use the `from_time_range()` method for user-friendly second-based input\n",
708-
"- Frame values are automatically calculated: `frame = int(start_sec * 1000)`\n",
707+
"- Use the `from_time_range()` method with millisecond-based input for precise timing control\n",
708+
"- Frame values are set directly: `frame = start_ms`\n",
709709
"\n",
710710
"### 2. Ontology Alignment\n",
711711
"- Ensure annotation `name` fields match your ontology tool/classification names\n",
@@ -751,7 +751,7 @@
751751
"This notebook demonstrated:\n",
752752
"\n",
753753
"1. **Creating temporal audio annotations** using `AudioClassificationAnnotation` and `AudioObjectAnnotation`\n",
754-
"2. **Time-based API** with `from_time_range()` for user-friendly input\n",
754+
"2. **Millisecond-based API** with `from_time_range()` for precise timing control\n",
755755
"3. **Multiple use cases**: podcasts, call centers, music analysis\n",
756756
"4. **MAL import pipeline** for uploading temporal prelabels\n",
757757
"5. **NDJSON serialization** compatible with existing video infrastructure\n",
@@ -762,6 +762,7 @@
762762
"- **Frame-based precision** - 1ms accuracy for audio timing\n",
763763
"- **Seamless integration** - works with existing MAL and Label Import pipelines\n",
764764
"- **Flexible annotation types** - supports classifications and text entities with timestamps\n",
765+
"- **Direct millisecond input** - precise timing control without conversion overhead\n",
765766
"\n",
766767
"### Next Steps:\n",
767768
"1. Upload your temporal audio annotations using this notebook as a template\n",

libs/labelbox/src/labelbox/data/annotation_types/audio.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ class AudioClassificationAnnotation(ClassificationAnnotation):
88
"""Audio classification for specific time range
99
1010
Examples:
11-
- Speaker identification from 2.5s to 4.1s
11+
- Speaker identification from 2500ms to 4100ms
1212
- Audio quality assessment for a segment
1313
- Language detection for audio segments
1414
@@ -25,25 +25,25 @@ class AudioClassificationAnnotation(ClassificationAnnotation):
2525
segment_index: Optional[int] = None
2626

2727
@classmethod
28-
def from_time_range(cls, start_sec: float, end_sec: float, **kwargs):
29-
"""Create from seconds (user-friendly) to frames (internal)
28+
def from_time_range(cls, start_ms: int, end_ms: int, **kwargs):
29+
"""Create from milliseconds (user-friendly) to frames (internal)
3030
3131
Args:
32-
start_sec (float): Start time in seconds
33-
end_sec (float): End time in seconds
32+
start_ms (int): Start time in milliseconds
33+
end_ms (int): End time in milliseconds
3434
**kwargs: Additional arguments for the annotation
3535
3636
Returns:
37-
AudioClassificationAnnotation: Annotation with frame set to start_sec * 1000
37+
AudioClassificationAnnotation: Annotation with frame set to start_ms
3838
3939
Example:
4040
>>> AudioClassificationAnnotation.from_time_range(
41-
... start_sec=2.5, end_sec=4.1,
41+
... start_ms=2500, end_ms=4100,
4242
... name="speaker_id",
4343
... value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name="john"))
4444
... )
4545
"""
46-
return cls(frame=int(start_sec * 1000), **kwargs)
46+
return cls(frame=start_ms, **kwargs)
4747

4848
@property
4949
def start_time(self) -> float:
@@ -59,8 +59,8 @@ class AudioObjectAnnotation(ObjectAnnotation, ConfidenceNotSupportedMixin, Custo
5959
"""Audio object annotation for specific time range
6060
6161
Examples:
62-
- Transcription: "Hello world" from 2.5s to 4.1s
63-
- Sound events: "Dog barking" from 10s to 12s
62+
- Transcription: "Hello world" from 2500ms to 4100ms
63+
- Sound events: "Dog barking" from 10000ms to 12000ms
6464
- Audio segments with metadata
6565
6666
Args:
@@ -79,25 +79,25 @@ class AudioObjectAnnotation(ObjectAnnotation, ConfidenceNotSupportedMixin, Custo
7979
segment_index: Optional[int] = None
8080

8181
@classmethod
82-
def from_time_range(cls, start_sec: float, end_sec: float, **kwargs):
83-
"""Create from seconds (user-friendly) to frames (internal)
82+
def from_time_range(cls, start_ms: int, end_ms: int, **kwargs):
83+
"""Create from milliseconds (user-friendly) to frames (internal)
8484
8585
Args:
86-
start_sec (float): Start time in seconds
87-
end_sec (float): End time in seconds
86+
start_ms (int): Start time in milliseconds
87+
end_ms (int): End time in milliseconds
8888
**kwargs: Additional arguments for the annotation
8989
9090
Returns:
91-
AudioObjectAnnotation: Annotation with frame set to start_sec * 1000
91+
AudioObjectAnnotation: Annotation with frame set to start_ms
9292
9393
Example:
9494
>>> AudioObjectAnnotation.from_time_range(
95-
... start_sec=10.0, end_sec=12.5,
95+
... start_ms=10000, end_ms=12500,
9696
... name="transcription",
9797
... value=lb_types.TextEntity(text="Hello world")
9898
... )
9999
"""
100-
return cls(frame=int(start_sec * 1000), **kwargs)
100+
return cls(frame=start_ms, **kwargs)
101101

102102
@property
103103
def start_time(self) -> float:

0 commit comments

Comments
 (0)