|
111 | 111 | "\n",
112 | 112 | "### Audio Classification Annotations\n",
113 | 113 | "\n",
114 | | - "Use `AudioClassificationAnnotation` for classifications tied to specific time ranges.\n"
| 114 | + "Use `AudioClassificationAnnotation` for classifications tied to specific time ranges. The interface now accepts milliseconds directly for precise timing control.\n"
115 | 115 | ]
116 | 116 | },
117 | 117 | {
|
122 | 122 | "source": [ |
123 | 123 | "# Speaker identification for a time range\n", |
124 | 124 | "speaker_annotation = lb_types.AudioClassificationAnnotation.from_time_range(\n", |
125 | | - " start_sec=2.5, # Start at 2.5 seconds\n", |
126 | | - " end_sec=4.1, # End at 4.1 seconds\n", |
| 125 | + " start_ms=2500, # Start at 2500 milliseconds (2.5 seconds)\n", |
| 126 | + " end_ms=4100, # End at 4100 milliseconds (4.1 seconds)\n", |
127 | 127 | " name=\"speaker_id\",\n", |
128 | 128 | " value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"john\"))\n", |
129 | 129 | ")\n", |
|
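If existing call sites still hold second-based values from the previous `start_sec`/`end_sec` interface, the conversion is mechanical. A minimal sketch; `sec_to_ms` is an illustrative helper, not part of the SDK:

```python
def sec_to_ms(seconds: float) -> int:
    """Convert a second-based timestamp to integer milliseconds."""
    return int(round(seconds * 1000))

# An old call site such as start_sec=2.5 becomes start_ms=sec_to_ms(2.5) -> 2500
```
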
140 | 140 | "source": [ |
141 | 141 | "# Audio quality assessment for a segment\n", |
142 | 142 | "quality_annotation = lb_types.AudioClassificationAnnotation.from_time_range(\n", |
143 | | - " start_sec=0.0,\n", |
144 | | - " end_sec=10.0,\n", |
| 143 | + " start_ms=0,\n", |
| 144 | + " end_ms=10000,\n", |
145 | 145 | " name=\"audio_quality\",\n", |
146 | 146 | " value=lb_types.Checklist(answer=[\n", |
147 | 147 | " lb_types.ClassificationAnswer(name=\"clear_audio\"),\n", |
|
151 | 151 | "\n", |
152 | 152 | "# Emotion detection for a segment\n", |
153 | 153 | "emotion_annotation = lb_types.AudioClassificationAnnotation.from_time_range(\n", |
154 | | - " start_sec=5.2,\n", |
155 | | - " end_sec=8.7,\n", |
| 154 | + " start_ms=5200,\n", |
| 155 | + " end_ms=8700,\n", |
156 | 156 | " name=\"emotion\",\n", |
157 | 157 | " value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"happy\"))\n", |
158 | 158 | ")\n" |
|
164 | 164 | "source": [ |
165 | 165 | "### Audio Object Annotations\n", |
166 | 166 | "\n", |
167 | | - "Use `AudioObjectAnnotation` for text entities like transcriptions tied to specific time ranges.\n" |
| 167 | + "Use `AudioObjectAnnotation` for text entities like transcriptions tied to specific time ranges. The interface now accepts milliseconds directly for precise timing control.\n" |
168 | 168 | ] |
169 | 169 | }, |
170 | 170 | { |
|
175 | 175 | "source": [ |
176 | 176 | "# Transcription with precise timestamps\n", |
177 | 177 | "transcription_annotation = lb_types.AudioObjectAnnotation.from_time_range(\n", |
178 | | - " start_sec=2.5,\n", |
179 | | - " end_sec=4.1,\n", |
| 178 | + " start_ms=2500,\n", |
| 179 | + " end_ms=4100,\n", |
180 | 180 | " name=\"transcription\",\n", |
181 | 181 | " value=lb_types.TextEntity(text=\"Hello, how are you doing today?\")\n", |
182 | 182 | ")\n", |
|
193 | 193 | "source": [ |
194 | 194 | "# Sound event detection\n", |
195 | 195 | "sound_event_annotation = lb_types.AudioObjectAnnotation.from_time_range(\n", |
196 | | - " start_sec=10.0,\n", |
197 | | - " end_sec=12.5,\n", |
| 196 | + " start_ms=10000,\n", |
| 197 | + " end_ms=12500,\n", |
198 | 198 | " name=\"sound_event\",\n", |
199 | 199 | " value=lb_types.TextEntity(text=\"Dog barking in background\")\n", |
200 | 200 | ")\n", |
201 | 201 | "\n", |
202 | 202 | "# Multiple transcription segments\n", |
203 | 203 | "transcription_segments = [\n", |
204 | 204 | " lb_types.AudioObjectAnnotation.from_time_range(\n", |
205 | | - " start_sec=0.0, end_sec=2.3,\n", |
| 205 | + " start_ms=0, end_ms=2300,\n", |
206 | 206 | " name=\"transcription\",\n", |
207 | 207 | " value=lb_types.TextEntity(text=\"Welcome to our podcast.\")\n", |
208 | 208 | " ),\n", |
209 | 209 | " lb_types.AudioObjectAnnotation.from_time_range(\n", |
210 | | - " start_sec=2.5, end_sec=5.8,\n", |
| 210 | + " start_ms=2500, end_ms=5800,\n", |
211 | 211 | " name=\"transcription\", \n", |
212 | 212 | " value=lb_types.TextEntity(text=\"Today we're discussing AI advancements.\")\n", |
213 | 213 | " ),\n", |
214 | 214 | " lb_types.AudioObjectAnnotation.from_time_range(\n", |
215 | | - " start_sec=6.0, end_sec=9.2,\n", |
| 215 | + " start_ms=6000, end_ms=9200,\n", |
216 | 216 | " name=\"transcription\",\n", |
217 | 217 | " value=lb_types.TextEntity(text=\"Let's start with machine learning basics.\")\n", |
218 | 218 | " )\n", |
|
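Segment lists like the one above are usually generated from data rather than written out by hand. A hedged sketch, assuming the notebook's `lb_types` alias (`import labelbox.types as lb_types`) and an illustrative `(start_ms, end_ms, text)` tuple list:

```python
import labelbox.types as lb_types

# Illustrative ASR-style output: (start_ms, end_ms, text) per segment
segments = [
    (0, 2300, "Welcome to our podcast."),
    (2500, 5800, "Today we're discussing AI advancements."),
    (6000, 9200, "Let's start with machine learning basics."),
]

transcription_segments = [
    lb_types.AudioObjectAnnotation.from_time_range(
        start_ms=start, end_ms=end,
        name="transcription",
        value=lb_types.TextEntity(text=text),
    )
    for start, end, text in segments
]
```
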
238 | 238 | "podcast_annotations = [\n", |
239 | 239 | " # Host introduction\n", |
240 | 240 | " lb_types.AudioClassificationAnnotation.from_time_range(\n", |
241 | | - " start_sec=0.0, end_sec=5.0,\n", |
| 241 | + " start_ms=0, end_ms=5000,\n", |
242 | 242 | " name=\"speaker_id\",\n", |
243 | 243 | " value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"host\"))\n", |
244 | 244 | " ),\n", |
245 | 245 | " lb_types.AudioObjectAnnotation.from_time_range(\n", |
246 | | - " start_sec=0.0, end_sec=5.0,\n", |
| 246 | + " start_ms=0, end_ms=5000,\n", |
247 | 247 | " name=\"transcription\",\n", |
248 | 248 | " value=lb_types.TextEntity(text=\"Welcome to Tech Talk, I'm your host Sarah.\")\n", |
249 | 249 | " ),\n", |
250 | 250 | " \n", |
251 | 251 | " # Guest response\n", |
252 | 252 | " lb_types.AudioClassificationAnnotation.from_time_range(\n", |
253 | | - " start_sec=5.2, end_sec=8.5,\n", |
| 253 | + " start_ms=5200, end_ms=8500,\n", |
254 | 254 | " name=\"speaker_id\",\n", |
255 | 255 | " value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"guest\"))\n", |
256 | 256 | " ),\n", |
257 | 257 | " lb_types.AudioObjectAnnotation.from_time_range(\n", |
258 | | - " start_sec=5.2, end_sec=8.5,\n", |
| 258 | + " start_ms=5200, end_ms=8500,\n", |
259 | 259 | " name=\"transcription\",\n", |
260 | 260 | " value=lb_types.TextEntity(text=\"Thanks for having me, Sarah!\")\n", |
261 | 261 | " ),\n", |
262 | 262 | " \n", |
263 | 263 | " # Audio quality assessment\n", |
264 | 264 | " lb_types.AudioClassificationAnnotation.from_time_range(\n", |
265 | | - " start_sec=0.0, end_sec=10.0,\n", |
| 265 | + " start_ms=0, end_ms=10000,\n", |
266 | 266 | " name=\"audio_quality\",\n", |
267 | 267 | " value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"excellent\"))\n", |
268 | 268 | " )\n", |
|
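Before upload, annotation lists like `podcast_annotations` are attached to a `Label` for their data row. A hedged sketch; the global key is hypothetical, and the exact shape of the `data` payload can vary by SDK version:

```python
import labelbox.types as lb_types

label = lb_types.Label(
    data={"global_key": "podcast-episode-001"},  # hypothetical global key
    annotations=podcast_annotations,
)
```
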
288 | 288 | "call_center_annotations = [\n", |
289 | 289 | " # Customer sentiment analysis\n", |
290 | 290 | " lb_types.AudioClassificationAnnotation.from_time_range(\n", |
291 | | - " start_sec=0.0, end_sec=30.0,\n", |
| 291 | + " start_ms=0, end_ms=30000,\n", |
292 | 292 | " name=\"customer_sentiment\",\n", |
293 | 293 | " value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"frustrated\"))\n", |
294 | 294 | " ),\n", |
295 | 295 | " \n", |
296 | 296 | " # Agent performance\n", |
297 | 297 | " lb_types.AudioClassificationAnnotation.from_time_range(\n", |
298 | | - " start_sec=30.0, end_sec=60.0,\n", |
| 298 | + " start_ms=30000, end_ms=60000,\n", |
299 | 299 | " name=\"agent_performance\",\n", |
300 | 300 | " value=lb_types.Checklist(answer=[\n", |
301 | 301 | " lb_types.ClassificationAnswer(name=\"professional_tone\"),\n", |
|
306 | 306 | " \n", |
307 | 307 | " # Key phrases extraction\n", |
308 | 308 | " lb_types.AudioObjectAnnotation.from_time_range(\n", |
309 | | - " start_sec=15.0, end_sec=18.0,\n", |
| 309 | + " start_ms=15000, end_ms=18000,\n", |
310 | 310 | " name=\"key_phrase\",\n", |
311 | 311 | " value=lb_types.TextEntity(text=\"I want to speak to your manager\")\n", |
312 | 312 | " ),\n", |
313 | 313 | " \n", |
314 | 314 | " lb_types.AudioObjectAnnotation.from_time_range(\n", |
315 | | - " start_sec=45.0, end_sec=48.0,\n", |
| 315 | + " start_ms=45000, end_ms=48000,\n", |
316 | 316 | " name=\"key_phrase\",\n", |
317 | 317 | " value=lb_types.TextEntity(text=\"Thank you for your patience\")\n", |
318 | 318 | " )\n", |
|
338 | 338 | "music_annotations = [\n", |
339 | 339 | " # Musical instruments\n", |
340 | 340 | " lb_types.AudioClassificationAnnotation.from_time_range(\n", |
341 | | - " start_sec=0.0, end_sec=30.0,\n", |
| 341 | + " start_ms=0, end_ms=30000,\n", |
342 | 342 | " name=\"instruments\",\n", |
343 | 343 | " value=lb_types.Checklist(answer=[\n", |
344 | 344 | " lb_types.ClassificationAnswer(name=\"piano\"),\n", |
|
349 | 349 | " \n", |
350 | 350 | " # Genre classification\n", |
351 | 351 | " lb_types.AudioClassificationAnnotation.from_time_range(\n", |
352 | | - " start_sec=0.0, end_sec=60.0,\n", |
| 352 | + " start_ms=0, end_ms=60000,\n", |
353 | 353 | " name=\"genre\",\n", |
354 | 354 | " value=lb_types.Radio(answer=lb_types.ClassificationAnswer(name=\"classical\"))\n", |
355 | 355 | " ),\n", |
356 | 356 | " \n", |
357 | 357 | " # Sound events\n", |
358 | 358 | " lb_types.AudioObjectAnnotation.from_time_range(\n", |
359 | | - " start_sec=25.0, end_sec=27.0,\n", |
| 359 | + " start_ms=25000, end_ms=27000,\n", |
360 | 360 | " name=\"sound_event\",\n", |
361 | 361 | " value=lb_types.TextEntity(text=\"Applause from audience\")\n", |
362 | 362 | " ),\n", |
363 | 363 | " \n", |
364 | 364 | " lb_types.AudioObjectAnnotation.from_time_range(\n", |
365 | | - " start_sec=45.0, end_sec=46.5,\n", |
| 365 | + " start_ms=45000, end_ms=46500,\n", |
366 | 366 | " name=\"sound_event\",\n", |
367 | 367 | " value=lb_types.TextEntity(text=\"Door closing in background\")\n", |
368 | 368 | " )\n", |
|
681 | 681 | "\n", |
682 | 682 | "# Audio: 1 frame = 1 millisecond\n", |
683 | 683 | "audio_annotation = lb_types.AudioClassificationAnnotation.from_time_range(\n", |
684 | | - " start_sec=2.5, end_sec=4.1,\n", |
| 684 | + " start_ms=2500, end_ms=4100,\n", |
685 | 685 | " name=\"test\", value=lb_types.Text(answer=\"test\")\n", |
686 | 686 | ")\n", |
687 | 687 | "\n", |
688 | 688 | "print(f\"Audio Annotation:\")\n", |
689 | | - "print(f\" Time: 2.5s → Frame: {audio_annotation.frame} (milliseconds)\")\n", |
| 689 | + "print(f\" Time: 2500ms → Frame: {audio_annotation.frame} (milliseconds)\")\n", |
690 | 690 | "print(f\" Frame rate: 1000 frames/second (1 frame = 1ms)\")\n", |
691 | 691 | "\n", |
692 | 692 | "print(f\"\\nVideo Annotation (for comparison):\")\n", |
|
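Given the 1 frame = 1 ms convention above, the stored frame should equal the millisecond input verbatim. A quick sanity check under that assumption:

```python
# from_time_range(start_ms=2500, ...) should store frame == 2500
assert audio_annotation.frame == 2500, audio_annotation.frame
```
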
704 | 704 | "\n", |
705 | 705 | "### 1. Time Precision\n", |
706 | 706 | "- Audio temporal annotations use millisecond precision (1 frame = 1ms)\n", |
707 | | - "- Always use the `from_time_range()` method for user-friendly second-based input\n", |
708 | | - "- Frame values are automatically calculated: `frame = int(start_sec * 1000)`\n", |
| 707 | + "- Use the `from_time_range()` method with millisecond-based input for precise timing control\n", |
| 708 | + "- Frame values are set directly: `frame = start_ms`\n", |
709 | 709 | "\n", |
710 | 710 | "### 2. Ontology Alignment\n", |
711 | 711 | "- Ensure annotation `name` fields match your ontology tool/classification names\n", |
|
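For the name alignment point above, the ontology's tools and classifications must carry the same `name` values the annotations use. A hedged sketch of a matching radio classification (option values are illustrative):

```python
import labelbox as lb

ontology_builder = lb.OntologyBuilder(
    classifications=[
        lb.Classification(
            class_type=lb.Classification.Type.RADIO,
            name="speaker_id",  # must match the annotation's `name`
            options=[lb.Option(value="host"), lb.Option(value="guest")],
        ),
    ],
)
```
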
751 | 751 | "This notebook demonstrated:\n", |
752 | 752 | "\n", |
753 | 753 | "1. **Creating temporal audio annotations** using `AudioClassificationAnnotation` and `AudioObjectAnnotation`\n", |
754 | | - "2. **Time-based API** with `from_time_range()` for user-friendly input\n", |
| 754 | + "2. **Millisecond-based API** with `from_time_range()` for precise timing control\n", |
755 | 755 | "3. **Multiple use cases**: podcasts, call centers, music analysis\n", |
756 | 756 | "4. **MAL import pipeline** for uploading temporal prelabels\n", |
757 | 757 | "5. **NDJSON serialization** compatible with existing video infrastructure\n", |
|
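The MAL upload in point 4 follows the standard prediction-import flow. A hedged sketch, assuming an authenticated client and an existing project (the API key, project ID, and `label` list are placeholders):

```python
import labelbox as lb

client = lb.Client(api_key="YOUR_API_KEY")

upload_job = lb.MALPredictionImport.create_from_objects(
    client=client,
    project_id="YOUR_PROJECT_ID",
    name="audio-temporal-mal-import",
    predictions=[label],  # Label objects built above
)
upload_job.wait_until_done()
print(upload_job.errors)  # [] on success
```
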
762 | 762 | "- **Frame-based precision** - 1ms accuracy for audio timing\n", |
763 | 763 | "- **Seamless integration** - works with existing MAL and Label Import pipelines\n", |
764 | 764 | "- **Flexible annotation types** - supports classifications and text entities with timestamps\n", |
| 765 | + "- **Direct millisecond input** - precise timing control without conversion overhead\n", |
765 | 766 | "\n", |
766 | 767 | "### Next Steps:\n", |
767 | 768 | "1. Upload your temporal audio annotations using this notebook as a template\n", |
|