Add AudioDecoder docs and tutorial (#582)

NicolasHug · web-flow · commit 93f5d4784d2a · 2025-03-21T14:59:28.000Z
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -5,4 +5,5 @@ sphinx_copybutton
 sphinx-tabs
 matplotlib
 torchvision
+ipython
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
diff --git a/docs/source/api_ref_decoders.rst b/docs/source/api_ref_decoders.rst
@@ -7,7 +7,8 @@ torchcodec.decoders
 .. currentmodule:: torchcodec.decoders
 
 
-For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
+For a video decoder tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
+For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_audio_decoding.py`.
 
 
 .. autosummary::
@@ -16,6 +17,7 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
     :template: class.rst
 
     VideoDecoder
+    AudioDecoder
 
 
 .. autosummary::
@@ -24,3 +26,4 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
     :template: dataclass.rst
 
     VideoStreamMetadata
+    AudioStreamMetadata
diff --git a/docs/source/api_ref_torchcodec.rst b/docs/source/api_ref_torchcodec.rst
@@ -14,3 +14,4 @@ torchcodec
 
     Frame
     FrameBatch
+    AudioSamples
diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst
@@ -4,7 +4,7 @@ Glossary
 .. glossary::
 
     pts
-       Presentation Time Stamp. The time at which a frame should be played.
+       Presentation Time Stamp. The time at which a frame or audio sample should be played.
        In TorchCodec, pts are expressed in seconds.
 
     best stream
diff --git a/examples/audio_decoding.py b/examples/audio_decoding.py
@@ -0,0 +1,111 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+========================================
+Decoding audio streams with AudioDecoder
+========================================
+
+In this example, we'll learn how to decode an audio file using the
+:class:`~torchcodec.decoders.AudioDecoder` class.
+"""
+
+# %%
+# First, a bit of boilerplate: we'll download an audio file from the web and
+# define an audio playing utility.  You can ignore that part and jump right
+# below to :ref:`creating_decoder_audio`.
+import requests
+from IPython.display import Audio
+
+
+def play_audio(samples):
+    return Audio(samples.data, rate=samples.sample_rate)
+
+
+# Audio source is CC0: https://opengameart.org/content/town-theme-rpg
+# Attribution: cynicmusic.com pixelsphere.org
+url = "https://opengameart.org/sites/default/files/TownTheme.mp3"
+response = requests.get(url, headers={"User-Agent": ""})
+if response.status_code != 200:
+    raise RuntimeError(f"Failed to download video. {response.status_code = }.")
+
+raw_audio_bytes = response.content
+
+# %%
+# .. _creating_decoder_audio:
+#
+# Creating a decoder
+# ------------------
+#
+# We can now create a decoder from the raw (encoded) audio bytes. You can of
+# course use a local audio file and pass the path as input. You can also decode
+# audio streams from videos!
+
+from torchcodec.decoders import AudioDecoder
+
+decoder = AudioDecoder(raw_audio_bytes)
+
+# %%
+# The audio has not yet been decoded by the decoder, but we already have access
+# to some metadata via the ``metadata`` attribute which is an
+# :class:`~torchcodec.decoders.AudioStreamMetadata` object.
+print(decoder.metadata)
+
+# %%
+# Decoding samples
+# ----------------
+#
+# To get decoded samples, we just need to call the
+# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` method,
+# which returns an :class:`~torchcodec.AudioSamples` object:
+
+samples = decoder.get_samples_played_in_range(start_seconds=0)
+
+print(samples)
+play_audio(samples)
+
+# %%
+# The ``.data`` field is a tensor of shape ``(num_channels, num_samples)`` and
+# of float dtype with values in [-1, 1].
+#
+# The ``.pts_seconds`` field indicates the starting time of the output samples.
+# Here it's 0.025 seconds, even though we asked for samples starting from 0. Not
+# all streams start exactly at 0! This is not a bug in TorchCodec, this is a
+# property of the file that was defined when it was encoded.
+#
+# We only output the *start* of the samples, not the end or the duration. Those can
+# be easily derived from the number of samples and the sample rate:
+
+duration_seconds = samples.data.shape[1] / samples.sample_rate
+print(f"Duration = {int(duration_seconds // 60)}m{int(duration_seconds % 60)}s.")
+
+# %%
+# Specifying a range
+# ------------------
+#
+# By default,
+# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range`  decodes
+# the entire audio stream, but we can specify a custom range:
+
+samples = decoder.get_samples_played_in_range(start_seconds=10, stop_seconds=70)
+
+print(samples)
+play_audio(samples)
+
+# %%
+# Custom sample rate
+# ------------------
+#
+# We can also decode the samples into a desired sample rate using the
+# ``sample_rate`` parameter of :class:`~torchcodec.decoders.AudioDecoder`. The
+# output will sound the same, but note that the number of samples greatly
+# increased:
+
+decoder = AudioDecoder(raw_audio_bytes, sample_rate=16_000)
+samples = decoder.get_samples_played_in_range(start_seconds=0)
+
+print(samples)
+play_audio(samples)
diff --git a/src/torchcodec/_frame.py b/src/torchcodec/_frame.py
@@ -120,10 +120,12 @@ def __repr__(self):
 class AudioSamples(Iterable):
     """Audio samples with associated metadata."""
 
-    # TODO-AUDIO: docs
     data: Tensor
+    """The sample data (``torch.Tensor`` of float in [-1, 1], shape is ``(num_channels, num_samples)``)."""
     pts_seconds: float
+    """The :term:`pts` of the first sample, in seconds."""
     sample_rate: int
+    """The sample rate of the samples, in Hz."""
 
     def __post_init__(self):
         # This is called after __init__() when a Frame is created. We can run
diff --git a/src/torchcodec/decoders/__init__.py b/src/torchcodec/decoders/__init__.py
@@ -4,7 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from ._core import VideoStreamMetadata
+from ._audio_decoder import AudioDecoder  # noqa
+from ._core import AudioStreamMetadata, VideoStreamMetadata
 from ._video_decoder import VideoDecoder  # noqa
 
 SimpleVideoDecoder = VideoDecoder
diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py
@@ -18,7 +18,31 @@
 
 
 class AudioDecoder:
-    """TODO-AUDIO docs"""
+    """A single-stream audio decoder.
+
+    This can be used to decode audio from pure audio files (e.g. mp3, wav,
+    etc.), or from videos that contain audio streams (e.g. mp4 videos).
+
+    Returned samples are float samples normalized in [-1, 1].
+
+    Args:
+        source (str, ``pathlib.Path``, ``torch.Tensor``, or bytes): The source of the audio:
+
+            - If ``str``: a local path or a URL to a video or audio file.
+            - If ``pathlib.Path``: a path to a local video or audio file.
+            - If ``bytes`` object or ``torch.Tensor``: the raw encoded audio data.
+        stream_index (int, optional): Specifies which stream in the file to decode samples from.
+            Note that this index is absolute across all media types. If left unspecified, then
+            the :term:`best stream` is used.
+        sample_rate (int, optional): The desired output sample rate of the decoded samples.
+            By default, the samples are returned in their original sample rate.
+
+    Attributes:
+        metadata (AudioStreamMetadata): Metadata of the audio stream.
+        stream_index (int): The stream index that this decoder is retrieving samples from. If a
+            stream index was provided at initialization, this is the same value. If it was left
+            unspecified, this is the :term:`best stream`.
+    """
 
     def __init__(
         self,
@@ -46,10 +70,23 @@ def __init__(
             sample_rate if sample_rate is not None else self.metadata.sample_rate
         )
 
+    # TODO-AUDIO: start_seconds should be 0 by default
     def get_samples_played_in_range(
         self, start_seconds: float, stop_seconds: Optional[float] = None
     ) -> AudioSamples:
-        """TODO-AUDIO docs"""
+        """Returns audio samples in the given range.
+
+        Samples are in the half open range [start_seconds, stop_seconds).
+
+        Args:
+            start_seconds (float): Time, in seconds, of the start of the
+                range.
+            stop_seconds (float, optional): Time, in seconds, of the end of the
+                range. As a half open range, the end is excluded. If unspecified, decodes to the end of the stream.
+
+        Returns:
+            AudioSamples: The samples within the specified range.
+        """
         if stop_seconds is not None and not start_seconds <= stop_seconds:
             raise ValueError(
                 f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."
diff --git a/src/torchcodec/decoders/_core/_metadata.py b/src/torchcodec/decoders/_core/_metadata.py
@@ -25,6 +25,8 @@
 # TODO-AUDIO: docs below are mostly for video streams, we should edit them and /
 # or make sure they're OK for audio streams as well. Not sure how to best handle
 # docs for such class hierarchy.
+# TODO very related, none of these common fields in this base class show up in
+# the docs right now.
 @dataclass
 class StreamMetadata:
     duration_seconds_from_header: Optional[float]
@@ -162,8 +164,11 @@ class AudioStreamMetadata(StreamMetadata):
     """Metadata of a single audio stream."""
 
     sample_rate: Optional[int]
+    """The original sample rate."""
     num_channels: Optional[int]
+    """The number of channels (1 for mono, 2 for stereo, etc.)"""
     sample_format: Optional[str]
+    """The original sample format, as described by FFmpeg. E.g. 'fltp', 's32', etc."""
 
     def __repr__(self):
         return super().__repr__()

Original file line number	Diff line number	Diff line change
`@@ -14,3 +14,4 @@ torchcodec`
`14`	`14`
`15`	`15`	`Frame`
`16`	`16`	`FrameBatch`
	`17`	`+ AudioSamples`