Skip to content

Commit 93f5d47

Browse files
authored
Add AudioDecoder docs and tutorial (#582)
1 parent 9ae0a10 commit 93f5d47

File tree

9 files changed

+167
-6
lines changed

9 files changed

+167
-6
lines changed

docs/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ sphinx_copybutton
55
sphinx-tabs
66
matplotlib
77
torchvision
8+
ipython
89
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme

docs/source/api_ref_decoders.rst

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ torchcodec.decoders
77
.. currentmodule:: torchcodec.decoders
88

99

10-
For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
10+
For a video decoder tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
11+
For an audio decoder tutorial, see: :ref:`sphx_glr_generated_examples_audio_decoding.py`.
1112

1213

1314
.. autosummary::
@@ -16,6 +17,7 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
1617
:template: class.rst
1718

1819
VideoDecoder
20+
AudioDecoder
1921

2022

2123
.. autosummary::
@@ -24,3 +26,4 @@ For a tutorial, see: :ref:`sphx_glr_generated_examples_basic_example.py`.
2426
:template: dataclass.rst
2527

2628
VideoStreamMetadata
29+
AudioStreamMetadata

docs/source/api_ref_torchcodec.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ torchcodec
1414

1515
Frame
1616
FrameBatch
17+
AudioSamples

docs/source/glossary.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Glossary
44
.. glossary::
55

66
pts
7-
Presentation Time Stamp. The time at which a frame should be played.
7+
Presentation Time Stamp. The time at which a frame or audio sample should be played.
88
In TorchCodec, pts are expressed in seconds.
99

1010
best stream

examples/audio_decoding.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""
8+
========================================
9+
Decoding audio streams with AudioDecoder
10+
========================================
11+
12+
In this example, we'll learn how to decode an audio file using the
13+
:class:`~torchcodec.decoders.AudioDecoder` class.
14+
"""
15+
16+
# %%
17+
# First, a bit of boilerplate: we'll download an audio file from the web and
18+
# define an audio playing utility. You can ignore that part and jump right
19+
# below to :ref:`creating_decoder_audio`.
20+
import requests
21+
from IPython.display import Audio
22+
23+
24+
def play_audio(samples):
25+
return Audio(samples.data, rate=samples.sample_rate)
26+
27+
28+
# Audio source is CC0: https://opengameart.org/content/town-theme-rpg
29+
# Attribution: cynicmusic.com pixelsphere.org
30+
url = "https://opengameart.org/sites/default/files/TownTheme.mp3"
31+
response = requests.get(url, headers={"User-Agent": ""})
32+
if response.status_code != 200:
33+
raise RuntimeError(f"Failed to download video. {response.status_code = }.")
34+
35+
raw_audio_bytes = response.content
36+
37+
# %%
38+
# .. _creating_decoder_audio:
39+
#
40+
# Creating a decoder
41+
# ------------------
42+
#
43+
# We can now create a decoder from the raw (encoded) audio bytes. You can of
44+
# course use a local audio file and pass the path as input. You can also decode
45+
# audio streams from videos!
46+
47+
from torchcodec.decoders import AudioDecoder
48+
49+
decoder = AudioDecoder(raw_audio_bytes)
50+
51+
# %%
52+
# The audio has not yet been decoded by the decoder, but we already have access to
53+
# some metadata via the ``metadata`` attribute which is an
54+
# :class:`~torchcodec.decoders.AudioStreamMetadata` object.
55+
print(decoder.metadata)
56+
57+
# %%
58+
# Decoding samples
59+
# ----------------
60+
#
61+
# To get decoded samples, we just need to call the
62+
# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` method,
63+
# which returns an :class:`~torchcodec.AudioSamples` object:
64+
65+
samples = decoder.get_samples_played_in_range(start_seconds=0)
66+
67+
print(samples)
68+
play_audio(samples)
69+
70+
# %%
71+
# The ``.data`` field is a tensor of shape ``(num_channels, num_samples)`` and
72+
# of float dtype with values in [-1, 1].
73+
#
74+
# The ``.pts_seconds`` field indicates the starting time of the output samples.
75+
# Here it's 0.025 seconds, even though we asked for samples starting from 0. Not
76+
# all streams start exactly at 0! This is not a bug in TorchCodec, this is a
77+
# property of the file that was defined when it was encoded.
78+
#
79+
# We only output the *start* of the samples, not the end or the duration. Those can
80+
# be easily derived from the number of samples and the sample rate:
81+
82+
duration_seconds = samples.data.shape[1] / samples.sample_rate
83+
print(f"Duration = {int(duration_seconds // 60)}m{int(duration_seconds % 60)}s.")
84+
85+
# %%
86+
# Specifying a range
87+
# ------------------
88+
#
89+
# By default,
90+
# :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range` decodes
91+
# the entire audio stream, but we can specify a custom range:
92+
93+
samples = decoder.get_samples_played_in_range(start_seconds=10, stop_seconds=70)
94+
95+
print(samples)
96+
play_audio(samples)
97+
98+
# %%
99+
# Custom sample rate
100+
# ------------------
101+
#
102+
# We can also decode the samples into a desired sample rate using the
103+
# ``sample_rate`` parameter of :class:`~torchcodec.decoders.AudioDecoder`. The
104+
# output will sound the same, but note that the number of samples greatly
105+
# changed:
106+
107+
decoder = AudioDecoder(raw_audio_bytes, sample_rate=16_000)
108+
samples = decoder.get_samples_played_in_range(start_seconds=0)
109+
110+
print(samples)
111+
play_audio(samples)

src/torchcodec/_frame.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,10 +120,12 @@ def __repr__(self):
120120
class AudioSamples(Iterable):
121121
"""Audio samples with associated metadata."""
122122

123-
# TODO-AUDIO: docs
124123
data: Tensor
124+
"""The sample data (``torch.Tensor`` of float in [-1, 1], shape is ``(num_channels, num_samples)``)."""
125125
pts_seconds: float
126+
"""The :term:`pts` of the first sample, in seconds."""
126127
sample_rate: int
128+
"""The sample rate of the samples, in Hz."""
127129

128130
def __post_init__(self):
129131
# This is called after __init__() when a Frame is created. We can run

src/torchcodec/decoders/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7-
from ._core import VideoStreamMetadata
7+
from ._audio_decoder import AudioDecoder # noqa
8+
from ._core import AudioStreamMetadata, VideoStreamMetadata
89
from ._video_decoder import VideoDecoder # noqa
910

1011
SimpleVideoDecoder = VideoDecoder

src/torchcodec/decoders/_audio_decoder.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,31 @@
1818

1919

2020
class AudioDecoder:
21-
"""TODO-AUDIO docs"""
21+
"""A single-stream audio decoder.
22+
23+
This can be used to decode audio from pure audio files (e.g. mp3, wav,
24+
etc.), or from videos that contain audio streams (e.g. mp4 videos).
25+
26+
Returned samples are float samples normalized in [-1, 1].
27+
28+
Args:
29+
source (str, ``pathlib.Path``, ``torch.Tensor``, or bytes): The source of the audio:
30+
31+
- If ``str``: a local path or a URL to a video or audio file.
32+
- If ``pathlib.Path``: a path to a local video or audio file.
33+
- If ``bytes`` object or ``torch.Tensor``: the raw encoded audio data.
34+
stream_index (int, optional): Specifies which stream in the file to decode samples from.
35+
Note that this index is absolute across all media types. If left unspecified, then
36+
the :term:`best stream` is used.
37+
sample_rate (int, optional): The desired output sample rate of the decoded samples.
38+
By default, the samples are returned in their original sample rate.
39+
40+
Attributes:
41+
metadata (AudioStreamMetadata): Metadata of the audio stream.
42+
stream_index (int): The stream index that this decoder is retrieving samples from. If a
43+
stream index was provided at initialization, this is the same value. If it was left
44+
unspecified, this is the :term:`best stream`.
45+
"""
2246

2347
def __init__(
2448
self,
@@ -46,10 +70,23 @@ def __init__(
4670
sample_rate if sample_rate is not None else self.metadata.sample_rate
4771
)
4872

73+
# TODO-AUDIO: start_seconds should be 0 by default
4974
def get_samples_played_in_range(
5075
self, start_seconds: float, stop_seconds: Optional[float] = None
5176
) -> AudioSamples:
52-
"""TODO-AUDIO docs"""
77+
"""Returns audio samples in the given range.
78+
79+
Samples are in the half open range [start_seconds, stop_seconds).
80+
81+
Args:
82+
start_seconds (float): Time, in seconds, of the start of the
83+
range.
84+
stop_seconds (float, optional): Time, in seconds, of the end of the
85+
range. As a half open range, the end is excluded.
86+
87+
Returns:
88+
AudioSamples: The samples within the specified range.
89+
"""
5390
if stop_seconds is not None and not start_seconds <= stop_seconds:
5491
raise ValueError(
5592
f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."

src/torchcodec/decoders/_core/_metadata.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
# TODO-AUDIO: docs below are mostly for video streams, we should edit them and /
2626
# or make sure they're OK for audio streams as well. Not sure how to best handle
2727
# docs for such class hierarchy.
28+
# TODO very related, none of these common fields in this base class show up in
29+
# the docs right now.
2830
@dataclass
2931
class StreamMetadata:
3032
duration_seconds_from_header: Optional[float]
@@ -162,8 +164,11 @@ class AudioStreamMetadata(StreamMetadata):
162164
"""Metadata of a single audio stream."""
163165

164166
sample_rate: Optional[int]
167+
"""The original sample rate."""
165168
num_channels: Optional[int]
169+
"""The number of channels (1 for mono, 2 for stereo, etc.)"""
166170
sample_format: Optional[str]
171+
"""The original sample format, as described by FFmpeg. E.g. 'fltp', 's32', etc."""
167172

168173
def __repr__(self):
169174
return super().__repr__()

0 commit comments

Comments
 (0)