|
40 | 40 |
|
41 | 41 |
|
42 | 42 | ###################################################################### |
43 | | -# Applying effects and filtering |
| 43 | +# Loading the data |
44 | 44 | # ------------------------------ |
45 | 45 | # |
46 | | -# :py:class:`torchaudio.io.AudioEffector` allows for directly applying |
47 | | -# filters and codecs to Tensor objects, in a similar way to the ``ffmpeg`` |
48 | | -# command. |
49 | | -# |
50 | | -# `AudioEffector Usages <./effector_tutorial.html>`__ explains how to use |
51 | | -# this class, so for details, please refer to that tutorial. |
52 | | -# |
53 | 46 |
|
54 | | -# Load the data |
55 | 47 | waveform1, sample_rate = torchaudio.load(SAMPLE_WAV, channels_first=False) |
56 | 48 |
|
57 | | -# Define effects |
58 | | -effect = ",".join( |
59 | | - [ |
60 | | - "lowpass=frequency=300:poles=1", # apply single-pole lowpass filter |
61 | | - "atempo=0.8", # reduce the speed |
62 | | - "aecho=in_gain=0.8:out_gain=0.9:delays=200:decays=0.3|delays=400:decays=0.3" |
63 | | - # Applying echo gives some dramatic feeling |
64 | | - ], |
65 | | -) |
66 | | - |
67 | | - |
68 | | -# Apply effects |
69 | | -def apply_effect(waveform, sample_rate, effect): |
70 | | - effector = torchaudio.io.AudioEffector(effect=effect) |
71 | | - return effector.apply(waveform, sample_rate) |
72 | | - |
73 | | - |
74 | | -waveform2 = apply_effect(waveform1, sample_rate, effect) |
75 | | - |
76 | 49 | print(waveform1.shape, sample_rate) |
77 | | -print(waveform2.shape, sample_rate) |
78 | 50 |
|
79 | 51 | ###################################################################### |
80 | | -# Note that the number of frames and number of channels are different from |
81 | | -# those of the original after the effects are applied. Let’s listen to the |
82 | | -# audio. |
| 52 | +# Let’s listen to the audio. |
83 | 53 | # |
84 | 54 |
|
85 | 55 |
|
@@ -124,24 +94,11 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): |
124 | 94 |
|
125 | 95 |
|
126 | 96 | ###################################################################### |
127 | | -# Original |
128 | | -# ~~~~~~~~ |
129 | | -# |
130 | 97 |
|
131 | 98 | plot_waveform(waveform1.T, sample_rate, title="Original", xlim=(-0.1, 3.2)) |
132 | 99 | plot_specgram(waveform1.T, sample_rate, title="Original", xlim=(0, 3.04)) |
133 | 100 | Audio(waveform1.T, rate=sample_rate) |
134 | 101 |
|
135 | | -###################################################################### |
136 | | -# Effects applied |
137 | | -# ~~~~~~~~~~~~~~~ |
138 | | -# |
139 | | - |
140 | | -plot_waveform(waveform2.T, sample_rate, title="Effects Applied", xlim=(-0.1, 3.2)) |
141 | | -plot_specgram(waveform2.T, sample_rate, title="Effects Applied", xlim=(0, 3.04)) |
142 | | -Audio(waveform2.T, rate=sample_rate) |
143 | | - |
144 | | - |
145 | 102 | ###################################################################### |
146 | 103 | # Simulating room reverberation |
147 | 104 | # ----------------------------- |
@@ -265,143 +222,3 @@ def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): |
265 | 222 | plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") |
266 | 223 | plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") |
267 | 224 | Audio(noisy_speech, rate=sample_rate) |
268 | | - |
269 | | - |
270 | | -###################################################################### |
271 | | -# Applying codec to Tensor object |
272 | | -# ------------------------------- |
273 | | -# |
274 | | -# :py:class:`torchaudio.io.AudioEffector` can also apply codecs to |
275 | | -# a Tensor object. |
276 | | -# |
277 | | - |
278 | | -waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH, channels_first=False) |
279 | | - |
280 | | - |
281 | | -def apply_codec(waveform, sample_rate, format, encoder=None): |
282 | | - encoder = torchaudio.io.AudioEffector(format=format, encoder=encoder) |
283 | | - return encoder.apply(waveform, sample_rate) |
284 | | - |
285 | | - |
286 | | -###################################################################### |
287 | | -# Original |
288 | | -# ~~~~~~~~ |
289 | | -# |
290 | | - |
291 | | -plot_waveform(waveform.T, sample_rate, title="Original") |
292 | | -plot_specgram(waveform.T, sample_rate, title="Original") |
293 | | -Audio(waveform.T, rate=sample_rate) |
294 | | - |
295 | | -###################################################################### |
296 | | -# 8 bit mu-law |
297 | | -# ~~~~~~~~~~~~ |
298 | | -# |
299 | | - |
300 | | -mulaw = apply_codec(waveform, sample_rate, "wav", encoder="pcm_mulaw") |
301 | | -plot_waveform(mulaw.T, sample_rate, title="8 bit mu-law") |
302 | | -plot_specgram(mulaw.T, sample_rate, title="8 bit mu-law") |
303 | | -Audio(mulaw.T, rate=sample_rate) |
304 | | - |
305 | | -###################################################################### |
306 | | -# G.722 |
307 | | -# ~~~~~ |
308 | | -# |
309 | | - |
310 | | -g722 = apply_codec(waveform, sample_rate, "g722") |
311 | | -plot_waveform(g722.T, sample_rate, title="G.722") |
312 | | -plot_specgram(g722.T, sample_rate, title="G.722") |
313 | | -Audio(g722.T, rate=sample_rate) |
314 | | - |
315 | | -###################################################################### |
316 | | -# Vorbis |
317 | | -# ~~~~~~ |
318 | | -# |
319 | | - |
320 | | -vorbis = apply_codec(waveform, sample_rate, "ogg", encoder="vorbis") |
321 | | -plot_waveform(vorbis.T, sample_rate, title="Vorbis") |
322 | | -plot_specgram(vorbis.T, sample_rate, title="Vorbis") |
323 | | -Audio(vorbis.T, rate=sample_rate) |
324 | | - |
325 | | -###################################################################### |
326 | | -# Simulating a phone recording |
327 | | -# ---------------------------- |
328 | | -# |
329 | | -# Combining the previous techniques, we can simulate audio that sounds |
330 | | -# like a person talking over a phone in an echoey room with people talking |
331 | | -# in the background. |
332 | | -# |
333 | | - |
334 | | -sample_rate = 16000 |
335 | | -original_speech, sample_rate = torchaudio.load(SAMPLE_SPEECH) |
336 | | - |
337 | | -plot_specgram(original_speech, sample_rate, title="Original") |
338 | | - |
339 | | -# Apply RIR |
340 | | -rir_applied = F.fftconvolve(speech, rir) |
341 | | - |
342 | | -plot_specgram(rir_applied, sample_rate, title="RIR Applied") |
343 | | - |
344 | | -# Add background noise |
345 | | -# Because the noise is recorded in the actual environment, we consider that |
346 | | -# the noise contains the acoustic feature of the environment. Therefore, we add |
347 | | -# the noise after RIR application. |
348 | | -noise, _ = torchaudio.load(SAMPLE_NOISE) |
349 | | -noise = noise[:, : rir_applied.shape[1]] |
350 | | - |
351 | | -snr_db = torch.tensor([8]) |
352 | | -bg_added = F.add_noise(rir_applied, noise, snr_db) |
353 | | - |
354 | | -plot_specgram(bg_added, sample_rate, title="BG noise added") |
355 | | - |
356 | | -# Apply filtering and change sample rate |
357 | | -effect = ",".join( |
358 | | - [ |
359 | | - "lowpass=frequency=4000:poles=1", |
360 | | - "compand=attacks=0.02:decays=0.05:points=-60/-60|-30/-10|-20/-8|-5/-8|-2/-8:gain=-8:volume=-7:delay=0.05", |
361 | | - ] |
362 | | -) |
363 | | - |
364 | | -filtered = apply_effect(bg_added.T, sample_rate, effect) |
365 | | -sample_rate2 = 8000 |
366 | | - |
367 | | -plot_specgram(filtered.T, sample_rate2, title="Filtered") |
368 | | - |
369 | | -# Apply telephony codec |
370 | | -codec_applied = apply_codec(filtered, sample_rate2, "g722") |
371 | | -plot_specgram(codec_applied.T, sample_rate2, title="G.722 Codec Applied") |
372 | | - |
373 | | - |
374 | | -###################################################################### |
375 | | -# Original speech |
376 | | -# ~~~~~~~~~~~~~~~ |
377 | | -# |
378 | | - |
379 | | -Audio(original_speech, rate=sample_rate) |
380 | | - |
381 | | -###################################################################### |
382 | | -# RIR applied |
383 | | -# ~~~~~~~~~~~ |
384 | | -# |
385 | | - |
386 | | -Audio(rir_applied, rate=sample_rate) |
387 | | - |
388 | | -###################################################################### |
389 | | -# Background noise added |
390 | | -# ~~~~~~~~~~~~~~~~~~~~~~ |
391 | | -# |
392 | | - |
393 | | -Audio(bg_added, rate=sample_rate) |
394 | | - |
395 | | -###################################################################### |
396 | | -# Filtered |
397 | | -# ~~~~~~~~ |
398 | | -# |
399 | | - |
400 | | -Audio(filtered.T, rate=sample_rate2) |
401 | | - |
402 | | -###################################################################### |
403 | | -# Codec applied |
404 | | -# ~~~~~~~~~~~~~ |
405 | | -# |
406 | | - |
407 | | -Audio(codec_applied.T, rate=sample_rate2) |
0 commit comments