1+ import numpy as np
2+ import librosa
3+ import librosa .display
4+ import noisereduce as nr
5+
class AudioPreprocessor:
    """
    Collection of static helpers that turn an audio signal into windowed frames.

    ``load_preprocessed_frames`` chains the helpers in this order:
    noise reduction -> silence removal -> framing -> windowing.
    """

    @staticmethod
    def int_to_float(array, type=np.float32):
        """
        Convert a non-floating array into a peak-normalised floating array.

        Parameters
        ----------
        array: np.array
            Input samples. Arrays that already have dtype ``type`` or one of
            float16 / float32 / float64 are returned unchanged.
        type: np.float32
            Floating dtype of the converted result.

        Returns
        -------
        result : np.array
            The samples divided by their largest absolute value, so the peak
            is +-1. All-zero and empty inputs are only cast to ``type``.
        """
        if array.dtype == type:
            return array

        if array.dtype in (np.float16, np.float32, np.float64):
            return array

        # Cast first: np.abs on the most negative integer (e.g. int16 -32768)
        # overflows when evaluated in integer arithmetic and would yield a
        # wrong peak value.
        result = array.astype(type)
        if result.size == 0:
            return result

        peak = np.max(np.abs(result))
        if peak == 0:
            return result

        return result / peak

    @staticmethod
    def float_to_int(array, type=np.int16, divide_max_abs=True):
        """
        Convert a floating array into an integer array scaled to the integer range.

        Parameters
        ----------
        array: np.array
            Input samples. Arrays that already have dtype ``type`` or one of
            int16 / int32 / int64 are returned unchanged. The input is never
            modified.
        type: np.int16
            Integer dtype of the converted result.
        divide_max_abs: bool
            If True, the samples are divided by their largest absolute value
            before scaling (skipped when that value is 0). If False, the
            samples are scaled as they are.

        Returns
        -------
        result : np.array
            ``array * np.iinfo(type).max`` (after the optional normalisation),
            saturated to the range of ``type`` and cast to ``type``.
        """
        if array.dtype == type:
            return array

        if array.dtype in (np.int16, np.int32, np.int64):
            return array

        if array.size == 0:
            return array.astype(type)

        limits = np.iinfo(type)
        scaled = array

        if divide_max_abs:
            peak = np.max(np.abs(array))
            # An all-zero signal cannot be normalised; it is scaled as-is.
            if peak != 0:
                scaled = array / peak

        scaled = scaled * limits.max

        # Saturate instead of letting out-of-range values wrap around in the
        # integer cast (possible when divide_max_abs is False).
        return np.clip(scaled, limits.min, limits.max).astype(type)

    @staticmethod
    def remove_silence(y, threshold=0.005, pause_length=200, keep_at_start_and_end=50):
        """
        Cut long quiet stretches out of a one-dimensional signal.

        A sample is quiet when its absolute value is below ``threshold``. For
        every run of more than ``pause_length`` consecutive quiet samples, the
        inner part of the run is removed and ``keep_at_start_and_end`` samples
        are kept on each side of it.

        Parameters
        ----------
        y: np.array
            One-dimensional signal.
        threshold: float
            Amplitude below which a sample counts as quiet.
        pause_length: int
            Run length, in samples, that must be exceeded before a run is cut.
        keep_at_start_and_end: int
            Number of quiet samples kept at each end of a cut run.

        Returns
        -------
        result : np.array
            New array with the selected samples removed.

        Raises
        ------
        ValueError
            If ``pause_length`` or ``keep_at_start_and_end`` is negative.
        """
        if pause_length < 0 or keep_at_start_and_end < 0:
            raise ValueError("pause_length and keep_at_start_and_end must be non-negative.")

        samples = np.asarray(y)

        # Two comparisons instead of np.abs, so the most negative integer
        # value cannot overflow into a "quiet" sample.
        quiet = (samples < threshold) & (samples > -threshold)

        # Pad with False on both sides so every quiet run has a start and a
        # stop edge; edges then alternate start, stop, start, stop, ...
        padded = np.concatenate(([False], quiet, [False]))
        edges = np.flatnonzero(padded[1:] != padded[:-1])

        keep = np.ones(samples.shape[0], dtype=bool)

        for start, stop in zip(edges[::2], edges[1::2]):
            if stop - start > pause_length:
                first = start + keep_at_start_and_end
                last = stop - keep_at_start_and_end
                # Runs shorter than both kept margins have nothing to remove.
                if first < last:
                    keep[first:last] = False

        return samples[keep]

    @staticmethod
    def remove_noise(y, sr, prop_decrease=0.8):
        """
        Reduce noise in a signal with ``noisereduce.reduce_noise``.

        Parameters
        ----------
        y: np.array
            Signal passed to ``reduce_noise`` as ``y``.
        sr: int
            Sampling rate passed to ``reduce_noise`` as ``sr``.
        prop_decrease: float
            Passed to ``reduce_noise`` as ``prop_decrease``. Per the original
            author's note, 0.8 gives better sound quality than 1.0.

        Returns
        -------
        result : np.array
            Whatever ``reduce_noise`` returns.
        """
        return nr.reduce_noise(y=y, sr=sr, prop_decrease=prop_decrease)

    @staticmethod
    def create_frames(y, frame_size, overlap):
        """
        Split a signal into equally sized, optionally overlapping frames.

        Parameters
        ----------
        y: np.array
            Signal to split along its first axis.
        frame_size: int
            Number of samples per frame.
        overlap: int
            Number of samples shared by two consecutive frames.

        Returns
        -------
        result : list of np.array
            Slices of ``y`` with exactly ``frame_size`` samples each. Trailing
            samples that do not fill a whole frame are dropped. The list is
            empty when ``frame_size <= 0``, ``overlap < 0`` or
            ``overlap >= frame_size``.
        """
        if overlap >= frame_size or frame_size <= 0 or overlap < 0:
            return []

        hop = frame_size - overlap
        # "+ 1" keeps the last frame when it ends exactly on the final sample.
        last_start = y.shape[0] - frame_size + 1

        return [y[start:start + frame_size] for start in range(0, last_start, hop)]

    @staticmethod
    def window_frames(frames, window_function=np.hanning):
        """
        Multiply every frame by a window of matching length.

        Parameters
        ----------
        frames: list of np.array
            Frames to window.
        window_function: callable
            Called with the frame length; its result is multiplied with the frame.

        Returns
        -------
        result : list of np.array
        """
        return [frame * window_function(frame.shape[0]) for frame in frames]

    @staticmethod
    def load_preprocessed_frames(filepath=None, y=None, sr=None, frame_size=1000, overlap=100):
        """
        Produce denoised, silence-trimmed, windowed frames from a file or a signal.

        Parameters
        ----------
        filepath: str
            Audio file loaded with ``librosa.load`` when ``y`` or ``sr`` is missing.
        y: np.array
            Signal to use instead of loading a file (requires ``sr``).
        sr: int
            Sampling rate belonging to ``y``.
        frame_size: int
            Frame length in samples passed to ``create_frames``.
        overlap: int
            Frame overlap in samples passed to ``create_frames``.

        Returns
        -------
        result : list of np.array

        Raises
        ------
        ValueError
            If neither ``filepath`` nor both ``y`` and ``sr`` are given.
        """
        signal_missing = y is None or sr is None

        if filepath is None and signal_missing:
            raise ValueError("Either filepath or y and sr must be given.")

        if signal_missing:
            y, sr = librosa.load(filepath)

        y = AudioPreprocessor.remove_noise(y=y, sr=sr)
        y = AudioPreprocessor.remove_silence(y=y)

        frames = AudioPreprocessor.create_frames(y=y, frame_size=frame_size, overlap=overlap)

        return AudioPreprocessor.window_frames(frames=frames)
131+
def main():
    """Preprocess ``./audio.wav`` and print the resulting windowed frames."""
    print(AudioPreprocessor.load_preprocessed_frames("./audio.wav"))
135+
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
0 commit comments