"""Extract MFCC and spectrogram features from audio files"""
from h2oaicore.transformer_utils import CustomTransformer
import datatable as dt
import numpy as np


class AudioMFCCTransformer(CustomTransformer):
    _unsupervised = True
    _modules_needed_by_name = ['librosa==0.8.1']
    _parallel_task = True  # if enabled, params_base['n_jobs'] will be >= 1 (adaptive to system), otherwise 1
    _can_use_gpu = True  # if enabled, will use special job scheduler for GPUs
    _can_use_multi_gpu = True  # if enabled, can get access to multiple GPUs for single transformer (experimental)
    _numeric_output = True

    @staticmethod
    def is_enabled():
        return False  # disabled by default; set to True to expose the transformer in experiments

    @staticmethod
    def get_default_properties():
        return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1)
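
    # The single accepted text column is expected to hold paths to audio files
    # on disk: get_mfcc below passes each cell value directly to librosa.load.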
    @staticmethod
    def do_acceptance_test():
        return False

    def fit_transform(self, X: dt.Frame, y: np.ndarray = None):
        # Stateless transformer: nothing is learned from y, so delegate to transform()
        return self.transform(X)
    # Return MFCC and spectrogram features for a single audio file
    def get_mfcc(self, file_path):
        import librosa
        sampling_rate = 16000
        duration = 1  # in secs
        hop_length = 347 * duration  # stride between STFT frames, in samples
        fmin = 20  # min freq
        fmax = sampling_rate // 2  # max freq
        n_mels = 128  # number of mel bands
        n_fft = n_mels * 20  # fft window size
        padmode = 'constant'
        samples = sampling_rate * duration  # number of samples per clip
        n_mfcc = 13  # number of mel-frequency cepstral coefficients (MFCCs) to use
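
        # With these settings each clip yields 1 + samples // hop_length
        # = 1 + 16000 // 347 = 47 spectrogram frames and
        # n_mels + 2 * n_mfcc = 128 + 2 * 13 = 154 feature rows, so the
        # flattened output has 154 * 47 = 7238 values -- the same length
        # as the zero vector returned on failure below.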
        try:
            audio, sr = librosa.load(file_path, sr=sampling_rate)
            # Trim leading/trailing silence
            if len(audio) > 0:
                audio, _ = librosa.effects.trim(audio)
            # Truncate if audio is longer than `samples`
            if len(audio) > samples:
                audio = audio[:samples]
            # Else pad blanks on both sides if shorter
            else:
                padding = samples - len(audio)
                offset = padding // 2
                audio = np.pad(audio, (offset, samples - len(audio) - offset), padmode)
            # Get mel spectrogram of audio
            spectrogram = librosa.feature.melspectrogram(y=audio,
                                                         sr=sampling_rate,
                                                         n_mels=n_mels,
                                                         hop_length=hop_length,
                                                         n_fft=n_fft,
                                                         fmin=fmin,
                                                         fmax=fmax)
            # Convert to log scale (dB)
            spectrogram = librosa.power_to_db(spectrogram)
            # Get MFCCs and their second derivatives
            mfcc = librosa.feature.mfcc(S=spectrogram, n_mfcc=n_mfcc)
            delta2_mfcc = librosa.feature.delta(mfcc, order=2)
            # Stack MFCCs and derivatives under the spectrogram and flatten
            features = np.concatenate((spectrogram, mfcc, delta2_mfcc), axis=0)
            return features.ravel()
        except Exception:
            # On any failure (e.g. unreadable or missing file), return an
            # all-zero vector of the same length: (128 + 2 * 13) * 47 = 7238
            return np.zeros((n_mels + 2 * n_mfcc) * 47, dtype=np.float32)

    def transform(self, X: dt.Frame):
        import pandas as pd
        # Extract one fixed-length feature vector per audio file path
        mels = X.to_pandas().iloc[:, 0].apply(self.get_mfcc)
        rows = len(mels)
        cols = len(mels.iloc[0])
        col_names = ['X_' + str(i) for i in range(cols)]
        output_df = pd.DataFrame(data=np.reshape(np.concatenate(mels.values), (rows, cols)),
                                 columns=col_names)
        return output_df
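

# A minimal local smoke test of the feature extraction step, not part of the
# Driverless AI recipe API. This is a sketch only: 'example.wav' is a
# hypothetical path, it assumes librosa and the h2oaicore environment are
# importable, and get_mfcc is called unbound (it never touches self).
if __name__ == '__main__':
    features = AudioMFCCTransformer.get_mfcc(None, 'example.wav')
    print(features.shape)  # expected: (7238,) == ((128 + 2 * 13) * 47,)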