Skip to content

Commit fb8e6c5

Browse files
authored
[audio utils] fix fft_bin_width computation (#36603)
* fix fft_bin_width computation * update docstring + enforce correct params * update test with correct value * update test * update feature extractors for concerned models * update * make * update docstring * update docstring
1 parent e97c760 commit fb8e6c5

File tree

5 files changed

+45
-31
lines changed

5 files changed

+45
-31
lines changed

src/transformers/audio_utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ def mel_filter_bank(
293293
294294
Args:
295295
num_frequency_bins (`int`):
296-
Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
296+
Number of frequency bins (should be the same as `n_fft // 2 + 1` where `n_fft` is the size of the Fourier Transform used to compute the spectrogram).
297297
num_mel_filters (`int`):
298298
Number of mel filters to generate.
299299
min_frequency (`float`):
@@ -317,6 +317,12 @@ def mel_filter_bank(
317317
if norm is not None and norm != "slaney":
318318
raise ValueError('norm must be one of None or "slaney"')
319319

320+
if num_frequency_bins < 2:
321+
raise ValueError(f"Require num_frequency_bins: {num_frequency_bins} >= 2")
322+
323+
if min_frequency > max_frequency:
324+
raise ValueError(f"Require min_frequency: {min_frequency} <= max_frequency: {max_frequency}")
325+
320326
# center points of the triangular mel filters
321327
mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale)
322328
mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale)
@@ -325,7 +331,7 @@ def mel_filter_bank(
325331

326332
if triangularize_in_mel_space:
327333
# frequencies of FFT bins in Hz, but filters triangularized in mel space
328-
fft_bin_width = sampling_rate / (num_frequency_bins * 2)
334+
fft_bin_width = sampling_rate / ((num_frequency_bins - 1) * 2)
329335
fft_freqs = hertz_to_mel(fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale)
330336
filter_freqs = mel_freqs
331337
else:

src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def __init__(
9191

9292
if not is_speech_available():
9393
mel_filters = mel_filter_bank(
94-
num_frequency_bins=256,
94+
num_frequency_bins=257,
9595
num_mel_filters=self.num_mel_bins,
9696
min_frequency=20,
9797
max_frequency=sampling_rate // 2,
@@ -101,7 +101,7 @@ def __init__(
101101
triangularize_in_mel_space=True,
102102
)
103103

104-
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
104+
self.mel_filters = mel_filters
105105
self.window = window_function(400, "hann", periodic=False)
106106

107107
def _extract_fbank_features(

src/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def __init__(
7474
self.stride = stride
7575

7676
mel_filters = mel_filter_bank(
77-
num_frequency_bins=256,
77+
num_frequency_bins=257,
7878
num_mel_filters=self.num_mel_bins,
7979
min_frequency=20,
8080
max_frequency=sampling_rate // 2,
@@ -84,7 +84,7 @@ def __init__(
8484
triangularize_in_mel_space=True,
8585
)
8686

87-
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
87+
self.mel_filters = mel_filters
8888
self.window = window_function(400, "povey", periodic=False)
8989

9090
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)

src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def __init__(
9191

9292
if not is_speech_available():
9393
mel_filters = mel_filter_bank(
94-
num_frequency_bins=256,
94+
num_frequency_bins=257,
9595
num_mel_filters=self.num_mel_bins,
9696
min_frequency=20,
9797
max_frequency=sampling_rate // 2,
@@ -101,7 +101,7 @@ def __init__(
101101
triangularize_in_mel_space=True,
102102
)
103103

104-
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
104+
self.mel_filters = mel_filters
105105
self.window = window_function(400, "povey", periodic=False)
106106

107107
def _extract_fbank_features(

tests/utils/test_audio_utils.py

Lines changed: 31 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -194,26 +194,38 @@ def test_mel_filter_bank_kaldi(self):
194194
triangularize_in_mel_space=True,
195195
)
196196
# fmt: off
197+
# these are the expected values from torchaudio.compliance.kaldi.get_mel_banks
198+
# note that we compute values in float64 while they do it in float32
197199
expected = np.array(
198-
[[0.0000, 0.0000, 0.0000, 0.0000],
199-
[0.6086, 0.0000, 0.0000, 0.0000],
200-
[0.8689, 0.1311, 0.0000, 0.0000],
201-
[0.4110, 0.5890, 0.0000, 0.0000],
202-
[0.0036, 0.9964, 0.0000, 0.0000],
203-
[0.0000, 0.6366, 0.3634, 0.0000],
204-
[0.0000, 0.3027, 0.6973, 0.0000],
205-
[0.0000, 0.0000, 0.9964, 0.0036],
206-
[0.0000, 0.0000, 0.7135, 0.2865],
207-
[0.0000, 0.0000, 0.4507, 0.5493],
208-
[0.0000, 0.0000, 0.2053, 0.7947],
209-
[0.0000, 0.0000, 0.0000, 0.9752],
210-
[0.0000, 0.0000, 0.0000, 0.7585],
211-
[0.0000, 0.0000, 0.0000, 0.5539],
212-
[0.0000, 0.0000, 0.0000, 0.3599],
213-
[0.0000, 0.0000, 0.0000, 0.1756]]
200+
[
201+
[0.0000000000000000, 0.0000000000000000, 0.0000000000000000, 0.0000000000000000],
202+
[0.6457883715629578, 0.0000000000000000, 0.0000000000000000, 0.0000000000000000],
203+
[0.8044781088829041, 0.1955219060182571, 0.0000000000000000, 0.0000000000000000],
204+
[0.3258901536464691, 0.6741098165512085, 0.0000000000000000, 0.0000000000000000],
205+
[0.0000000000000000, 0.9021250009536743, 0.0978749766945839, 0.0000000000000000],
206+
[0.0000000000000000, 0.5219038724899292, 0.4780961275100708, 0.0000000000000000],
207+
[0.0000000000000000, 0.1771058291196823, 0.8228941559791565, 0.0000000000000000],
208+
[0.0000000000000000, 0.0000000000000000, 0.8616894483566284, 0.1383105516433716],
209+
[0.0000000000000000, 0.0000000000000000, 0.5710380673408508, 0.4289619624614716],
210+
[0.0000000000000000, 0.0000000000000000, 0.3015440106391907, 0.6984559893608093],
211+
[0.0000000000000000, 0.0000000000000000, 0.0503356307744980, 0.9496643543243408],
212+
[0.0000000000000000, 0.0000000000000000, 0.0000000000000000, 0.8150880336761475],
213+
[0.0000000000000000, 0.0000000000000000, 0.0000000000000000, 0.5938932299613953],
214+
[0.0000000000000000, 0.0000000000000000, 0.0000000000000000, 0.3851676583290100],
215+
[0.0000000000000000, 0.0000000000000000, 0.0000000000000000, 0.1875794380903244],
216+
],
217+
dtype=np.float64,
214218
)
215219
# fmt: on
216-
self.assertTrue(np.allclose(mel_filters, expected, atol=5e-5))
220+
221+
# the kaldi implementation does not compute values for the last fft bin
222+
# indeed, they enforce max_frequency <= sampling_rate / 2 and
223+
# therefore they know that last fft bin filter bank values will be all 0
224+
# and pad after with zeros
225+
# to comply with our API for `mel_filter_bank`, we need to also pad here
226+
expected = np.pad(expected, ((0, 1), (0, 0)))
227+
228+
self.assertTrue(np.allclose(mel_filters, expected))
217229

218230
def test_mel_filter_bank_slaney_norm(self):
219231
mel_filters = mel_filter_bank(
@@ -369,7 +381,7 @@ def test_spectrogram_integration_test(self):
369381
self.assertTrue(np.allclose(spec[:64, 400], expected))
370382

371383
mel_filters = mel_filter_bank(
372-
num_frequency_bins=256,
384+
num_frequency_bins=257,
373385
num_mel_filters=400,
374386
min_frequency=20,
375387
max_frequency=8000,
@@ -379,8 +391,6 @@ def test_spectrogram_integration_test(self):
379391
triangularize_in_mel_space=True,
380392
)
381393

382-
mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
383-
384394
spec = spectrogram(
385395
waveform,
386396
window_function(400, "povey", periodic=False),
@@ -510,7 +520,7 @@ def test_spectrogram_batch_integration_test(self):
510520
self.assertTrue(np.allclose(spec_list[2][:64, 400], expected3))
511521

512522
mel_filters = mel_filter_bank(
513-
num_frequency_bins=256,
523+
num_frequency_bins=257,
514524
num_mel_filters=400,
515525
min_frequency=20,
516526
max_frequency=8000,
@@ -520,8 +530,6 @@ def test_spectrogram_batch_integration_test(self):
520530
triangularize_in_mel_space=True,
521531
)
522532

523-
mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
524-
525533
spec_list = spectrogram_batch(
526534
waveform_list,
527535
window_function(400, "povey", periodic=False),

0 commit comments

Comments
 (0)