forked from lhotse-speech/lhotse
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_recording_set.py
548 lines (451 loc) · 17.4 KB
/
test_recording_set.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
from functools import lru_cache
from math import isclose
import audioread
import numpy as np
import pytest
from pytest import mark, raises
from lhotse.audio import (
AudioMixer,
AudioSource,
DurationMismatchError,
Recording,
RecordingSet,
)
from lhotse.testing.dummies import DummyManifest
from lhotse.utils import INT16MAX, fastcopy
from lhotse.utils import nullcontext as does_not_raise
@pytest.fixture
def recording_set() -> RecordingSet:
    """Provide the recording manifest stored in ``test/fixtures/audio.json``."""
    manifest_path = "test/fixtures/audio.json"
    return RecordingSet.from_json(manifest_path)
@lru_cache(maxsize=1)
def expected_channel_0() -> np.ndarray:
    """Reference samples for test/fixtures/mono_c0.wav.

    The result is the ramp 0..3999 divided by ``INT16MAX``, shaped ``(1, 4000)``.
    The array is cached, so callers share one instance.
    """
    scaled_ramp = np.arange(0, 4000) / INT16MAX
    return scaled_ramp.reshape(1, -1)
@lru_cache(maxsize=1)
def expected_channel_1() -> np.ndarray:
    """Reference samples for test/fixtures/mono_c1.wav.

    The result is the ramp 4000..7999 divided by ``INT16MAX``, shaped ``(1, 4000)``.
    The array is cached, so callers share one instance.
    """
    scaled_ramp = np.arange(4000, 8000) / INT16MAX
    return scaled_ramp.reshape(1, -1)
@lru_cache(maxsize=1)
def expected_stereo_two_sources() -> np.ndarray:
    """Reference samples for test/fixtures/mono_c{0,1}.wav stacked as channels 0 and 1."""
    first_channel = expected_channel_0()
    second_channel = expected_channel_1()
    return np.vstack([first_channel, second_channel])
@lru_cache(maxsize=1)
def expected_stereo_single_source() -> np.ndarray:
    """Reference samples for test/fixtures/stereo.{wav,sph}.

    Two int16 ramps (8000..15999 and 16000..23999) are stacked as rows and
    then divided by ``INT16MAX``.
    """
    channel_a = np.arange(8000, 16000, dtype=np.int16)
    channel_b = np.arange(16000, 24000, dtype=np.int16)
    stacked = np.vstack([channel_a, channel_b])
    return stacked / INT16MAX
def test_get_metadata(recording_set):
    """Check the per-recording metadata accessors of the set for ``recording-1``."""
    rec_id = "recording-1"
    assert recording_set.num_channels(rec_id) == 2
    assert recording_set.sampling_rate(rec_id) == 8000
    assert recording_set.num_samples(rec_id) == 4000
    assert recording_set.duration(rec_id) == 0.5
def test_iteration(recording_set):
    """Iterating over the set must yield ``Recording`` objects only."""
    for item in recording_set:
        assert isinstance(item, Recording)
def test_get_audio_from_multiple_files(recording_set):
    """``recording-1`` must load as the two-source stereo reference signal."""
    reference = expected_stereo_two_sources()
    loaded = recording_set.load_audio("recording-1")
    np.testing.assert_almost_equal(loaded, reference)
def test_get_stereo_audio_from_single_file(recording_set):
    """``recording-2`` must load as the single-source stereo reference signal."""
    reference = expected_stereo_single_source()
    loaded = recording_set.load_audio("recording-2")
    np.testing.assert_almost_equal(loaded, reference)
def test_load_audio_from_sphere_file(recording_set):
    """``recording-3`` must load as the single-source stereo reference signal."""
    reference = expected_stereo_single_source()
    loaded = recording_set.load_audio("recording-3")
    np.testing.assert_almost_equal(loaded, reference)
@mark.parametrize(
    "channels, expected_audio, exception_expectation",
    [
        (None, expected_stereo_two_sources(), does_not_raise()),
        (0, expected_channel_0(), does_not_raise()),
        (1, expected_channel_1(), does_not_raise()),
        ([0, 1], expected_stereo_two_sources(), does_not_raise()),
        (1000, "irrelevant", raises(AssertionError)),
    ],
)
def test_get_audio_multichannel(
    recording_set, channels, expected_audio, exception_expectation
):
    """Load ``recording-1`` with various channel selections.

    Each case supplies the context manager that either lets the load pass
    through or expects an exception; the comparison is only reached when
    loading succeeds.
    """
    with exception_expectation:
        actual_audio = recording_set.load_audio("recording-1", channels=channels)
        np.testing.assert_almost_equal(actual_audio, expected_audio)
@mark.parametrize(
    "begin_at, duration, expected_start_sample, expected_end_sample, "
    "exception_expectation",
    [
        (0, None, 0, 4000, does_not_raise()),
        (0.1, None, 800, 4000, does_not_raise()),
        (0, 0.3, 0, 2400, does_not_raise()),
        (0.1, 0.2, 800, 2400, does_not_raise()),
        # Requested more audio than available.
        (0.3, 10.0, "irrelevant", "irrelevant", raises(DurationMismatchError)),
    ],
)
def test_get_audio_chunks(
    recording_set,
    begin_at,
    duration,
    expected_start_sample,
    expected_end_sample,
    exception_expectation,
):
    """Load a time window of channel 0 of ``recording-1``.

    The loaded chunk is compared against the matching sample range of the
    channel-0 reference signal.
    """
    with exception_expectation:
        chunk = recording_set.load_audio(
            recording_id="recording-1",
            channels=0,
            offset_seconds=begin_at,
            duration_seconds=duration,
        )
        # Slice the reference only after a successful load: in the failing
        # case the sample bounds are placeholder strings.
        reference = expected_channel_0()
        reference_chunk = reference[:, expected_start_sample:expected_end_sample]
        np.testing.assert_almost_equal(chunk, reference_chunk)
def test_add_recording_sets():
    """Adding two dummy sets with adjacent ID ranges must equal the full-range set."""
    whole = DummyManifest(RecordingSet, begin_id=0, end_id=10)
    first_half = DummyManifest(RecordingSet, begin_id=0, end_id=5)
    second_half = DummyManifest(RecordingSet, begin_id=5, end_id=10)
    merged = first_half + second_half
    assert merged.to_eager() == whole
@pytest.mark.parametrize(
    "relative_path_depth, expected_source_path",
    [
        (None, "test/fixtures/stereo.sph"),
        (1, "stereo.sph"),
        (2, "fixtures/stereo.sph"),
        (3, "test/fixtures/stereo.sph"),
        (4, "test/fixtures/stereo.sph"),
    ],
)
def test_recording_from_sphere(relative_path_depth, expected_source_path):
    """Build a ``Recording`` from a SPHERE file with different ``relative_path_depth`` values.

    Only the stored source path is expected to vary between the cases.
    """
    actual = Recording.from_file(
        "test/fixtures/stereo.sph", relative_path_depth=relative_path_depth
    )
    expected_source = AudioSource(
        type="file", channels=[0, 1], source=expected_source_path
    )
    expected = Recording(
        id="stereo",
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
        sources=[expected_source],
    )
    assert actual == expected
@pytest.fixture
def file_source():
    """Single-channel ``AudioSource`` of type ``file`` pointing at mono_c0.wav."""
    wav_path = "test/fixtures/mono_c0.wav"
    return AudioSource(type="file", channels=[0], source=wav_path)
@pytest.fixture
def nonfile_source():
    """Single-channel ``AudioSource`` of type ``command`` that cats mono_c0.wav."""
    shell_command = "cat test/fixtures/mono_c0.wav"
    return AudioSource(type="command", channels=[0], source=shell_command)
@pytest.fixture
def recording(file_source):
    """Two-source ``Recording`` whose sources both point at the ``file_source`` path.

    The second source is a copy of ``file_source`` reassigned to channel 1.
    """
    channel_0_source = file_source
    channel_1_source = fastcopy(file_source, channels=[1])
    return Recording(
        id="rec",
        sources=[channel_0_source, channel_1_source],
        sampling_rate=8000,
        num_samples=4000,
        duration=0.5,
    )
@pytest.mark.parametrize(
    "factor, affix_id",
    [
        (1.0, True),
        (1.0, False),
        (0.9, True),
        (1.1, True),
    ],
)
def test_recording_perturb_speed(recording, factor, affix_id):
    """Speed-perturb a recording and check its ID and loaded audio shape."""
    perturbed = recording.perturb_speed(factor=factor, affix_id=affix_id)
    # The ID carries an ``_sp<factor>`` suffix only when affixing is requested.
    expected_id = f"{recording.id}_sp{factor}" if affix_id else recording.id
    assert perturbed.id == expected_id
    audio = perturbed.load_audio()
    assert audio.shape[0] == perturbed.num_channels
    assert audio.shape[1] == perturbed.num_samples
@pytest.mark.parametrize(
    "factor, affix_id",
    [
        (1.0, True),
        (1.0, False),
        (0.9, True),
        (1.1, True),
    ],
)
def test_recording_perturb_tempo(recording, factor, affix_id):
    """Tempo-perturb a recording and check its ID and loaded audio shape."""
    perturbed = recording.perturb_tempo(factor=factor, affix_id=affix_id)
    # The ID carries a ``_tp<factor>`` suffix only when affixing is requested.
    expected_id = f"{recording.id}_tp{factor}" if affix_id else recording.id
    assert perturbed.id == expected_id
    audio = perturbed.load_audio()
    assert audio.shape[0] == perturbed.num_channels
    assert audio.shape[1] == perturbed.num_samples
@pytest.mark.parametrize(
    "factor, affix_id",
    [
        (1.0, True),
        (1.0, False),
        (0.125, True),
        (0.125, False),
        (2.0, True),
        (2.0, False),
    ],
)
def test_recording_perturb_volume(recording, factor, affix_id):
    """Volume-perturb a recording and check its ID and loaded audio shape."""
    perturbed = recording.perturb_volume(factor=factor, affix_id=affix_id)
    # The ID carries a ``_vp<factor>`` suffix only when affixing is requested.
    expected_id = f"{recording.id}_vp{factor}" if affix_id else recording.id
    assert perturbed.id == expected_id
    audio = perturbed.load_audio()
    assert audio.shape[0] == perturbed.num_channels
    assert audio.shape[1] == perturbed.num_samples
def test_recording_set_perturb_speed(recording_set):
    """Speeding up a whole set must shorten every recording and keep its sampling rate."""
    perturbed_set = recording_set.perturb_speed(factor=1.1)
    for original, perturbed in zip(recording_set, perturbed_set):
        # Faster recording => shorter duration
        assert original.duration > perturbed.duration
        assert original.sampling_rate == perturbed.sampling_rate
def test_recording_set_perturb_tempo(recording_set):
    """Raising the tempo of a whole set must shorten every recording and keep its sampling rate."""
    perturbed_set = recording_set.perturb_tempo(factor=1.1)
    for original, perturbed in zip(recording_set, perturbed_set):
        # Faster recording => shorter duration
        assert original.duration > perturbed.duration
        assert original.sampling_rate == perturbed.sampling_rate
def test_recording_set_perturb_volume(recording_set):
    """Volume perturbation of a whole set must leave duration and sampling rate unchanged."""
    perturbed_set = recording_set.perturb_volume(factor=2.0)
    for original, perturbed in zip(recording_set, perturbed_set):
        assert original.duration == perturbed.duration
        assert original.sampling_rate == perturbed.sampling_rate
@pytest.mark.parametrize("sampling_rate", [8000, 16000, 22050, 32000, 44100, 48000])
def test_recording_resample(recording, sampling_rate):
    """Resample a recording; ID and duration must be kept and the audio shape must match."""
    resampled = recording.resample(sampling_rate)
    assert resampled.id == recording.id
    assert resampled.duration == recording.duration
    audio = resampled.load_audio()
    assert audio.shape[0] == resampled.num_channels
    assert audio.shape[1] == resampled.num_samples
def test_recording_set_resample(recording_set):
    """Resampling a whole set to 44100 Hz must keep durations and increase sample counts."""
    target_rate = 44100
    resampled_set = recording_set.resample(sampling_rate=target_rate)
    for original, resampled in zip(recording_set, resampled_set):
        assert original.duration == resampled.duration
        assert resampled.sampling_rate == target_rate
        assert resampled.num_samples > original.num_samples
@pytest.fixture
def recording_set2(recording):
    """``RecordingSet`` of five copies of ``recording`` with IDs suffixed ``-0`` to ``-4``."""
    copies = []
    for i in range(5):
        copies.append(fastcopy(recording, id=f"{recording.id}-{i}"))
    return RecordingSet.from_recordings(copies)
def test_audio_source_path_prefix(file_source):
    """A file source must have the prefix prepended to its path."""
    prefixed = file_source.with_path_prefix("/data")
    assert str(prefixed.source) == "/data/test/fixtures/mono_c0.wav"
def test_audio_source_nonfile_path_prefix(nonfile_source):
    """A command source must keep its command string unchanged when a prefix is applied."""
    prefixed = nonfile_source.with_path_prefix("/data")
    assert str(prefixed.source) == "cat test/fixtures/mono_c0.wav"
def test_recording_path_prefix(recording):
    """Prefixing a recording must prefix the path of every one of its sources."""
    prefixed = recording.with_path_prefix("/data")
    for src in prefixed.sources:
        assert str(src.source) == "/data/test/fixtures/mono_c0.wav"
def test_recording_set_prefix(recording_set2):
    """Prefixing a recording set must prefix every source of every recording."""
    prefixed_set = recording_set2.with_path_prefix("/data")
    for rec in prefixed_set:
        for src in rec.sources:
            assert str(src.source) == "/data/test/fixtures/mono_c0.wav"
class TestAudioMixer:
    """Tests of ``AudioMixer`` using constant-valued, single-channel signals."""

    @classmethod
    def setup_class(cls):
        """Create two ``(1, 8000)`` float32 signals filled with 1 and 2."""
        cls.audio1 = np.ones(8000, dtype=np.float32).reshape(1, -1)
        cls.audio2 = np.ones(8000, dtype=np.float32).reshape(1, -1) * 2

    def test_simple_mix(self):
        """Mix two aligned signals without SNR scaling: the result is their sum."""
        mixer = AudioMixer(base_audio=self.audio1, sampling_rate=8000)
        mixer.add_to_mix(self.audio2, snr=None, offset=0)

        unmixed = mixer.unmixed_audio
        assert len(unmixed) == 2
        assert all(u.shape == (1, 8000) for u in unmixed)
        assert (unmixed[0] == 1).all()
        assert (unmixed[1] == 2).all()
        assert all(u.dtype == np.float32 for u in unmixed)

        mixed = mixer.mixed_audio
        assert mixed.shape == (1, 8000)
        assert (mixed == 3).all()
        assert mixed.dtype == np.float32

    def test_audio_mixed_with_offset(self):
        """Mix with a 0.5 s offset: tracks are zero-padded to 12000 samples."""
        mixer = AudioMixer(base_audio=self.audio1, sampling_rate=8000)
        mixer.add_to_mix(self.audio2, snr=None, offset=0.5)

        unmixed = mixer.unmixed_audio
        assert len(unmixed) == 2
        assert all(u.shape == (1, 12000) for u in unmixed)
        assert (unmixed[0][:, :8000] == 1).all()
        assert (unmixed[0][:, 8000:] == 0).all()
        assert (unmixed[1][:, :4000] == 0).all()
        assert (unmixed[1][:, 4000:] == 2).all()
        assert all(u.dtype == np.float32 for u in unmixed)

        mixed = mixer.mixed_audio
        assert mixed.shape == (1, 12000)
        assert (mixed[0, :4000] == 1).all()
        assert (mixed[0, 4000:8000] == 3).all()
        assert (mixed[0, 8000:] == 2).all()
        assert mixed.dtype == np.float32

    def test_audio_mixed_with_snr(self):
        """Mix at ``snr=10``: the added signal is expected at amplitude ~0.316."""
        mixer = AudioMixer(base_audio=self.audio1, sampling_rate=8000)
        mixer.add_to_mix(self.audio2, snr=10, offset=0)

        unmixed = mixer.unmixed_audio
        assert len(unmixed) == 2
        assert all(u.shape == (1, 8000) for u in unmixed)
        assert (unmixed[0] == 1).all()
        np.testing.assert_almost_equal(unmixed[1], 0.31622776)
        assert all(u.dtype == np.float32 for u in unmixed)

        mixed = mixer.mixed_audio
        assert mixed.shape == (1, 8000)
        np.testing.assert_almost_equal(mixed[0, :], 1.31622776)
        assert mixed.dtype == np.float32

    def test_audio_mixed_with_offset_and_snr(self):
        """Mix with both a 0.5 s offset and ``snr=10``."""
        mixer = AudioMixer(base_audio=self.audio1, sampling_rate=8000)
        mixer.add_to_mix(self.audio2, snr=10, offset=0.5)

        unmixed = mixer.unmixed_audio
        assert len(unmixed) == 2
        assert all(u.shape == (1, 12000) for u in unmixed)
        assert (unmixed[0][:, :8000] == 1).all()
        assert (unmixed[0][:, 8000:] == 0).all()
        assert (unmixed[1][:, :4000] == 0).all()
        np.testing.assert_almost_equal(unmixed[1][:, 4000:], 0.31622776)
        assert all(u.dtype == np.float32 for u in unmixed)

        mixed = mixer.mixed_audio
        assert mixed.shape == (1, 12000)
        assert (mixed[0, :4000] == 1).all()
        np.testing.assert_almost_equal(mixed[0, 4000:8000], 1.31622776)
        np.testing.assert_almost_equal(mixed[0, 8000:], 0.31622776)
        assert mixed.dtype == np.float32

    def test_audio_mixer_handles_empty_array(self):
        """Adding an empty array must leave the base audio unchanged."""
        # Treat it more like a test of "it runs" rather than "it works"
        sr = 16000
        t = np.linspace(0, 1, sr, dtype=np.float32)
        x1 = np.sin(440.0 * t).reshape(1, -1)

        mixer = AudioMixer(
            base_audio=x1,
            sampling_rate=sr,
        )
        mixer.add_to_mix(np.array([]))

        xmix = mixer.mixed_audio
        np.testing.assert_equal(xmix, x1)

    def test_audio_mixer_handles_empty_array_with_offset(self):
        """Adding an empty array at an offset must keep the first second intact."""
        # Treat it more like a test of "it runs" rather than "it works"
        sr = 16000
        t = np.linspace(0, 1, sr, dtype=np.float32)
        x1 = np.sin(440.0 * t).reshape(1, -1)

        mixer = AudioMixer(
            base_audio=x1,
            sampling_rate=sr,
        )
        mixer.add_to_mix(np.array([]), offset=0.5)

        xmix = mixer.mixed_audio
        # The mixed audio is laid out as (channels, samples), so the time
        # window has to be taken along axis 1. Slicing axis 0 with ``sr``
        # would select the whole array and leave an empty remainder, which
        # makes the padding check below always pass.
        # 0s - 1s: identical
        np.testing.assert_equal(xmix[:, :sr], x1)
        # Past 1s: only zero padding is allowed. This is vacuous when the
        # mixer adds no samples for an empty input.
        np.testing.assert_equal(xmix[:, sr:], 0)
@pytest.mark.skipif(
    not any(
        "ffmpeg" in str(backend).lower()
        for backend in audioread.available_backends()
    ),
    reason="Requires FFmpeg to be installed.",
)
def test_opus_recording_from_file():
    """Read a mono OPUS file without forcing a sampling rate."""
    opus_path = "test/fixtures/mono_c0.opus"
    rec = Recording.from_file(opus_path)
    # OPUS always overrides the sampling rate to 48000
    assert rec.sampling_rate == 48000
    # OPUS may create extra audio frames / samples...
    assert isclose(rec.duration, 0.5054166666666666)
    audio = rec.load_audio()
    n_channels, n_samples = audio.shape
    assert n_channels == rec.num_channels
    assert n_samples == rec.num_samples
    assert n_samples == 24260
    # OPUS file read successfully!
@pytest.mark.skipif(
    not any(
        "ffmpeg" in str(backend).lower()
        for backend in audioread.available_backends()
    ),
    reason="Requires FFmpeg to be installed.",
)
def test_opus_recording_from_file_force_sampling_rate():
    """Read a mono OPUS file with the sampling rate forced to 8000 Hz."""
    opus_path = "test/fixtures/mono_c0.opus"
    rec = Recording.from_file(opus_path, force_opus_sampling_rate=8000)
    assert rec.sampling_rate == 8000
    assert isclose(rec.duration, 0.5055)
    audio = rec.load_audio()
    n_channels, n_samples = audio.shape
    assert n_channels == rec.num_channels
    assert n_samples == rec.num_samples
    assert n_samples == 4044
@pytest.mark.skipif(
    not any(
        "ffmpeg" in str(backend).lower()
        for backend in audioread.available_backends()
    ),
    reason="Requires FFmpeg to be installed.",
)
def test_opus_stereo_recording_from_file_force_sampling_rate():
    """Read a stereo OPUS file with the sampling rate forced to 8000 Hz."""
    opus_path = "test/fixtures/stereo.opus"
    rec = Recording.from_file(opus_path, force_opus_sampling_rate=8000)
    assert rec.sampling_rate == 8000
    assert isclose(rec.duration, 1.0055)
    audio = rec.load_audio()
    n_channels, n_samples = audio.shape
    assert n_channels == rec.num_channels
    assert n_samples == rec.num_samples
    assert n_samples == 8044
@pytest.mark.skipif(
    not any(
        "ffmpeg" in str(backend).lower()
        for backend in audioread.available_backends()
    ),
    reason="Requires FFmpeg to be installed.",
)
def test_opus_stereo_recording_from_file_force_sampling_rate_read_chunk():
    """Read a 0.25 s chunk of a stereo OPUS file and compare it with the full read."""
    opus_path = "test/fixtures/stereo.opus"
    rec = Recording.from_file(opus_path, force_opus_sampling_rate=8000)
    assert rec.sampling_rate == 8000
    assert isclose(rec.duration, 1.0055)
    full_audio = rec.load_audio()
    chunk = rec.load_audio(offset=0.5, duration=0.25)
    n_channels, n_samples = chunk.shape
    assert n_channels == rec.num_channels
    assert n_samples == 2000
    # At 8000 Hz, 0.5 s - 0.75 s corresponds to samples 4000..5999.
    reference_chunk = full_audio[:, 4000:6000]
    np.testing.assert_almost_equal(chunk, reference_chunk, decimal=5)
def test_audio_source_memory_type(recording):
    """A recording moved to memory must load the same audio as the original."""
    in_memory = recording.move_to_memory()
    audio_from_memory = in_memory.load_audio()
    audio_from_original = recording.load_audio()
    np.testing.assert_equal(audio_from_memory, audio_from_original)
def test_recording_from_bytes():
    """A recording built from raw file bytes must load the same audio as one built from the path."""
    path = "test/fixtures/mono_c0.wav"
    recording = Recording.from_file(path)
    # Read through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(path, "rb") as f:
        data = f.read()
    memory_recording = Recording.from_bytes(
        data=data,
        recording_id=recording.id,
    )
    np.testing.assert_equal(memory_recording.load_audio(), recording.load_audio())
def test_memory_recording_dict_serialization():
    """An in-memory recording must survive a ``to_dict`` / ``from_dict`` round trip."""
    path = "test/fixtures/mono_c0.wav"
    # Read through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(path, "rb") as f:
        data = f.read()
    rec = Recording.from_bytes(data=data, recording_id="testrec")
    serialized = rec.to_dict()
    rec_reconstructed = Recording.from_dict(serialized)
    assert rec == rec_reconstructed
    np.testing.assert_equal(rec_reconstructed.load_audio(), rec.load_audio())