forked from MontrealCorpusTools/Montreal-Forced-Aligner
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_g2p.py
82 lines (68 loc) · 2.88 KB
/
test_g2p.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
from montreal_forced_aligner.dictionary import MultispeakerDictionary
from montreal_forced_aligner.g2p.generator import (
PyniniCorpusGenerator,
PyniniWordListGenerator,
clean_up_word,
)
from montreal_forced_aligner.g2p.trainer import PyniniTrainer
from montreal_forced_aligner.helper import mfa_open
from montreal_forced_aligner.models import G2PModel
from montreal_forced_aligner.utils import get_mfa_version
def test_clean_up_word():
    """Check the result of ``clean_up_word`` for a word with a leading ``+``.

    The word ``"+abc"`` is cleaned against the grapheme set ``{"a", "b", "c"}``.
    The call is expected to return the word ``"abc"`` together with the set
    ``{"+"}``.
    """
    graphemes = {"a", "b", "c"}
    word, extras = clean_up_word("+abc", graphemes)
    assert word == "abc"
    assert extras == {"+"}
def test_check_bracketed(basic_dict_path):
    """Verify ``check_bracketed`` on bracketed words, plain words and an empty string.

    A ``MultispeakerDictionary`` is built from ``basic_dict_path`` (presumably a
    pytest fixture; confirm in conftest). Every candidate for which
    ``check_bracketed`` returns a falsy value is kept, and only the plain words
    and the empty string are expected to remain.
    """
    dictionary = MultispeakerDictionary(dictionary_path=basic_dict_path)
    candidates = ["uh", "(the)", "sick", "<corpus>", "[a]", "{cold}", ""]

    kept = []
    for candidate in candidates:
        if dictionary.check_bracketed(candidate):
            continue
        kept.append(candidate)

    assert kept == ["uh", "sick", ""]
def test_training(basic_dict_path, basic_g2p_model_path, temp_dir):
    """Train a Pynini G2P model, export it, and check the exported metadata.

    The trainer is set up and run with ``random_starts=1`` and
    ``num_iterations=5`` (presumably kept small so the test stays fast), then
    the model is exported to ``basic_g2p_model_path`` and re-loaded through
    ``G2PModel``. The loaded metadata must report the current MFA version, the
    ``"pynini"`` architecture, and the same phones and graphemes as the trainer.

    Parameters
    ----------
    basic_dict_path
        Path to the pronunciation dictionary used for training.
    basic_g2p_model_path
        Path where the trained model is exported and then re-loaded from.
    temp_dir
        Directory used as the trainer's temporary directory and as the
        ``root_directory`` for the re-loaded model.
    """
    trainer = PyniniTrainer(
        dictionary_path=basic_dict_path,
        temporary_directory=temp_dir,
        random_starts=1,
        num_iterations=5,
        evaluate=True,
    )
    try:
        trainer.setup()
        trainer.train()
        trainer.export_model(basic_g2p_model_path)

        model = G2PModel(basic_g2p_model_path, root_directory=temp_dir)
        assert model.meta["version"] == get_mfa_version()
        assert model.meta["architecture"] == "pynini"
        assert model.meta["phones"] == trainer.non_silence_phones
        assert model.meta["graphemes"] == trainer.g2p_training_graphemes
    finally:
        # Always release the trainer, even when a step or assertion above
        # fails, so a failing run does not leave it un-cleaned for later tests.
        trainer.cleanup()
def test_generator(basic_g2p_model_path, basic_corpus_dir, g2p_basic_output, temp_dir):
    """Generate pronunciations for a corpus with a trained G2P model.

    After setup, the G2P model's ``validate`` is expected to return a falsy
    value for the full corpus word set and a truthy value once the words
    flagged by ``check_bracketed`` are filtered out. The pronunciations are
    then exported and the output path must exist.

    Parameters
    ----------
    basic_g2p_model_path
        Path to the G2P model used by the generator.
    basic_corpus_dir
        Corpus directory whose words are run through the generator.
    g2p_basic_output
        Path the pronunciations are exported to.
    temp_dir
        Parent directory for the ``g2p_tests`` temporary directory.
    """
    output_directory = os.path.join(temp_dir, "g2p_tests")
    gen = PyniniCorpusGenerator(
        g2p_model_path=basic_g2p_model_path,
        corpus_directory=basic_corpus_dir,
        temporary_directory=output_directory,
    )
    try:
        gen.setup()

        # The unfiltered word set must fail validation ...
        assert not gen.g2p_model.validate(gen.corpus_word_set)
        # ... while the non-bracketed words alone must pass it.
        unbracketed = [x for x in gen.corpus_word_set if not gen.check_bracketed(x)]
        assert gen.g2p_model.validate(unbracketed)

        gen.export_pronunciations(g2p_basic_output)
        assert os.path.exists(g2p_basic_output)
    finally:
        # Always release the generator, even when a step or assertion above
        # fails, so a failing run does not leave it un-cleaned for later tests.
        gen.cleanup()
def test_generator_pretrained(english_g2p_model, temp_dir):
    """Generate pronunciations for a word list with a pretrained G2P model.

    Three words are written one per line to ``word_list.txt`` under
    ``<temp_dir>/g2p_tests``. The generator is asked for
    ``num_pronunciations=3`` and the result for ``"petted"`` must contain
    exactly three entries.

    Parameters
    ----------
    english_g2p_model
        Path to the G2P model passed to the generator as ``g2p_model_path``.
    temp_dir
        Parent directory for the ``g2p_tests`` working directory.
    """
    words = ["petted", "petted-patted", "pedal"]
    output_directory = os.path.join(temp_dir, "g2p_tests")
    word_list_path = os.path.join(output_directory, "word_list.txt")
    os.makedirs(output_directory, exist_ok=True)
    with mfa_open(word_list_path, "w") as f:
        # One word per line, each terminated by a newline.
        f.writelines(w + "\n" for w in words)

    gen = PyniniWordListGenerator(
        g2p_model_path=english_g2p_model, word_list_path=word_list_path, num_pronunciations=3
    )
    try:
        gen.setup()
        results = gen.generate_pronunciations()
        # Printed so the generated pronunciations appear in the test output.
        print(results)
        assert len(results["petted"]) == 3
    finally:
        # Always release the generator, even when a step or assertion above
        # fails, so a failing run does not leave it un-cleaned for later tests.
        gen.cleanup()