Skip to content

Commit

Permalink
Fix multilingual mode bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
mmcauliffe committed Aug 12, 2021
1 parent a3b3101 commit 75798c2
Show file tree
Hide file tree
Showing 12 changed files with 175 additions and 48 deletions.
94 changes: 54 additions & 40 deletions montreal_forced_aligner/textgrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@


def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
if mode == 'word':
mapping = dictionary.reversed_word_mapping
else:
mapping = dictionary.reversed_phone_mapping
file_dict = {}
cur_utt = None
text = None
Expand All @@ -26,13 +22,13 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
duration = float(line[3])
end = round(begin + duration, 4)
label = line[4]
if dictionary.has_multiple:
d = dictionary.get_dictionary(speaker)
else:
d = dictionary
if mode == 'word':
if utt != cur_utt:
if cur_utt != None:
if dictionary.has_multiple:
d = dictionary.get_dictionary(speaker)
else:
d = dictionary
cur_ind = 0
actual_labels = []
for word in text:
Expand All @@ -42,26 +38,14 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
e = -1
for i in ints:
cur = current_labels[cur_ind]
if i == int(cur[2]):
if cur[0] < b:
b = cur[0]
if cur[1] > e:
e = cur[1]
i_begin, i_end, lab = cur
if i == int(lab):
if i_begin < b:
b = i_begin
if i_end > e:
e = i_end
cur_ind += 1
if b == 1000000 or e == -1:
print('UGH')
print(cur_utt)
print(text)
print (word)
print(ints)
print(current_labels[cur_ind])
print(current_labels)
print(dictionary._lookup(word))
print(dictionary.split_clitics(word))
print(word in dictionary.words_mapping)
print(dictionary.clitic_markers)
print(dictionary.clitic_set)
print(re.split(r'[{}]'.format(dictionary.clitic_markers), word, maxsplit=1))
initial, final = re.split(r'[{}]'.format(dictionary.clitic_markers), word, maxsplit=1)
if any(x in final for x in dictionary.clitic_markers):
final = dictionary.split_clitics(final)
Expand All @@ -77,8 +61,6 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
print('FOUND FINAL')
to_return = [initial] + final
final[0] = clitic + final[0]
print(initial, final, to_return)
print(dictionary.words[word])
raise Exception()
lab = [b, e, word]
actual_labels.append(lab)
Expand All @@ -88,10 +70,9 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
seg = corpus.segments[utt]
filename = seg['file_name']
utt_begin = seg['begin']
begin += utt_begin
end += utt_begin
else:
filename = utt
utt_begin = 0
if filename not in file_dict:
file_dict[filename] = {}
if speaker not in file_dict[filename]:
Expand All @@ -100,9 +81,15 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
text = corpus.text_mapping[utt].split()
current_labels = []

begin += utt_begin
end += utt_begin
current_labels.append([begin, end, label])
else:
speaker = corpus.utt_speak_mapping[utt]
if dictionary.has_multiple:
d = dictionary.get_dictionary(speaker)
else:
d = dictionary
if utt in corpus.segments:
seg = corpus.segments[utt]
filename = seg['file_name']
Expand All @@ -125,6 +112,10 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):

cur_ind = 0
actual_labels = []
if dictionary.has_multiple:
d = dictionary.get_dictionary(speaker)
else:
d = dictionary
for word in text:

ints = d.to_int(word)
Expand All @@ -150,9 +141,10 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
def map_to_original_pronunciation(phones, subpronunciations, strip_diacritics, digraphs):
transcription = tuple(x[2] for x in phones)
new_phones = []
pron = None
mapping_ind = 0
transcription_ind = 0
for pronunciations in subpronunciations:
pron = None
if mapping_ind >= len(phones):
break
for p in pronunciations:
Expand All @@ -161,10 +153,13 @@ def map_to_original_pronunciation(phones, subpronunciations, strip_diacritics, d
new_phones.extend(phones)
mapping_ind += len(phones)
break
if p['pronunciation'] == transcription and pron is None:

if p['pronunciation'] == transcription[transcription_ind: transcription_ind+len(p['pronunciation'])] \
and pron is None:
pron = p
if mapping_ind >= len(phones):
break
transcription_ind += len(pron['pronunciation'])
if not pron:
new_phones.extend(phones)
mapping_ind += len(phones)
Expand All @@ -174,16 +169,16 @@ def map_to_original_pronunciation(phones, subpronunciations, strip_diacritics, d
new_phones.extend(phones)
mapping_ind += len(phones)
break
for p in p['original_pronunciation']:
if p == phones[mapping_ind][2]:
for pi in p['original_pronunciation']:
if pi == phones[mapping_ind][2]:
new_phones.append(phones[mapping_ind])
else:
modded_phone = p
modded_phone = pi
new_p = phones[mapping_ind][2]
for diacritic in strip_diacritics:
modded_phone = modded_phone.replace(diacritic, '')
if modded_phone == new_p:
phones[mapping_ind][2] = p
phones[mapping_ind][2] = pi
new_phones.append(phones[mapping_ind])
elif mapping_ind != len(phones) - 1:
new_p = phones[mapping_ind][2] + phones[mapping_ind + 1][2]
Expand Down Expand Up @@ -239,7 +234,8 @@ def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, fram
phone_ind += 1
if phone_ind > len(phone_ctm[k][speaker]) - 1:
break
phones.extend(map_to_original_pronunciation(cur_phones, subprons, dictionary.strip_diacritics, dictionary.digraphs))
phones.extend(map_to_original_pronunciation(cur_phones, subprons,
dictionary.strip_diacritics, dictionary.digraphs))
if not word:
continue

Expand Down Expand Up @@ -288,6 +284,7 @@ def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, fram
speaker_directory = os.path.join(out_directory, corpus.file_directory_mapping[filename])
except KeyError:
speaker_directory = out_directory
os.makedirs(speaker_directory, exist_ok=True)
tg = tgio.Textgrid()
tg.minTimestamp = 0
tg.maxTimestamp = max_time
Expand All @@ -302,12 +299,14 @@ def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, fram
phones = []
if dictionary.multilingual_ipa:
phone_ind = 0
phone_tier_len = len(phone_ctm[filename][speaker])
for interval in word_ctm[filename][speaker]:
if max_time - interval[1] < frame_shift: # Fix rounding issues
interval[1] = max_time
end = interval[1]
word = interval[2]
subwords = d._lookup(word)
subwords = [x if x in d.words_mapping else d.oov_code for x in subwords ]
subprons = [d.words[x] for x in subwords]
cur_phones = []
while phone_ind <= len(phone_ctm[filename][speaker]) - 1 and phone_ctm[filename][speaker][phone_ind][1] <= end:
p = phone_ctm[filename][speaker][phone_ind]
Expand All @@ -318,7 +317,11 @@ def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, fram
continue
cur_phones.append(p)
phone_ind += 1
phones.extend(map_to_original_pronunciation(cur_phones, d.words[word], dictionary.strip_diacritics, dictionary.digraphs))
if phone_ind > len(phone_ctm[filename][speaker]) - 1:
break
phones.extend(map_to_original_pronunciation(cur_phones, subprons, dictionary.strip_diacritics, dictionary.digraphs))
if not word:
continue

words.append(interval)

Expand All @@ -337,7 +340,18 @@ def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, fram
phone_tier = tgio.IntervalTier(phone_tier_name, phones, minT=0, maxT=max_time)
tg.addTier(word_tier)
tg.addTier(phone_tier)
tg.save(os.path.join(speaker_directory, filename + '.TextGrid'), useShortForm=False)
relative = corpus.file_directory_mapping[filename]

if relative:
speaker_directory = os.path.join(out_directory, relative)
else:
speaker_directory = out_directory
os.makedirs(speaker_directory, exist_ok=True)
if filename in corpus.file_name_mapping:
output_name = corpus.file_name_mapping[filename]
else:
output_name = filename
tg.save(os.path.join(speaker_directory, output_name + '.TextGrid'), useShortForm=False)
except Exception as e:
exc_type, exc_value, exc_traceback = sys.exc_info()
textgrid_write_errors[filename] = '\n'.join(
Expand Down
6 changes: 4 additions & 2 deletions montreal_forced_aligner/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,13 +181,15 @@ def setup(self):
self.dictionary.write(disambig=True)
if self.ignore_acoustics:
fc = None
self.logger.info('Skipping acoustic feature generation')
if self.logger is not None:
self.logger.info('Skipping acoustic feature generation')
else:
fc = self.trainer.feature_config
try:
self.corpus.initialize_corpus(self.dictionary, fc)
except CorpusError:
self.logger.warning('There was an error when initializing the corpus, likely due to missing sound files. Ignoring acoustic generation...')
if self.logger is not None:
self.logger.warning('There was an error when initializing the corpus, likely due to missing sound files. Ignoring acoustic generation...')
self.ignore_acoustics = True

def analyze_setup(self):
Expand Down
64 changes: 61 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import shutil
import pytest
import yaml

from montreal_forced_aligner.corpus import AlignableCorpus, TranscribeCorpus
from montreal_forced_aligner.dictionary import Dictionary
Expand Down Expand Up @@ -74,6 +75,28 @@ def config_dir(generated_dir):
def english_acoustic_model():
from montreal_forced_aligner.command_line.download import download_model
download_model('acoustic', 'english')
return 'english'


@pytest.fixture(scope='session')
def english_ipa_acoustic_model():
    """Download the pretrained 'english_ipa' acoustic model and return its name.

    Session-scoped so the download happens at most once per test run.
    """
    from montreal_forced_aligner.command_line.download import download_model
    model_name = 'english_ipa'
    download_model('acoustic', model_name)
    return model_name


@pytest.fixture(scope='session')
def english_us_ipa_dictionary():
    """Download the pretrained 'english_us_ipa' dictionary and return its name.

    Session-scoped so the download happens at most once per test run.
    """
    from montreal_forced_aligner.command_line.download import download_model
    dictionary_name = 'english_us_ipa'
    download_model('dictionary', dictionary_name)
    return dictionary_name


@pytest.fixture(scope='session')
def english_uk_ipa_dictionary():
    """Download the pretrained 'english_uk_ipa' dictionary and return its name.

    Session-scoped so the download happens at most once per test run.
    """
    from montreal_forced_aligner.command_line.download import download_model
    dictionary_name = 'english_uk_ipa'
    download_model('dictionary', dictionary_name)
    return dictionary_name


@pytest.fixture(scope='session')
Expand Down Expand Up @@ -135,6 +158,20 @@ def basic_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
return path


@pytest.fixture(scope='session')
def multilingual_ipa_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
    """Assemble a two-speaker multilingual IPA test corpus and return its path.

    Each speaker gets a subdirectory containing a .flac copied from *wav_dir*
    and a matching .txt transcript copied from *lab_dir*.
    """
    path = os.path.join(corpus_root_dir, 'multilingual')
    os.makedirs(path, exist_ok=True)
    speaker_files = {
        'speaker': ['multilingual_ipa'],
        'speaker_two': ['multilingual_ipa_us'],
    }
    for speaker, file_names in speaker_files.items():
        speaker_dir = os.path.join(path, speaker)
        os.makedirs(speaker_dir, exist_ok=True)
        for file_name in file_names:
            for ext, source_dir in (('.flac', wav_dir), ('.txt', lab_dir)):
                shutil.copyfile(os.path.join(source_dir, file_name + ext),
                                os.path.join(speaker_dir, file_name + ext))
    return path


@pytest.fixture(scope='session')
def weird_words_dir(corpus_root_dir, wav_dir, lab_dir):
path = os.path.join(corpus_root_dir, 'weird_words')
Expand Down Expand Up @@ -340,6 +377,20 @@ def sick_dict_path(dict_dir):
return os.path.join(dict_dir, 'sick.txt')


@pytest.fixture(scope='session')
def acoustic_dict_path(dict_dir):
    """Path to the 'acoustic.txt' test dictionary inside the test data dict directory."""
    return os.path.join(dict_dir, 'acoustic.txt')


@pytest.fixture(scope='session')
def speaker_dictionary_path(sick_dict_path, acoustic_dict_path, generated_dir):
    """Write a per-speaker dictionary mapping YAML and return its path.

    Maps 'default' to the acoustic dictionary and 'sickmichael' to the sick
    dictionary, serialized into generated_dir/sick_acoustic_dicts.yaml.
    """
    mapping = {
        'default': acoustic_dict_path,
        'sickmichael': sick_dict_path,
    }
    out_path = os.path.join(generated_dir, 'sick_acoustic_dicts.yaml')
    with open(out_path, 'w') as handle:
        yaml.safe_dump(mapping, handle)
    return out_path


@pytest.fixture(scope='session')
def acoustic_corpus_wav_path(basic_dir):
return os.path.join(basic_dir, 'acoustic_corpus.wav')
Expand Down Expand Up @@ -593,9 +644,16 @@ def ivector_train_config(config_directory):


@pytest.fixture(scope='session')
def multispeaker_dictionary_config(config_dir, sick_dict_path):
import yaml
path = os.path.join(config_dir, 'multispeaker_dictionary.yaml')
def multispeaker_dictionary_config(generated_dir, sick_dict_path):
path = os.path.join(generated_dir, 'multispeaker_dictionary.yaml')
with open(path, 'w', encoding='utf8') as f:
yaml.safe_dump({'default': 'english', 'michael': sick_dict_path}, f)
return path


@pytest.fixture(scope='session')
def ipa_speaker_dict_path(generated_dir, english_uk_ipa_dictionary, english_us_ipa_dictionary):
    """Write a per-speaker IPA dictionary mapping YAML and return its path.

    'default' maps to the US IPA dictionary, 'speaker' to the UK one.
    """
    mapping = {
        'default': english_us_ipa_dictionary,
        'speaker': english_uk_ipa_dictionary,
    }
    out_path = os.path.join(generated_dir, 'multispeaker_ipa_dictionary.yaml')
    with open(out_path, 'w', encoding='utf8') as handle:
        yaml.safe_dump(mapping, handle)
    return out_path
4 changes: 2 additions & 2 deletions tests/data/configs/basic_train_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
beam: 10
retry_beam: 40
beam: 100
retry_beam: 400
use_mp: false

features:
Expand Down
1 change: 1 addition & 0 deletions tests/data/lab/multilingual_ipa.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
i can't think of an animal that's less chad-like than a sloth
1 change: 1 addition & 0 deletions tests/data/lab/multilingual_ipa_us.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
uh with only like four games to go
Binary file added tests/data/wav/multilingual_ipa.flac
Binary file not shown.
Binary file added tests/data/wav/multilingual_ipa_us.flac
Binary file not shown.
17 changes: 17 additions & 0 deletions tests/test_commandline_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,23 @@ def test_align_basic(basic_corpus_dir, sick_dict_path, generated_dir, large_data
args, unknown = parser.parse_known_args(command)
run_align_corpus(args, unknown)

def test_align_multilingual(multilingual_ipa_corpus_dir, english_uk_ipa_dictionary, generated_dir, temp_dir,
                            basic_align_config, english_acoustic_model, english_ipa_acoustic_model):
    """Run the `align` CLI end-to-end on the multilingual IPA corpus with a single UK IPA dictionary."""
    output_directory = os.path.join(generated_dir, 'multilingual')
    command = ['align', multilingual_ipa_corpus_dir, english_uk_ipa_dictionary,
               english_ipa_acoustic_model, output_directory,
               '-t', temp_dir, '-c', basic_align_config, '-q', '--clean', '-d']
    args, unknown = parser.parse_known_args(command)
    run_align_corpus(args, unknown)

def test_align_multilingual_speaker_dict(multilingual_ipa_corpus_dir, ipa_speaker_dict_path, generated_dir, temp_dir,
                                         basic_align_config, english_acoustic_model, english_ipa_acoustic_model):
    """Run the `align` CLI on the multilingual IPA corpus using a per-speaker dictionary YAML."""
    output_directory = os.path.join(generated_dir, 'multilingual_speaker_dict')
    command = ['align', multilingual_ipa_corpus_dir, ipa_speaker_dict_path,
               english_ipa_acoustic_model, output_directory,
               '-t', temp_dir, '-c', basic_align_config, '-q', '--clean', '-d']
    args, unknown = parser.parse_known_args(command)
    run_align_corpus(args, unknown)

def test_align_stereo(stereo_corpus_dir, sick_dict_path, generated_dir, large_dataset_dictionary, temp_dir,
basic_align_config, english_acoustic_model):

Expand Down
Loading

0 comments on commit 75798c2

Please sign in to comment.