Skip to content

Commit

Permalink
Fix multilingual mode bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
mmcauliffe committed Aug 12, 2021
1 parent a3b3101 commit 75798c2
Show file tree
Hide file tree
Showing 12 changed files with 175 additions and 48 deletions.
94 changes: 54 additions & 40 deletions montreal_forced_aligner/textgrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@


def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
if mode == 'word':
mapping = dictionary.reversed_word_mapping
else:
mapping = dictionary.reversed_phone_mapping
file_dict = {}
cur_utt = None
text = None
Expand All @@ -26,13 +22,13 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
duration = float(line[3])
end = round(begin + duration, 4)
label = line[4]
if dictionary.has_multiple:
d = dictionary.get_dictionary(speaker)
else:
d = dictionary
if mode == 'word':
if utt != cur_utt:
if cur_utt != None:
if dictionary.has_multiple:
d = dictionary.get_dictionary(speaker)
else:
d = dictionary
cur_ind = 0
actual_labels = []
for word in text:
Expand All @@ -42,26 +38,14 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
e = -1
for i in ints:
cur = current_labels[cur_ind]
if i == int(cur[2]):
if cur[0] < b:
b = cur[0]
if cur[1] > e:
e = cur[1]
i_begin, i_end, lab = cur
if i == int(lab):
if i_begin < b:
b = i_begin
if i_end > e:
e = i_end
cur_ind += 1
if b == 1000000 or e == -1:
print('UGH')
print(cur_utt)
print(text)
print (word)
print(ints)
print(current_labels[cur_ind])
print(current_labels)
print(dictionary._lookup(word))
print(dictionary.split_clitics(word))
print(word in dictionary.words_mapping)
print(dictionary.clitic_markers)
print(dictionary.clitic_set)
print(re.split(r'[{}]'.format(dictionary.clitic_markers), word, maxsplit=1))
initial, final = re.split(r'[{}]'.format(dictionary.clitic_markers), word, maxsplit=1)
if any(x in final for x in dictionary.clitic_markers):
final = dictionary.split_clitics(final)
Expand All @@ -77,8 +61,6 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
print('FOUND FINAL')
to_return = [initial] + final
final[0] = clitic + final[0]
print(initial, final, to_return)
print(dictionary.words[word])
raise Exception()
lab = [b, e, word]
actual_labels.append(lab)
Expand All @@ -88,10 +70,9 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
seg = corpus.segments[utt]
filename = seg['file_name']
utt_begin = seg['begin']
begin += utt_begin
end += utt_begin
else:
filename = utt
utt_begin = 0
if filename not in file_dict:
file_dict[filename] = {}
if speaker not in file_dict[filename]:
Expand All @@ -100,9 +81,15 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
text = corpus.text_mapping[utt].split()
current_labels = []

begin += utt_begin
end += utt_begin
current_labels.append([begin, end, label])
else:
speaker = corpus.utt_speak_mapping[utt]
if dictionary.has_multiple:
d = dictionary.get_dictionary(speaker)
else:
d = dictionary
if utt in corpus.segments:
seg = corpus.segments[utt]
filename = seg['file_name']
Expand All @@ -125,6 +112,10 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):

cur_ind = 0
actual_labels = []
if dictionary.has_multiple:
d = dictionary.get_dictionary(speaker)
else:
d = dictionary
for word in text:

ints = d.to_int(word)
Expand All @@ -150,9 +141,10 @@ def parse_ctm(ctm_path, corpus, dictionary, mode='word'):
def map_to_original_pronunciation(phones, subpronunciations, strip_diacritics, digraphs):
transcription = tuple(x[2] for x in phones)
new_phones = []
pron = None
mapping_ind = 0
transcription_ind = 0
for pronunciations in subpronunciations:
pron = None
if mapping_ind >= len(phones):
break
for p in pronunciations:
Expand All @@ -161,10 +153,13 @@ def map_to_original_pronunciation(phones, subpronunciations, strip_diacritics, d
new_phones.extend(phones)
mapping_ind += len(phones)
break
if p['pronunciation'] == transcription and pron is None:

if p['pronunciation'] == transcription[transcription_ind: transcription_ind+len(p['pronunciation'])] \
and pron is None:
pron = p
if mapping_ind >= len(phones):
break
transcription_ind += len(pron['pronunciation'])
if not pron:
new_phones.extend(phones)
mapping_ind += len(phones)
Expand All @@ -174,16 +169,16 @@ def map_to_original_pronunciation(phones, subpronunciations, strip_diacritics, d
new_phones.extend(phones)
mapping_ind += len(phones)
break
for p in p['original_pronunciation']:
if p == phones[mapping_ind][2]:
for pi in p['original_pronunciation']:
if pi == phones[mapping_ind][2]:
new_phones.append(phones[mapping_ind])
else:
modded_phone = p
modded_phone = pi
new_p = phones[mapping_ind][2]
for diacritic in strip_diacritics:
modded_phone = modded_phone.replace(diacritic, '')
if modded_phone == new_p:
phones[mapping_ind][2] = p
phones[mapping_ind][2] = pi
new_phones.append(phones[mapping_ind])
elif mapping_ind != len(phones) - 1:
new_p = phones[mapping_ind][2] + phones[mapping_ind + 1][2]
Expand Down Expand Up @@ -239,7 +234,8 @@ def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, fram
phone_ind += 1
if phone_ind > len(phone_ctm[k][speaker]) - 1:
break
phones.extend(map_to_original_pronunciation(cur_phones, subprons, dictionary.strip_diacritics, dictionary.digraphs))
phones.extend(map_to_original_pronunciation(cur_phones, subprons,
dictionary.strip_diacritics, dictionary.digraphs))
if not word:
continue

Expand Down Expand Up @@ -288,6 +284,7 @@ def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, fram
speaker_directory = os.path.join(out_directory, corpus.file_directory_mapping[filename])
except KeyError:
speaker_directory = out_directory
os.makedirs(speaker_directory, exist_ok=True)
tg = tgio.Textgrid()
tg.minTimestamp = 0
tg.maxTimestamp = max_time
Expand All @@ -302,12 +299,14 @@ def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, fram
phones = []
if dictionary.multilingual_ipa:
phone_ind = 0
phone_tier_len = len(phone_ctm[filename][speaker])
for interval in word_ctm[filename][speaker]:
if max_time - interval[1] < frame_shift: # Fix rounding issues
interval[1] = max_time
end = interval[1]
word = interval[2]
subwords = d._lookup(word)
subwords = [x if x in d.words_mapping else d.oov_code for x in subwords ]
subprons = [d.words[x] for x in subwords]
cur_phones = []
while phone_ind <= len(phone_ctm[filename][speaker]) - 1 and phone_ctm[filename][speaker][phone_ind][1] <= end:
p = phone_ctm[filename][speaker][phone_ind]
Expand All @@ -318,7 +317,11 @@ def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, fram
continue
cur_phones.append(p)
phone_ind += 1
phones.extend(map_to_original_pronunciation(cur_phones, d.words[word], dictionary.strip_diacritics, dictionary.digraphs))
if phone_ind > len(phone_ctm[filename][speaker]) - 1:
break
phones.extend(map_to_original_pronunciation(cur_phones, subprons, dictionary.strip_diacritics, dictionary.digraphs))
if not word:
continue

words.append(interval)

Expand All @@ -337,7 +340,18 @@ def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, fram
phone_tier = tgio.IntervalTier(phone_tier_name, phones, minT=0, maxT=max_time)
tg.addTier(word_tier)
tg.addTier(phone_tier)
tg.save(os.path.join(speaker_directory, filename + '.TextGrid'), useShortForm=False)
relative = corpus.file_directory_mapping[filename]

if relative:
speaker_directory = os.path.join(out_directory, relative)
else:
speaker_directory = out_directory
os.makedirs(speaker_directory, exist_ok=True)
if filename in corpus.file_name_mapping:
output_name = corpus.file_name_mapping[filename]
else:
output_name = filename
tg.save(os.path.join(speaker_directory, output_name + '.TextGrid'), useShortForm=False)
except Exception as e:
exc_type, exc_value, exc_traceback = sys.exc_info()
textgrid_write_errors[filename] = '\n'.join(
Expand Down
6 changes: 4 additions & 2 deletions montreal_forced_aligner/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,13 +181,15 @@ def setup(self):
self.dictionary.write(disambig=True)
if self.ignore_acoustics:
fc = None
self.logger.info('Skipping acoustic feature generation')
if self.logger is not None:
self.logger.info('Skipping acoustic feature generation')
else:
fc = self.trainer.feature_config
try:
self.corpus.initialize_corpus(self.dictionary, fc)
except CorpusError:
self.logger.warning('There was an error when initializing the corpus, likely due to missing sound files. Ignoring acoustic generation...')
if self.logger is not None:
self.logger.warning('There was an error when initializing the corpus, likely due to missing sound files. Ignoring acoustic generation...')
self.ignore_acoustics = True

def analyze_setup(self):
Expand Down
64 changes: 61 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import shutil
import pytest
import yaml

from montreal_forced_aligner.corpus import AlignableCorpus, TranscribeCorpus
from montreal_forced_aligner.dictionary import Dictionary
Expand Down Expand Up @@ -74,6 +75,28 @@ def config_dir(generated_dir):
def english_acoustic_model():
from montreal_forced_aligner.command_line.download import download_model
download_model('acoustic', 'english')
return 'english'


@pytest.fixture(scope='session')
def english_ipa_acoustic_model():
    """Download the pretrained 'english_ipa' acoustic model and return its name.

    Session-scoped so the download happens at most once per test run.
    """
    from montreal_forced_aligner.command_line.download import download_model
    model_name = 'english_ipa'
    download_model('acoustic', model_name)
    return model_name


@pytest.fixture(scope='session')
def english_us_ipa_dictionary():
    """Download the pretrained 'english_us_ipa' dictionary and return its name.

    Session-scoped so the download happens at most once per test run.
    """
    from montreal_forced_aligner.command_line.download import download_model
    dictionary_name = 'english_us_ipa'
    download_model('dictionary', dictionary_name)
    return dictionary_name


@pytest.fixture(scope='session')
def english_uk_ipa_dictionary():
    """Download the pretrained 'english_uk_ipa' dictionary and return its name.

    Session-scoped so the download happens at most once per test run.
    """
    from montreal_forced_aligner.command_line.download import download_model
    dictionary_name = 'english_uk_ipa'
    download_model('dictionary', dictionary_name)
    return dictionary_name


@pytest.fixture(scope='session')
Expand Down Expand Up @@ -135,6 +158,20 @@ def basic_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
return path


@pytest.fixture(scope='session')
def multilingual_ipa_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
    """Assemble a two-speaker multilingual IPA test corpus and return its path.

    Each speaker gets a subdirectory containing a .flac copied from *wav_dir*
    and a matching .txt transcript copied from *lab_dir*.
    """
    path = os.path.join(corpus_root_dir, 'multilingual')
    os.makedirs(path, exist_ok=True)
    speaker_files = {
        'speaker': ['multilingual_ipa'],
        'speaker_two': ['multilingual_ipa_us'],
    }
    for speaker, file_names in speaker_files.items():
        speaker_dir = os.path.join(path, speaker)
        os.makedirs(speaker_dir, exist_ok=True)
        for file_name in file_names:
            for ext, source_dir in (('.flac', wav_dir), ('.txt', lab_dir)):
                shutil.copyfile(os.path.join(source_dir, file_name + ext),
                                os.path.join(speaker_dir, file_name + ext))
    return path


@pytest.fixture(scope='session')
def weird_words_dir(corpus_root_dir, wav_dir, lab_dir):
path = os.path.join(corpus_root_dir, 'weird_words')
Expand Down Expand Up @@ -340,6 +377,20 @@ def sick_dict_path(dict_dir):
return os.path.join(dict_dir, 'sick.txt')


@pytest.fixture(scope='session')
def acoustic_dict_path(dict_dir):
    """Path to the 'acoustic.txt' test dictionary inside the test data dict directory."""
    return os.path.join(dict_dir, 'acoustic.txt')


@pytest.fixture(scope='session')
def speaker_dictionary_path(sick_dict_path, acoustic_dict_path, generated_dir):
    """Write a per-speaker dictionary mapping YAML and return its path.

    Maps 'default' to the acoustic dictionary and 'sickmichael' to the sick
    dictionary, serialized into generated_dir/sick_acoustic_dicts.yaml.
    """
    mapping = {
        'default': acoustic_dict_path,
        'sickmichael': sick_dict_path,
    }
    out_path = os.path.join(generated_dir, 'sick_acoustic_dicts.yaml')
    with open(out_path, 'w') as handle:
        yaml.safe_dump(mapping, handle)
    return out_path


@pytest.fixture(scope='session')
def acoustic_corpus_wav_path(basic_dir):
return os.path.join(basic_dir, 'acoustic_corpus.wav')
Expand Down Expand Up @@ -593,9 +644,16 @@ def ivector_train_config(config_directory):


@pytest.fixture(scope='session')
def multispeaker_dictionary_config(config_dir, sick_dict_path):
import yaml
path = os.path.join(config_dir, 'multispeaker_dictionary.yaml')
def multispeaker_dictionary_config(generated_dir, sick_dict_path):
path = os.path.join(generated_dir, 'multispeaker_dictionary.yaml')
with open(path, 'w', encoding='utf8') as f:
yaml.safe_dump({'default': 'english', 'michael': sick_dict_path}, f)
return path


@pytest.fixture(scope='session')
def ipa_speaker_dict_path(generated_dir, english_uk_ipa_dictionary, english_us_ipa_dictionary):
    """Write a per-speaker IPA dictionary mapping YAML and return its path.

    'default' maps to the US IPA dictionary, 'speaker' to the UK one.
    """
    mapping = {
        'default': english_us_ipa_dictionary,
        'speaker': english_uk_ipa_dictionary,
    }
    out_path = os.path.join(generated_dir, 'multispeaker_ipa_dictionary.yaml')
    with open(out_path, 'w', encoding='utf8') as handle:
        yaml.safe_dump(mapping, handle)
    return out_path
4 changes: 2 additions & 2 deletions tests/data/configs/basic_train_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
beam: 10
retry_beam: 40
beam: 100
retry_beam: 400
use_mp: false

features:
Expand Down
1 change: 1 addition & 0 deletions tests/data/lab/multilingual_ipa.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
i can't think of an animal that's less chad-like than a sloth
1 change: 1 addition & 0 deletions tests/data/lab/multilingual_ipa_us.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
uh with only like four games to go
Binary file added tests/data/wav/multilingual_ipa.flac
Binary file not shown.
Binary file added tests/data/wav/multilingual_ipa_us.flac
Binary file not shown.
17 changes: 17 additions & 0 deletions tests/test_commandline_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,23 @@ def test_align_basic(basic_corpus_dir, sick_dict_path, generated_dir, large_data
args, unknown = parser.parse_known_args(command)
run_align_corpus(args, unknown)

def test_align_multilingual(multilingual_ipa_corpus_dir, english_uk_ipa_dictionary, generated_dir, temp_dir,
                            basic_align_config, english_acoustic_model, english_ipa_acoustic_model):
    """Run the `align` CLI end-to-end on the multilingual IPA corpus with a single UK IPA dictionary."""
    output_directory = os.path.join(generated_dir, 'multilingual')
    command = ['align', multilingual_ipa_corpus_dir, english_uk_ipa_dictionary,
               english_ipa_acoustic_model, output_directory,
               '-t', temp_dir, '-c', basic_align_config, '-q', '--clean', '-d']
    args, unknown = parser.parse_known_args(command)
    run_align_corpus(args, unknown)

def test_align_multilingual_speaker_dict(multilingual_ipa_corpus_dir, ipa_speaker_dict_path, generated_dir, temp_dir,
                                         basic_align_config, english_acoustic_model, english_ipa_acoustic_model):
    """Run the `align` CLI on the multilingual IPA corpus using a per-speaker dictionary YAML."""
    output_directory = os.path.join(generated_dir, 'multilingual_speaker_dict')
    command = ['align', multilingual_ipa_corpus_dir, ipa_speaker_dict_path,
               english_ipa_acoustic_model, output_directory,
               '-t', temp_dir, '-c', basic_align_config, '-q', '--clean', '-d']
    args, unknown = parser.parse_known_args(command)
    run_align_corpus(args, unknown)

def test_align_stereo(stereo_corpus_dir, sick_dict_path, generated_dir, large_dataset_dictionary, temp_dir,
basic_align_config, english_acoustic_model):

Expand Down
Loading

0 comments on commit 75798c2

Please sign in to comment.