Added more test coverage

MontrealCorpusTools · mmcauliffe · Oct 1, 2021 · Sep 28, 2021 · Sep 28, 2021 · Sep 28, 2021
commit 62005f81961f53ed95cb2b249d37488d4f2e02fc
diff --git a/montreal_forced_aligner/command_line/transcribe.py b/montreal_forced_aligner/command_line/transcribe.py
@@ -33,6 +33,7 @@ def transcribe_corpus(args, unknown_args):
     else:
         transcribe_config = load_basic_transcribe()
     transcribe_config.use_mp = not args.disable_mp
+    transcribe_config.overwrite = args.overwrite
     if unknown_args:
         transcribe_config.update_from_args(unknown_args)
     data_directory = os.path.join(temp_dir, corpus_name)
@@ -155,7 +156,7 @@ def transcribe_corpus(args, unknown_args):
             handler.close()
             logger.removeHandler(handler)
         if os.path.exists(data_directory):
-            with open(conf_path, 'w') as f:
+            with open(conf_path, 'w', encoding='utf8') as f:
                 yaml.dump(conf, f)
 
 

diff --git a/montreal_forced_aligner/config/transcribe_config.py b/montreal_forced_aligner/config/transcribe_config.py
@@ -32,6 +32,7 @@ def __init__(self, feature_config):
         self.compound_markers = DEFAULT_COMPOUND_MARKERS
         self.strip_diacritics = DEFAULT_STRIP_DIACRITICS
         self.digraphs = DEFAULT_DIGRAPHS
+        self.overwrite = False
 
     def params(self):
         return {

diff --git a/montreal_forced_aligner/corpus/align_corpus.py b/montreal_forced_aligner/corpus/align_corpus.py
@@ -168,12 +168,10 @@ def _load_from_temp(self):
                 self.file_utt_mapping[file] = [utts]
         self.text_mapping = load_scp(text_path)
         for utt, text in self.text_mapping.items():
-            if not isinstance(text, list):
-                text = [text]
+            text = text.split()
             for w in text:
                 new_w = re.split(r"[-']", w)
                 self.word_counts.update(new_w + [w])
-            self.text_mapping[utt] = ' '.join(text)
         self.utt_wav_mapping = load_scp(wav_path)
         self.sox_strings = load_scp(sox_strings_path)
         self.wav_info = load_scp(wav_info_path, float)

diff --git a/montreal_forced_aligner/helper.py b/montreal_forced_aligner/helper.py
@@ -38,14 +38,22 @@ def make_safe(element):
         return ' '.join(map(make_safe, element))
     return str(element)
 
+def make_scp_safe(string):
+    """Return str(string) with each space replaced by '_MFASPACE_' so it stays a single whitespace-delimited token; non-str input (e.g. int/float values) is coerced first instead of raising AttributeError."""
+    return str(string).replace(' ', '_MFASPACE_')
+
+def load_scp_safe(string):  # Inverse of make_scp_safe: turn each '_MFASPACE_' placeholder back into a space.
+    return string.replace('_MFASPACE_', ' ')
 
 def output_mapping(mapping, path):
     with open(path, 'w', encoding='utf8') as f:
         for k in sorted(mapping.keys()):
             v = mapping[k]
             if isinstance(v, (list, set, tuple)):
                 v = ' '.join(map(str, v))
-            f.write('{} {}\n'.format(k, v))
+            else:  # scalar value: escape its spaces so it is written as one token (collections above are space-joined as-is)
+                v = make_scp_safe(v)
+            f.write(f'{make_scp_safe(k)} {v}\n')  # the key is escaped too; a single space separates key from value
 
 
 def save_scp(scp, path, sort=True, multiline=False):
@@ -100,9 +108,11 @@ def load_scp(path, data_type=str):
             if line == '':
                 continue
             line_list = line.split()
-            key = line_list.pop(0)
+            key = load_scp_safe(line_list.pop(0))
             if len(line_list) == 1:
                 value = data_type(line_list[0])
+                if isinstance(value, str):
+                    value = load_scp_safe(value)
             else:
                 value = [ data_type(x) for x in line_list if x not in ['[', ']']]
             scp[key] = value
@@ -178,7 +188,6 @@ def setup_logger(identifier, output_directory, console_level='info'):
     log_path = os.path.join(output_directory, identifier + '.log')
     if os.path.exists(log_path):
         os.remove(log_path)
-    print(log_path)
     logger = logging.getLogger(identifier)
     logger.setLevel(logging.DEBUG)
 

diff --git a/montreal_forced_aligner/multiprocessing/transcription.py b/montreal_forced_aligner/multiprocessing/transcription.py
@@ -461,7 +461,7 @@ def final_fmllr_est_func(model_directory, split_directory, sil_phones, job_name,
                                                     stderr=log_file, stdout=subprocess.PIPE)
             latt_post_proc = subprocess.Popen([thirdparty_binary('lattice-to-post'),
                                                '--acoustic-scale={}'.format(config.acoustic_scale),
-                                               'ark:' + lat_path, 'ark:-'],
+                                               'ark:-', 'ark:-'],
                                               stdin=determinize_proc.stdout, stdout=subprocess.PIPE, stderr=log_file)
             weight_silence_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'),
                                                     str(config.silence_weight),
@@ -471,14 +471,13 @@ def final_fmllr_est_func(model_directory, split_directory, sil_phones, job_name,
             fmllr_proc = subprocess.Popen([thirdparty_binary('gmm-est-fmllr'),
                                            '--fmllr-update-type={}'.format(config.fmllr_update_type),
                                            '--spk2utt=ark:' + spk2utt_path, mdl, feat_string,
-                                           'ark,s,cs:-', 'ark:' + trans_tmp_path],
+                                           'ark,s,cs:-', 'ark:-'],
                                           stdin=weight_silence_proc.stdout, stdout=subprocess.PIPE, stderr=log_file)
-            fmllr_proc.communicate()
 
             compose_proc = subprocess.Popen([thirdparty_binary('compose-transforms'),
-                                             '--b-is-affine=true', 'ark:' + trans_tmp_path,
+                                             '--b-is-affine=true', 'ark:-',
                                              'ark:' + pre_trans_path, 'ark:' + trans_path],
-                                            stderr=log_file)
+                                            stderr=log_file, stdin=fmllr_proc.stdout)
             compose_proc.communicate()
     else:
         for name in dictionary_names:
@@ -510,7 +509,7 @@ def final_fmllr_est_func(model_directory, split_directory, sil_phones, job_name,
                                                         stderr=log_file, stdout=subprocess.PIPE)
                 latt_post_proc = subprocess.Popen([thirdparty_binary('lattice-to-post'),
                                                    '--acoustic-scale={}'.format(config.acoustic_scale),
-                                                   'ark:' + lat_path, 'ark:-'],
+                                                   'ark:-', 'ark:-'],
                                                   stdin=determinize_proc.stdout, stdout=subprocess.PIPE,
                                                   stderr=log_file)
                 weight_silence_proc = subprocess.Popen([thirdparty_binary('weight-silence-post'),
@@ -521,14 +520,13 @@ def final_fmllr_est_func(model_directory, split_directory, sil_phones, job_name,
                 fmllr_proc = subprocess.Popen([thirdparty_binary('gmm-est-fmllr'),
                                                '--fmllr-update-type={}'.format(config.fmllr_update_type),
                                                '--spk2utt=ark:' + spk2utt_path, mdl, dictionary_feat_string,
-                                               'ark,s,cs:-', 'ark:' + trans_tmp_path],
+                                               'ark,s,cs:-', 'ark:-'],
                                               stdin=weight_silence_proc.stdout, stdout=subprocess.PIPE, stderr=log_file)
-                fmllr_proc.communicate()
 
                 compose_proc = subprocess.Popen([thirdparty_binary('compose-transforms'),
-                                                 '--b-is-affine=true', 'ark:' + trans_tmp_path,
+                                                 '--b-is-affine=true', 'ark:-',
                                                  'ark:' + pre_trans_path, 'ark:' + trans_path],
-                                                stderr=log_file)
+                                                stderr=log_file, stdin=fmllr_proc.stdout)
                 compose_proc.communicate()
 
 

diff --git a/montreal_forced_aligner/textgrid.py b/montreal_forced_aligner/textgrid.py
@@ -173,6 +173,11 @@ def ctms_to_textgrids_non_mp(align_config, output_directory, model_directory, di
         words = dictionary.words
         speaker_mapping = None
 
+    backup_output_directory = None
+    if not align_config.overwrite:
+        backup_output_directory = os.path.join(model_directory, 'textgrids')
+        os.makedirs(backup_output_directory, exist_ok=True)
+
     def process_current_word_labels(utterance_id):
         if utterance_id in corpus.segments:
             seg = corpus.segments[utterance_id]
@@ -223,8 +228,8 @@ def process_current_phone_labels(utterance_id):
         phone_data[file_name][speaker].extend(actual_labels)
 
     export_errors = {}
+    wav_durations = corpus.file_durations
     for i in range(num_jobs):
-        wav_durations = {}
         word_data = {}
         phone_data = {}
         corpus.logger.debug(f'Parsing ctms for job {i}...')
@@ -313,9 +318,11 @@ def process_current_phone_labels(utterance_id):
                 ctm_to_textgrid(file_name, word_ctm, phone_ctm, output_directory, dictionary.silences, wav_durations, dictionary.multilingual_ipa,
                      frame_shift, words_mapping, speaker_mapping,
                      dictionary.punctuation, dictionary.clitic_set, dictionary.clitic_markers, dictionary.compound_markers, dictionary.oov_code, words,
-                     dictionary.strip_diacritics, corpus.file_directory_mapping, corpus.file_name_mapping, corpus.speaker_ordering, overwrite)
+                     dictionary.strip_diacritics, corpus.file_directory_mapping, corpus.file_name_mapping, corpus.speaker_ordering, overwrite, backup_output_directory)
                 processed_files.add(file_name)
             except Exception as e:
+                if align_config.debug:
+                    raise
                 exc_type, exc_value, exc_traceback = sys.exc_info()
                 export_errors[file_name] = '\n'.join(
                     traceback.format_exception(exc_type, exc_value, exc_traceback))

diff --git a/montreal_forced_aligner/transcriber.py b/montreal_forced_aligner/transcriber.py
@@ -9,6 +9,7 @@
 from .helper import thirdparty_binary
 from .multiprocessing import transcribe, transcribe_fmllr
 from .corpus import AlignableCorpus
+from .textgrid import construct_output_path
 from .dictionary import MultispeakerDictionary
 from .helper import score, log_kaldi_errors, parse_logs
 from .exceptions import KaldiProcessingError
@@ -467,7 +468,7 @@ def _load_transcripts(self, input_directory=None):
         lookup = self.dictionary.reversed_word_mapping
         if input_directory is None:
             input_directory = self.transcribe_directory
-            if self.transcribe_config.fmllr and not self.transcribe_config.no_speakers:
+            if self.acoustic_model.feature_config.fmllr and self.transcribe_config.fmllr and not self.transcribe_config.no_speakers:
                 input_directory = os.path.join(input_directory, 'fmllr')
         for j in range(self.corpus.num_jobs):
             tra_path = os.path.join(input_directory, 'tra.{}'.format(j))
@@ -504,40 +505,39 @@ def _load_transcripts(self, input_directory=None):
         return transcripts
 
     def export_transcriptions(self, output_directory, source=None):
+        backup_output_directory = None
+        if not self.transcribe_config.overwrite:
+            backup_output_directory = os.path.join(self.transcribe_directory, 'transcriptions')
+            os.makedirs(backup_output_directory, exist_ok=True)
         transcripts = self._load_transcripts(source)
+        wav_durations = self.corpus.file_durations
         if not self.corpus.segments:
             for utt, t in transcripts.items():
-                relative = self.corpus.file_directory_mapping[utt]
-                if relative:
-                    speaker_directory = os.path.join(output_directory, relative)
-                else:
-                    speaker_directory = output_directory
-                os.makedirs(speaker_directory, exist_ok=True)
-                outpath = os.path.join(speaker_directory, utt + '.lab')
-                with open(outpath, 'w', encoding='utf8') as f:
+                speaker = self.corpus.utt_speak_mapping[utt]
+                output_name, output_path = construct_output_path(utt, output_directory, self.corpus.file_directory_mapping,
+                                                                 self.corpus.file_name_mapping,
+                                                                 speaker, backup_output_directory)
+                output_path = output_path.replace('.TextGrid', '.lab')
+                with open(output_path, 'w', encoding='utf8') as f:
                     f.write(t)
 
         else:
 
             for filename in self.corpus.file_directory_mapping.keys():
-                maxtime = self.corpus.get_wav_duration(filename)
-                speaker_directory = output_directory
-                try:
-                    if self.corpus.file_directory_mapping[filename]:
-                        speaker_directory = os.path.join(output_directory, self.corpus.file_directory_mapping[filename])
-                except KeyError:
-                    pass
-                os.makedirs(speaker_directory, exist_ok=True)
+                output_name, output_path = construct_output_path(filename, output_directory, self.corpus.file_directory_mapping,
+                                                                 self.corpus.file_name_mapping,
+                                                                 backup_output_directory=backup_output_directory)
+                max_time = round(wav_durations[output_name], 4)
                 tiers = {}
                 if self.transcribe_config.no_speakers:
                     speaker = 'speech'
-                    tiers[speaker] = textgrid.IntervalTier(speaker, [], minT=0, maxT=maxtime)
+                    tiers[speaker] = textgrid.IntervalTier(speaker, [], minT=0, maxT=max_time)
                 else:
                     for speaker in self.corpus.speaker_ordering[filename]:
-                        tiers[speaker] = textgrid.IntervalTier(speaker, [], minT=0, maxT=maxtime)
+                        tiers[speaker] = textgrid.IntervalTier(speaker, [], minT=0, maxT=max_time)
 
                 tg = textgrid.Textgrid()
-                tg.maxTimestamp = maxtime
+                tg.maxTimestamp = max_time
                 for utt_name, text in transcripts.items():
                     seg = self.corpus.segments[utt_name]
                     utt_filename, begin, end = seg['file_name'], seg['begin'], seg['end']
@@ -552,5 +552,5 @@ def export_transcriptions(self, output_directory, source=None):
                     tiers[speaker].entryList.append(Interval(start=begin, end=end, label=text))
                 for t in tiers.values():
                     tg.addTier(t)
-                tg.save(os.path.join(speaker_directory, filename + '.TextGrid'),
+                tg.save(output_path,
                         includeBlankSpaces=True, format='long_textgrid')
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -193,7 +193,10 @@ def basic_split_dir(corpus_root_dir, wav_dir, lab_dir, textgrid_dir):
 def multilingual_ipa_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
     path = os.path.join(corpus_root_dir, 'multilingual')
     os.makedirs(path, exist_ok=True)
-    names = [('speaker', ['multilingual_ipa']), ('speaker_two', ['multilingual_ipa_us']) ]
+    names = [('speaker', ['multilingual_ipa','multilingual_ipa_2','multilingual_ipa_3',
+                          'multilingual_ipa_4','multilingual_ipa_5',]),
+             ('speaker_two', ['multilingual_ipa_us','multilingual_ipa_us_2','multilingual_ipa_us_3',
+                          'multilingual_ipa_us_4','multilingual_ipa_us_5']) ]
     for s, files in names:
         s_dir = os.path.join(path, s)
         os.makedirs(s_dir, exist_ok=True)
@@ -205,7 +208,7 @@ def multilingual_ipa_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
 
 @pytest.fixture(scope='session')
 def multilingual_ipa_tg_corpus_dir(corpus_root_dir, wav_dir, textgrid_dir):
-    path = os.path.join(corpus_root_dir, 'multilingual')
+    path = os.path.join(corpus_root_dir, 'multilingual_tg')
     os.makedirs(path, exist_ok=True)
     names = [('speaker', ['multilingual_ipa']), ('speaker_two', ['multilingual_ipa_us']) ]
     for s, files in names:

diff --git a/tests/data/configs/transcribe.yaml b/tests/data/configs/transcribe.yaml
@@ -1,4 +1,4 @@
 use_mp: false
-fmllr: false
+fmllr: true
 
 
diff --git a/tests/data/lab/multilingual_ipa_2.txt b/tests/data/lab/multilingual_ipa_2.txt
@@ -0,0 +1 @@
+welcome to a series of platchat videos where we're gonna tackle every single team in the overwatch league twenty twenty
diff --git a/tests/data/lab/multilingual_ipa_3.txt b/tests/data/lab/multilingual_ipa_3.txt
@@ -0,0 +1 @@
+and run you through
diff --git a/tests/data/lab/multilingual_ipa_4.txt b/tests/data/lab/multilingual_ipa_4.txt
@@ -0,0 +1 @@
+kinda our fears and also predictions for them
diff --git a/tests/data/lab/multilingual_ipa_5.txt b/tests/data/lab/multilingual_ipa_5.txt
@@ -0,0 +1 @@
+i'm sideshow joined by custa and reinforce we've got a special edition of platchat
diff --git a/tests/data/lab/multilingual_ipa_us_2.txt b/tests/data/lab/multilingual_ipa_us_2.txt
@@ -0,0 +1 @@
+hey josh could have finished it he just decided to fail it instead
diff --git a/tests/data/lab/multilingual_ipa_us_3.txt b/tests/data/lab/multilingual_ipa_us_3.txt
@@ -0,0 +1 @@
+really good performances against top teams that have ended up going their way
diff --git a/tests/data/lab/multilingual_ipa_us_4.txt b/tests/data/lab/multilingual_ipa_us_4.txt
@@ -0,0 +1 @@
+uh i i still think it's a very good team though in na i think this is uh
diff --git a/tests/data/lab/multilingual_ipa_us_5.txt b/tests/data/lab/multilingual_ipa_us_5.txt
@@ -0,0 +1 @@
+uh and this was the first time i think the justice really looked like an elite team
diff --git a/tests/data/textgrid/multilingual_ipa_2.TextGrid b/tests/data/textgrid/multilingual_ipa_2.TextGrid
@@ -0,0 +1,31 @@
+File type = "ooTextFile"
+Object class = "TextGrid"
+
+0
+6.2271
+<exists>
+3
+"IntervalTier"
+"custa"
+0
+6.2271
+1
+0
+6.2271
+""
+"IntervalTier"
+"reinforce"
+0
+6.2271
+1
+0
+6.2271
+""
+"IntervalTier"
+"sideshow"
+0
+6.2271
+1
+0
+6.2271
+"welcome to a series of platchat videos where we're gonna tackle every single team in the overwatch league twenty twenty"
diff --git a/tests/data/textgrid/multilingual_ipa_3.TextGrid b/tests/data/textgrid/multilingual_ipa_3.TextGrid
@@ -0,0 +1,31 @@
+File type = "ooTextFile"
+Object class = "TextGrid"
+
+0
+1.3062999999999994
+<exists>
+3
+"IntervalTier"
+"custa"
+0
+1.3062999999999994
+1
+0
+1.3062999999999994
+""
+"IntervalTier"
+"reinforce"
+0
+1.3062999999999994
+1
+0
+1.3062999999999994
+""
+"IntervalTier"
+"sideshow"
+0
+1.3062999999999994
+1
+0
+1.3062999999999994
+"and run you through"