2.0rc3 (MontrealCorpusTools#393)

* Better WER calculation for compounds and clitics
* Fix subset bug
* Fix LM training bug and multispeaker OOV issue
* Fixed bug with training textgrid export
Aditya514 · Jan 20, 2022 · d5230fd · d5230fd
1 parent f87c044
commit d5230fd
Show file tree

Hide file tree

Showing 16 changed files with 95 additions and 69 deletions.
diff --git a/docs/source/changelog/changelog_2.0.rst b/docs/source/changelog/changelog_2.0.rst
@@ -10,6 +10,14 @@
 Beta releases
 =============
 
+2.0.0rc3
+--------
+- Fixed a bug where textgrids weren't being properly generated following training
+- Fixed a bug where commands were not always respecting ``--overwrite``
+- Fixed a bug where not all words in multispeaker dictionaries would be parsed
+- Improved transcription accuracy calculation to account for compounds and clitics
+- Fixed a crash when subsetting corpora that did not all have transcriptions
+
 2.0.0rc2
 --------
 - Added configuration parameter (``ignore_case=False``) to allow for disabling the default behavior of making all text and lexicon entries lower case

diff --git a/montreal_forced_aligner/abc.py b/montreal_forced_aligner/abc.py
@@ -690,9 +690,9 @@ def log_error(self, message: str) -> None:
         self.logger.error(message)
 
 
-class ModelExporterMixin(metaclass=abc.ABCMeta):
+class ExporterMixin(metaclass=abc.ABCMeta):
     """
-    Abstract mixin class for exporting MFA models
+    Abstract mixin class for exporting any kind of file
 
     Parameters
     ----------
@@ -704,6 +704,12 @@ def __init__(self, overwrite: bool = False, **kwargs):
         self.overwrite = overwrite
         super().__init__(**kwargs)
 
+
+class ModelExporterMixin(ExporterMixin, metaclass=abc.ABCMeta):
+    """
+    Abstract mixin class for exporting MFA models
+    """
+
     @property
     @abc.abstractmethod
     def meta(self) -> MetaDict:
@@ -723,19 +729,17 @@ def export_model(self, output_model_path: str) -> None:
         ...
 
 
-class FileExporterMixin(metaclass=abc.ABCMeta):
+class FileExporterMixin(ExporterMixin, metaclass=abc.ABCMeta):
     """
     Abstract mixin class for exporting TextGrid and text files
 
     Parameters
     ----------
-    overwrite: bool
-        Flag for whether to overwrite files if they already exist
-
+    cleanup_textgrids: bool
+        Flag for whether to clean up exported TextGrids
     """
 
-    def __init__(self, overwrite: bool = False, cleanup_textgrids: bool = True, **kwargs):
-        self.overwrite = overwrite
+    def __init__(self, cleanup_textgrids: bool = True, **kwargs):
         self.cleanup_textgrids = cleanup_textgrids
         super().__init__(**kwargs)
 

diff --git a/montreal_forced_aligner/acoustic_modeling/trainer.py b/montreal_forced_aligner/acoustic_modeling/trainer.py
@@ -308,6 +308,7 @@ def train(self, generate_final_alignments: bool = True) -> None:
                 previous.exported_model_path, self.working_directory
             )
             self.align()
+            self.collect_alignments()
 
     @property
     def num_utterances(self) -> int:
@@ -347,7 +348,6 @@ def align(self) -> None:
                     f"Analyzing alignment diagnostics for {self.current_aligner.identifier} on the full corpus"
                 )
             self.compile_information()
-            self.collect_alignments()
             with open(done_path, "w"):
                 pass
         except Exception as e:

diff --git a/montreal_forced_aligner/alignment/base.py b/montreal_forced_aligner/alignment/base.py
@@ -224,11 +224,13 @@ def export_textgrids(self) -> None:
             Job method for TextGrid export
         """
         begin = time.time()
-        self.logger.info("Exporting TextGrids...")
-        os.makedirs(self.export_output_directory, exist_ok=True)
-        if self.backup_output_directory:
-            os.makedirs(self.backup_output_directory, exist_ok=True)
+        export_directory = self.export_output_directory
+        if os.path.exists(export_directory) and not self.overwrite:
+            export_directory = self.backup_output_directory
+            self.log_debug(f"Not overwriting existing directory, exporting to {export_directory}")
 
+        self.logger.info(f"Exporting TextGrids to {export_directory}...")
+        os.makedirs(export_directory, exist_ok=True)
         export_errors = {}
         total_files = len(self.files)
         with tqdm.tqdm(total=total_files) as pbar:
@@ -255,9 +257,7 @@ def export_textgrids(self) -> None:
                 try:
                     for file in self.files:
                         tiers = file.aligned_data
-                        output_path = file.construct_output_path(
-                            self.export_output_directory, self.backup_output_directory
-                        )
+                        output_path = file.construct_output_path(export_directory)
                         duration = file.duration
                         for_write_queue.put((tiers, output_path, duration))
                         pbar.update(1)
@@ -276,24 +276,18 @@ def export_textgrids(self) -> None:
                 for file in self.files:
                     data = file.aligned_data
 
-                    backup_output_directory = None
-                    if not self.overwrite:
-                        backup_output_directory = self.backup_output_directory
-                        os.makedirs(backup_output_directory, exist_ok=True)
-                    output_path = file.construct_output_path(
-                        self.export_output_directory, backup_output_directory
-                    )
+                    output_path = file.construct_output_path(export_directory)
                     export_textgrid(data, output_path, file.duration, self.frame_shift)
                     pbar.update(1)
 
         if export_errors:
             self.logger.warning(
                 f"There were {len(export_errors)} errors encountered in generating TextGrids. "
-                f"Check the output_errors.txt file in {os.path.join(self.export_output_directory)} "
+                f"Check {os.path.join(export_directory, 'output_errors.txt')} "
                 f"for more details"
             )
-        output_textgrid_writing_errors(self.export_output_directory, export_errors)
-        self.logger.info("Finished exporting TextGrids!")
+        output_textgrid_writing_errors(export_directory, export_errors)
+        self.logger.info(f"Finished exporting TextGrids to {export_directory}!")
         self.logger.debug(f"Exported TextGrids in a total of {time.time() - begin} seconds")
 
     def export_files(self, output_directory: str) -> None:

diff --git a/montreal_forced_aligner/command_line/train_acoustic_model.py b/montreal_forced_aligner/command_line/train_acoustic_model.py
@@ -32,7 +32,6 @@ def train_acoustic_model(args: Namespace, unknown_args: Optional[List[str]] = No
         temporary_directory=args.temporary_directory,
         **TrainableAligner.parse_parameters(args.config_path, args, unknown_args),
     )
-
     try:
         generate_final_alignments = True
         if args.output_directory is None:

diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py
@@ -319,7 +319,7 @@ def create_subset(self, subset: int) -> None:
         if larger_subset_num < len(self.utterances):
             # Get all shorter utterances that are not one word long
             utts = sorted(
-                (utt for utt in self.utterances if " " in utt.text),
+                (utt for utt in self.utterances if utt.text and " " in utt.text),
                 key=lambda x: x.duration,
             )
             larger_subset = utts[:larger_subset_num]

diff --git a/montreal_forced_aligner/corpus/classes.py b/montreal_forced_aligner/corpus/classes.py
@@ -396,7 +396,6 @@ def __repr__(self) -> str:
     def save(
         self,
         output_directory: Optional[str] = None,
-        backup_output_directory: Optional[str] = None,
         text_type: Optional[TextFileType] = None,
         save_transcription: bool = False,
     ) -> None:
@@ -431,9 +430,7 @@ def save(
                 return
             elif utterance_count == 0:
                 return
-            output_path = self.construct_output_path(
-                output_directory, backup_output_directory, enforce_lab=True
-            )
+            output_path = self.construct_output_path(output_directory, enforce_lab=True)
             with open(output_path, "w", encoding="utf8") as f:
                 for u in self.utterances:
                     if save_transcription:
@@ -442,7 +439,7 @@ def save(
                         f.write(u.text)
             return
         elif text_type == TextFileType.TEXTGRID:
-            output_path = self.construct_output_path(output_directory, backup_output_directory)
+            output_path = self.construct_output_path(output_directory)
             max_time = self.duration
             tiers = {}
             for speaker in self.speaker_ordering:
@@ -556,7 +553,6 @@ def clean_up(self) -> None:
     def construct_output_path(
         self,
         output_directory: Optional[str] = None,
-        backup_output_directory: Optional[str] = None,
         enforce_lab: bool = False,
     ) -> str:
         """
@@ -566,8 +562,6 @@ def construct_output_path(
         ----------
         output_directory: str, optional
             Directory to output to, if None, it will overwrite the original file
-        backup_output_directory: str, optional
-            Backup directory to write to in order to avoid overwriting an existing file
         enforce_lab: bool
             Flag for whether to enforce generating a lab file over a TextGrid
 
@@ -589,8 +583,6 @@ def construct_output_path(
         else:
             relative = output_directory
         tg_path = os.path.join(relative, self._name + extension)
-        if backup_output_directory is not None and os.path.exists(tg_path):
-            tg_path = tg_path.replace(output_directory, backup_output_directory)
         os.makedirs(os.path.dirname(tg_path), exist_ok=True)
         return tg_path
 
@@ -1037,8 +1029,8 @@ def add_word_intervals(self, intervals: Union[CtmInterval, List[CtmInterval]]) -
             intervals = [intervals]
         if self.word_labels is None:
             self.word_labels = []
-        for interval in intervals:
-            if self.begin is not None:
+        if self.is_segment:
+            for interval in intervals:
                 interval.shift_times(self.begin)
         self.word_labels = intervals
 
@@ -1055,8 +1047,8 @@ def add_phone_intervals(self, intervals: Union[CtmInterval, List[CtmInterval]])
             intervals = [intervals]
         if self.phone_labels is None:
             self.phone_labels = []
-        for interval in intervals:
-            if self.begin is not None:
+        if self.is_segment:
+            for interval in intervals:
                 interval.shift_times(self.begin)
         self.phone_labels = intervals
 

diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py
@@ -338,6 +338,14 @@ def base_phones(self) -> Dict[str, Set[str]]:
 
         return base_phones
 
+    @property
+    def split_regex(self) -> re.Pattern:
+        """Pattern for splitting arbitrary text"""
+        markers = self.compound_markers
+        if "-" in markers:
+            markers = ["-"] + [x for x in self.compound_markers if x != "-"]
+        return re.compile(rf'[{"".join(markers)} ]')
+
     @property
     def extra_questions_mapping(self) -> Dict[str, List[str]]:
         """Mapping of extra questions for the given phone set type"""

diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py
@@ -141,14 +141,22 @@ def dictionary_setup(self):
         auto_set = {PhoneSetType.AUTO, PhoneSetType.UNKNOWN, "AUTO", "UNKNOWN"}
         if not isinstance(self.phone_set_type, PhoneSetType):
             self.phone_set_type = PhoneSetType[self.phone_set_type]
+
+        options = self.dictionary_options
+        pretrained = False
+        if self.non_silence_phones:
+            pretrained = True
+
         for speaker, dictionary in self.dictionary_model.load_dictionary_paths().items():
             self.speaker_mapping[speaker] = dictionary.name
             if dictionary.name not in self.dictionary_mapping:
+                if not pretrained:
+                    options["non_silence_phones"] = set()
                 self.dictionary_mapping[dictionary.name] = PronunciationDictionary(
                     dictionary_path=dictionary.path,
                     temporary_directory=self.dictionary_output_directory,
                     root_dictionary=self,
-                    **self.dictionary_options,
+                    **options,
                 )
                 if self.phone_set_type not in auto_set:
                     if (
@@ -161,15 +169,14 @@ def dictionary_setup(self):
                 else:
                     self.phone_set_type = self.dictionary_mapping[dictionary.name].phone_set_type
 
-                self.non_silence_phones.update(
-                    self.dictionary_mapping[dictionary.name].non_silence_phones
-                )
                 self.excluded_phones.update(
                     self.dictionary_mapping[dictionary.name].excluded_phones
                 )
                 self.excluded_pronunciation_count += self.dictionary_mapping[
                     dictionary.name
                 ].excluded_pronunciation_count
+        for dictionary in self.dictionary_mapping.values():
+            self.non_silence_phones.update(dictionary.non_silence_phones)
         for dictionary in self.dictionary_mapping.values():
             dictionary.non_silence_phones = self.non_silence_phones
 

diff --git a/montreal_forced_aligner/language_modeling/trainer.py b/montreal_forced_aligner/language_modeling/trainer.py
@@ -331,7 +331,7 @@ def normalized_text_iter(self, min_count: int = 1) -> Generator:
         unk_words = {k for k, v in self.word_counts.items() if v <= min_count}
         for u in self.utterances:
             normalized = u.normalized_text
-            if normalized:
+            if not normalized:
                 normalized = u.text.split()
             yield " ".join(x if x not in unk_words else self.oov_word for x in normalized)
 

diff --git a/montreal_forced_aligner/segmenter.py b/montreal_forced_aligner/segmenter.py
@@ -425,9 +425,8 @@ def export_files(self, output_directory: str) -> None:
         output_directory: str
             Directory to save segmentation TextGrids
         """
-        backup_output_directory = None
-        if not self.overwrite:
-            backup_output_directory = os.path.join(self.working_directory, "transcriptions")
-            os.makedirs(backup_output_directory, exist_ok=True)
+        if not self.overwrite and os.path.exists(output_directory):
+            output_directory = os.path.join(self.working_directory, "transcriptions")
+        os.makedirs(output_directory, exist_ok=True)
         for f in self.files:
-            f.save(output_directory, backup_output_directory, text_type=TextFileType.TEXTGRID)
+            f.save(output_directory, text_type=TextFileType.TEXTGRID)
diff --git a/montreal_forced_aligner/speaker_classifier.py b/montreal_forced_aligner/speaker_classifier.py
@@ -178,10 +178,9 @@ def export_files(self, output_directory: str) -> None:
         output_directory: str
             Output directory to save files
         """
-        backup_output_directory = None
-        if not self.overwrite:
-            backup_output_directory = os.path.join(self.working_directory, "output")
-            os.makedirs(backup_output_directory, exist_ok=True)
+        if not self.overwrite and os.path.exists(output_directory):
+            output_directory = os.path.join(self.working_directory, "transcriptions")
+        os.makedirs(output_directory, exist_ok=True)
 
         for file in self.files:
-            file.save(output_directory, backup_output_directory)
+            file.save(output_directory)
diff --git a/montreal_forced_aligner/textgrid.py b/montreal_forced_aligner/textgrid.py
@@ -155,10 +155,12 @@ def export_textgrid(
         phone_tier = tgio.IntervalTier(phone_tier_name, [], minT=0, maxT=duration)
         tg.addTier(word_tier)
         tg.addTier(phone_tier)
-
+    has_data = False
     for speaker, data in speaker_data.items():
         words = data["words"]
         phones = data["phones"]
+        if len(words) and len(phones):
+            has_data = True
         tg_words = []
         tg_phones = []
         for w in words:
@@ -180,5 +182,7 @@ def export_textgrid(
         phone_tier = tgio.IntervalTier(phone_tier_name, tg_phones, minT=0, maxT=duration)
         tg.replaceTier(word_tier_name, word_tier)
         tg.replaceTier(phone_tier_name, phone_tier)
-
-    tg.save(output_path, includeBlankSpaces=True, format="long_textgrid", reportingMode="error")
+    if has_data:
+        tg.save(
+            output_path, includeBlankSpaces=True, format="long_textgrid", reportingMode="error"
+        )