Bug fixes (MontrealCorpusTools#480)

Aditya514 · Jul 5, 2022 · 3315d9e · 3315d9e
1 parent 714fc16
commit 3315d9e
Show file tree

Hide file tree

Showing 43 changed files with 582 additions and 519 deletions.
diff --git a/docs/source/changelog/changelog_2.0.rst b/docs/source/changelog/changelog_2.0.rst
@@ -5,13 +5,27 @@
 2.0 Changelog
 *************
 
+2.0.5
+=====
+
+- Standardized :ref:`dictionary_format` to require tab delimitation between orthography, pronunciations, and any probabilities in the dictionary :github_issue:`478`
+- Fixed a bug in pronunciation probability estimation when silence words are explicitly transcribed :github_issue:`476`
+- Fixed an optimization bug introduced when fixing sparse job/subset combos
+
+2.0.4
+=====
+
+- Bug fix for phonetisaurus training error in 2.0.2
+
 2.0.2
 =====
 
 - Optimized Phonetisaurus training regime for phone and grapheme orders greater than 1
 - Fixed a bug in parsing dictionaries that included whitespace as part of the word
 - Fixed a bug in Phonetisaurus generation where insertions and deletions were not being properly generated
 - Changed the default alignment separator for Phonetisaurus to ``;`` instead of ``}`` (shouldn't conflict with most phone sets) and added extra validation to ensure special symbols are not present in the dictionary
+- Fixed a bug where a trained phonetisaurus model was not properly using its grapheme order
+- Fixed a bug when saving a phonetisaurus model after evaluating it
 
 2.0.1
 =====

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -270,9 +270,7 @@
 nitpick_ignore = [
     ("py:class", "optional"),
     ("py:class", "callable"),
-    ("py:class", "ReversedMappingType"),
     ("py:class", "WordsType"),
-    ("py:class", "MappingType"),
     ("py:class", "TextIO"),
     ("py:class", "SegmentationType"),
     ("py:class", "CtmErrorDict"),

diff --git a/docs/source/user_guide/dictionary.rst b/docs/source/user_guide/dictionary.rst
@@ -49,10 +49,10 @@ bound clitic and the stem.  Thus given a dictionary like:
 ::
 
    c'est S E
-   c S E
-   c' S
+   c     S E
+   c'    S
    etait E T E
-   un A N
+   un    A N
 
 And two example orthographic transcriptions:
 
@@ -96,16 +96,19 @@ The default behavior of the aligner to is to clean up these internal splits and
 Non-probabilistic format
 ========================
 
-Dictionaries should be specified in the following format:
+Dictionaries should be specified as a two column tab separated file:
 
 ::
 
-  WORDA PHONEA PHONEB
-  WORDA PHONEC
-  WORDB PHONEB PHONEC
+  WORDA  PHONEA PHONEB
+  WORDA  PHONEC
+  WORDB  PHONEB PHONEC
 
-where each line is a word with a transcription separated by white space.
-Each phone in the transcription should be separated by white space as well.
+Each line has a word and a transcription separated by a tab. Phones within the transcription should be separated from each other by non-tab white space (e.g., a single space).
+
+.. note::
+
+   Prior to 2.0.5, words and their transcriptions could be separated by any white space, not just a tab. However, given the complexities of parsing :ref:`probabilistic lexicons <probabilistic_dictionaries>` with transcription systems like X-SAMPA that use numbers as symbols, we have decided to standardize on tab as the column delimiter.
 
 A dictionary for English that has good coverage is the lexicon derived
 from the LibriSpeech corpus (`LibriSpeech lexicon`_).
@@ -120,6 +123,8 @@ and one for Quebec French (`Prosodylab-aligner French dictionary`_), also see :x
    See the page on :ref:`g2p_dictionary_generating` for how to use G2P models to generate a dictionary
    from our pretrained models or how to generate pronunciation dictionaries from orthographies.
 
+.. _probabilistic_dictionaries:
+
 Dictionaries with pronunciation probability
 ===========================================
 
@@ -129,10 +134,11 @@ The format for this dictionary format is:
 
 ::
 
-  WORDA 1.0 PHONEA PHONEB
-  WORDA 0.3 PHONEC
-  WORDB 1.0 PHONEB PHONEC
+  WORDA  1.0   PHONEA PHONEB
+  WORDA  0.3   PHONEC
+  WORDB  1.0   PHONEB PHONEC
 
+The three columns should be separated by tabs, with the first column corresponding to the orthographic form, the second to the pronunciation probability (between 0.01 and 1.0), and the final column to the space-delimited pronunciation.
 
 .. note::
 
@@ -149,11 +155,11 @@ The format for this dictionary format is:
 
 ::
 
-  the	0.16	0.08	2.17	1.13	d i
-  the	0.99	0.04	2.14	1.15	d ə
-  the	0.01	0.14	2.48	1.18	ð i
-  the	0.02	0.12	1.87	1.23	ð ə
-  the	0.11	0.15	2.99	1.15	ə
+  the    0.16	   0.08	   2.17	   1.13	   d i
+  the    0.99	   0.04	   2.14	   1.15	   d ə
+  the	   0.01	   0.14	   2.48	   1.18	   ð i
+  the	   0.02	   0.12	   1.87	   1.23	   ð ə
+  the	   0.11	   0.15	   2.99	   1.15	   ə
 
 The first float column is the probability of the pronunciation, the next float is the probability of silence following the pronunciation, and the final two floats are correction terms for preceding silence and non-silence. Given that each entry in a dictionary is independent and there is no way to encode information about the preceding context, the correction terms are calculated as how much more common silence or non-silence was than we would expect after factoring out the likelihood of silence from the previous word. More details are found in :kaldi_steps:`get_prons` and the `related paper <https://www.danielpovey.com/files/2015_interspeech_silprob.pdf>`_.
 
@@ -171,8 +177,8 @@ to align annotations like laughter, coughing, etc.
 
 ::
 
-  {LG} spn
-  {SL} sil
+  {LG}   spn
+  {SL}   sil
 
 
 .. _speaker_dictionaries:

diff --git a/montreal_forced_aligner/abc.py b/montreal_forced_aligner/abc.py
@@ -13,6 +13,7 @@
 import sys
 import time
 import traceback
+import typing
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -66,7 +67,7 @@ def __init__(self, args: MfaArguments):
         self.job_name = self.args.job_name
         self.log_path = self.args.log_path
 
-    def run(self):
+    def run(self) -> typing.Generator:
         """Run the function, calls :meth:`~KaldiFunction._run` with error handling"""
         try:
             yield from self._run()
@@ -75,7 +76,7 @@ def run(self):
             error_text = "\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))
             raise MultiprocessingError(self.job_name, error_text)
 
-    def _run(self):
+    def _run(self) -> None:
         """Internal logic for running the worker"""
         pass
 
@@ -579,7 +580,7 @@ def workflow_identifier(self) -> str:
         ...
 
     @property
-    def worker_config_path(self):
+    def worker_config_path(self) -> str:
         """Path to worker's configuration in the working directory"""
         return os.path.join(self.output_directory, f"{self.workflow_identifier}.yaml")
 
@@ -612,7 +613,7 @@ def cleanup(self) -> None:
         except (NameError, ValueError):  # already cleaned up
             pass
 
-    def save_worker_config(self):
+    def save_worker_config(self) -> None:
         """Export worker configuration to its working directory"""
         with open(self.worker_config_path, "w") as f:
             yaml.dump(self.configuration, f)
@@ -704,11 +705,11 @@ def workflow_directory(self) -> str:
         return os.path.join(self.output_directory, self.workflow_identifier)
 
     @property
-    def log_file(self):
+    def log_file(self) -> str:
         """Path to the worker's log file"""
         return os.path.join(self.output_directory, f"{self.workflow_identifier}.log")
 
-    def setup_logger(self):
+    def setup_logger(self) -> None:
         """
         Construct a logger for a command line run
         """

diff --git a/montreal_forced_aligner/acoustic_modeling/base.py b/montreal_forced_aligner/acoustic_modeling/base.py
@@ -114,7 +114,7 @@ def __init__(
         self.final_gaussian_iteration = 0  # Gets set later
 
     @property
-    def db_path(self):
+    def db_path(self) -> str:
         """Root worker's path to database file"""
         return self.worker.db_path
 
@@ -251,23 +251,9 @@ def num_current_utterances(self) -> int:
 
     def initialize_training(self) -> None:
         """Initialize training"""
-        self.compute_calculated_properties()
-        self.current_gaussians = self.initial_gaussians
         begin = time.time()
         dirty_path = os.path.join(self.working_directory, "dirty")
         done_path = os.path.join(self.working_directory, "done")
-        if os.path.exists(dirty_path):  # if there was an error, let's redo from scratch
-            shutil.rmtree(self.working_directory)
-        os.makedirs(self.working_log_directory, exist_ok=True)
-        if os.path.exists(done_path) or any(
-            x.endswith(".mdl") for x in os.listdir(self.working_directory)
-        ):
-            self.log_info(
-                f"{self.identifier} training already initialized, skipping initialization."
-            )
-            if os.path.exists(done_path):
-                self.training_complete = True
-            return
         self.log_info(f"Initializing training for {self.identifier}...")
         if self.subset and self.subset >= self.worker.num_utterances:
             self.log_warning(
@@ -278,7 +264,6 @@ def initialize_training(self) -> None:
             self.worker.current_subset = 0
         try:
             self._trainer_initialization()
-            parse_logs(self.working_log_directory)
         except Exception as e:
             with open(dirty_path, "w"):
                 pass
@@ -290,6 +275,18 @@ def initialize_training(self) -> None:
             raise
         self.iteration = 1
         self.worker.current_trainer = self
+        self.compute_calculated_properties()
+        self.current_gaussians = self.initial_gaussians
+        if self.initialized:
+            self.log_info(
+                f"{self.identifier} training already initialized, skipping initialization."
+            )
+            if os.path.exists(done_path):
+                self.training_complete = True
+            return
+        if os.path.exists(dirty_path):  # if there was an error, let's redo from scratch
+            shutil.rmtree(self.working_directory)
+        os.makedirs(self.working_log_directory, exist_ok=True)
         self.log_info("Initialization complete!")
         self.log_debug(f"Initialization for {self.identifier} took {time.time() - begin} seconds")
 
@@ -331,14 +328,14 @@ def alignment_model_path(self) -> str:
         return self.model_path
 
     @property
-    def next_model_path(self):
+    def next_model_path(self) -> str:
         """Next iteration's acoustic model path"""
         if self.training_complete:
             return os.path.join(self.working_directory, "final.mdl")
         return os.path.join(self.working_directory, f"{self.iteration + 1}.mdl")
 
     @property
-    def next_occs_path(self):
+    def next_occs_path(self) -> str:
         """Next iteration's occs file path"""
         if self.training_complete:
             return os.path.join(self.working_directory, "final.occs")
@@ -349,11 +346,11 @@ def compute_calculated_properties(self) -> None:
         """Compute any calculated properties such as alignment iterations"""
         ...
 
-    def increment_gaussians(self):
+    def increment_gaussians(self) -> None:
         """Increment the current number of gaussians"""
         self.current_gaussians += self.gaussian_increment
 
-    def acc_stats(self):
+    def acc_stats(self) -> None:
         """
         Multiprocessing function that accumulates stats for GMM training.
 
@@ -485,7 +482,7 @@ def acc_stats(self):
     def align_iteration(self) -> None:
         """Run alignment for a training iteration"""
         begin = time.time()
-        self.align_utterances()
+        self.align_utterances(training=True)
         self.log_debug(
             f"Generating alignments for iteration {self.iteration} took {time.time()-begin} seconds"
         )
@@ -496,10 +493,20 @@ def align_iteration(self) -> None:
             f"Analyzing iteration {self.iteration} alignments took {time.time()-begin} seconds"
         )
 
+    @property
+    def initialized(self) -> bool:
+        return (
+            os.path.exists(os.path.join(self.working_directory, "1.mdl"))
+            or os.path.exists(os.path.join(self.working_directory, "final.mdl"))
+            or os.path.exists(os.path.join(self.working_directory, "done"))
+        )
+
     def train_iteration(self) -> None:
         """Perform an iteration of training"""
         if os.path.exists(self.next_model_path):
             self.iteration += 1
+            if self.iteration <= self.final_gaussian_iteration:
+                self.increment_gaussians()
             return
         if self.iteration in self.realignment_iterations:
             self.align_iteration()
@@ -521,6 +528,7 @@ def train(self) -> None:
         """
         done_path = os.path.join(self.working_directory, "done")
         dirty_path = os.path.join(self.working_directory, "dirty")
+        os.makedirs(self.working_log_directory, exist_ok=True)
         try:
             self.initialize_training()
             if self.training_complete:

diff --git a/montreal_forced_aligner/acoustic_modeling/lda.py b/montreal_forced_aligner/acoustic_modeling/lda.py
@@ -6,6 +6,7 @@
 import re
 import shutil
 import subprocess
+import typing
 from queue import Empty
 from typing import TYPE_CHECKING, Dict, List
 
@@ -90,7 +91,7 @@ def __init__(self, args: LdaAccStatsArguments):
         self.acc_paths = args.acc_paths
         self.lda_options = args.lda_options
 
-    def run(self):
+    def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
         """Run the function"""
         with open(self.log_path, "w", encoding="utf8") as log_file:
             for dict_id in self.dictionaries:
@@ -173,7 +174,7 @@ def __init__(self, args: CalcLdaMlltArguments):
         self.macc_paths = args.macc_paths
         self.lda_options = args.lda_options
 
-    def run(self):
+    def _run(self) -> typing.Generator[int]:
         """Run the function"""
         # Estimating MLLT
         with open(self.log_path, "w", encoding="utf8") as log_file:
@@ -447,6 +448,8 @@ def _trainer_initialization(self) -> None:
         """Initialize LDA training"""
         self.uses_splices = True
         self.worker.uses_splices = True
+        if self.initialized:
+            return
         self.lda_acc_stats()
         self.tree_stats()
         self._setup_tree(initial_mix_up=False)

diff --git a/montreal_forced_aligner/acoustic_modeling/monophone.py b/montreal_forced_aligner/acoustic_modeling/monophone.py
@@ -68,7 +68,7 @@ def __init__(self, args: MonoAlignEqualArguments):
         self.ali_ark_paths = args.ali_ark_paths
         self.acc_paths = args.acc_paths
 
-    def run(self):
+    def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
         """Run the function"""
         with open(self.log_path, "w", encoding="utf8") as log_file:
             for dict_id in self.dictionaries:
@@ -204,7 +204,7 @@ def align_options(self) -> MetaDict:
             options["beam"] = self.initial_beam
         return options
 
-    def mono_align_equal(self):
+    def mono_align_equal(self) -> None:
         """
         Multiprocessing function that creates equal alignments for base monophone training.
 
@@ -310,6 +310,8 @@ def mono_align_equal(self):
 
     def _trainer_initialization(self) -> None:
         """Monophone training initialization"""
+        if self.initialized:
+            return
         self.iteration = 0
         tree_path = os.path.join(self.working_directory, "tree")