Commit

mmcauliffe committed Jun 2, 2022
1 parent 4f4283c commit 3944163
Showing 83 changed files with 3,179 additions and 1,370 deletions.
5 changes: 3 additions & 2 deletions docs/source/_static/css/style.css
@@ -13,13 +13,14 @@
}



a.external::after{
content: " \f35d";
content: "\f35d";
font-size: 0.75em;
text-align: center;
vertical-align: middle;
padding-bottom: 0.45em;
font-family: "Font Awesome 5 Free";
font-weight: 900;
}

:root {
6 changes: 3 additions & 3 deletions docs/source/_static/interrogate_badge.svg
13 changes: 13 additions & 0 deletions docs/source/changelog/changelog_2.0.rst
@@ -10,6 +10,19 @@
Release candidates
==================

2.0.0rc8
--------

- Fixed a bug where G2P output was not correctly converted to strings :github_issue:`448`
- Fixed a bug where specifying conda or temporary directories with spaces would cause crashes :github_issue:`450`
- Fixed a crash with unspecified github_token values for ``mfa model`` commands
- Added a utility function for :ref:`validating_dictionaries`
- Fixed a bug where errors in multiprocessing workers were not properly raised by the main thread, obscuring the source of errors :github_issue:`452`
- Added an experimental training flag for training a G2P model as part of the acoustic model training
- Fixed a bug where models trained in 1.0 would not use speaker adaptation during alignment or transcription
- Added support for exporting original text alongside word and phone alignments :github_issue:`414`
- Fixed an issue with transcribing using multiple dictionaries

2.0.0rc7
--------

17 changes: 17 additions & 0 deletions docs/source/external_links.py
@@ -68,6 +68,22 @@ def github_issue_role(
return [pnode], []


def github_pr_role(
typ: str,
rawtext: str,
text: str,
lineno: int,
inliner: Inliner,
options: dict = None,
content: List[str] = None,
) -> Tuple[List[Node], List[system_message]]:
text = utils.unescape(text)
full_url = f"https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/pull/{text}"
title = f"GitHub #{text}"
pnode = nodes.reference(title, title, internal=False, refuri=full_url)
return [pnode], []
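# Illustrative usage (not part of this commit's diff): in a documentation source
# file the new role would be written like the existing issue role, e.g.
#   :github_pr:`123`
# with a hypothetical pull request number; it renders as a link titled
# "GitHub #123" pointing at the pull request URL constructed above.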


def kaldi_steps_role(
typ: str,
rawtext: str,
@@ -444,6 +460,7 @@ def get_refs(app):
def setup(app: Sphinx) -> Dict[str, Any]:
app.add_config_value("xref_links", {}, "env")
app.add_role("github_issue", github_issue_role)
app.add_role("github_pr", github_pr_role)
app.add_role("kaldi_steps", kaldi_steps_role)
app.add_role("kaldi_utils", kaldi_utils_role)
app.add_role("kaldi_steps_sid", kaldi_steps_sid_role)
2 changes: 1 addition & 1 deletion docs/source/user_guide/configuration/global.rst
@@ -38,7 +38,7 @@ This section is only relevant for training, as the trained model will contain ex
"use_energy", "False", "Use energy in place of first MFCC"
"frame_shift", 10, "In milliseconds, determines time resolution"
"snip_edges", True, "Should provide better time resolution in alignment"
"use_pitch", False, "Currently not implemented"
"use_pitch", False, "Flag for whether to compute pitch features"
"low_frequency", 20, "Frequency cut off for feature generation"
"high_frequency", 7800, "Frequency cut off for feature generation"
"sample_frequency", 16000, "Sample rate to up- or down-sample to"
18 changes: 16 additions & 2 deletions docs/source/user_guide/data_validation.rst
@@ -30,8 +30,22 @@ and logs any of the following issues:

.. _running_the_validator:

Running the validation utility
==============================
Running the corpus validation utility
=====================================


Command reference
-----------------

.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser
:prog: mfa
:start_command: validate
:groups:

.. _running_the_dictionary_validator:

Running the dictionary validation utility
=========================================


Command reference
21 changes: 21 additions & 0 deletions docs/source/user_guide/dictionary_validation.rst
@@ -0,0 +1,21 @@

.. _validating_dictionaries:

*************************************
Validating pronunciation dictionaries
*************************************


.. _running_the_dictionary_validator:

Running the dictionary validation utility
=========================================


Command reference
-----------------

.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser
:prog: mfa
:start_command: validate_dictionary
:groups:
2 changes: 1 addition & 1 deletion docs/source/user_guide/workflows/alignment.rst
@@ -37,7 +37,7 @@ The two metrics calculated for each utterance are overlap score and phone error

.. math::
Overlap \: score = \frac{\sum\limits_{i=0}^{n-1} (\lvert begin_{aligned[i]} - begin_{ref[i]} \rvert + \lvert end_{aligned[i]} - end_{ref[i]} \rvert )}{n}
Alignment \: score = \frac{Overlap \: cost}{2}
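
The snippet below is a minimal sketch of the overlap calculation above, assuming the aligned and reference phone intervals have already been paired one-to-one; the function name, data layout, and example numbers are illustrative and not part of MFA's API.

.. code-block:: python

    from typing import List, Tuple

    def overlap_score(
        aligned: List[Tuple[float, float]], reference: List[Tuple[float, float]]
    ) -> float:
        """Mean absolute boundary difference between paired aligned/reference intervals."""
        total = 0.0
        for (a_begin, a_end), (r_begin, r_end) in zip(aligned, reference):
            total += abs(a_begin - r_begin) + abs(a_end - r_end)
        return total / len(aligned)

    # Two phones, each boundary off by 10 ms -> overlap score of ~0.02 seconds
    print(overlap_score([(0.00, 0.11), (0.11, 0.24)], [(0.01, 0.10), (0.10, 0.25)]))
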
Phone error rate is calculated as:

7 changes: 7 additions & 0 deletions docs/source/user_guide/workflows/train_acoustic_model.rst
@@ -198,6 +198,13 @@ For ARPA, we use the following topology calculation. Additionally, stress-marke
-


Pronunciation modeling
======================

For the default configuration, pronunciation probabilities are estimated following the second and third SAT blocks. See :ref:`training_dictionary` for more details.

A recent experimental feature for training acoustic models is the ``--train_g2p`` flag which changes the pronunciation probability estimation from a lexicon based estimation to instead using a G2P model as the lexicon. The idea here is that we have pronunciations generated by the initial blocks much like for the standard lexicon-based approach, but instead of estimating probabilities for individual word/pronunciation pairs and the likelihood of surrounding silence, it learns a mapping between the graphemes of the input texts and the phones.



Command reference
113 changes: 113 additions & 0 deletions docs/source/user_guide/workflows/training_dictionary.rst
@@ -5,6 +5,119 @@ Add probabilities to a dictionary ``(mfa train_dictionary)``

MFA includes a utility command for training :term:`pronunciation probabilities` of a dictionary given a corpus for alignment.

The implementation used here follows Kaldi's :kaldi_steps:`get_prons`, :kaldi_utils:`dict_dir_add_pronprobs.sh`, and :kaldi_utils:`lang/make_lexicon_fst_silprob.py`.

.. seealso::

For a more in-depth description of the algorithm, see `Chen et al. (2015) <https://www.danielpovey.com/files/2015_interspeech_silprob.pdf>`_.
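
As a rough illustration of the counting and max-normalization step described there (made-up word/pronunciation pairs; a sketch, not MFA's internal implementation):

.. code-block:: python

    from collections import Counter, defaultdict

    # Hypothetical word/pronunciation pairs harvested from alignment lattices
    pairs = [
        ("the", "ð ə"), ("the", "ð ə"), ("the", "ð iː"),
        ("to", "t ə"), ("to", "t ə"), ("to", "t ə"), ("to", "tʰ ʉː"),
    ]

    counts = defaultdict(Counter)
    for word, pron in pairs:
        counts[word][pron] += 1

    pron_probs = {}
    for word, pron_counts in counts.items():
        # The most frequent pronunciation of each word gets probability 1.0;
        # the rest are scaled relative to it
        max_count = max(pron_counts.values())
        pron_probs[word] = {pron: c / max_count for pron, c in pron_counts.items()}

    print(pron_probs["the"])  # {'ð ə': 1.0, 'ð iː': 0.5}

Silence probabilities are estimated in a similar counting fashion, from how often silence occurs before and after each pronunciation in the alignments (see the paper above for the full formulas).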

Example
-------

As an example, consider the following English and Japanese sentences:

.. tab-set::

.. tab-item:: English
:sync: english

The red fox has read many books, but there's always more to read.

Normalized:

the red fox has read many books but there 's always more to read

.. tab-item:: Japanese
:sync: japanese

アカギツネはいろんな本を読んできましたけど、まだまだ読み切りがありません。

Normalized:

アカギツネ は いろんな 本 を 読んで き ました けれども まだまだ 読み 切り が あり ません

Consider the following pronunciation dictionaries:

.. tab-set::

.. tab-item:: English
:sync: english

In addition to lexical variants for the present and past tenses of "read", function words have several variants listed. The genitive marker "'s" has variants to account for stem-final voicing (:ipa_inline:`[s]` and :ipa_inline:`[z]`) and stem-final alveolar obstruents (:ipa_inline:`[ɪ z]`). The conjunction "but" has variants for the pronunciation of the vowel and of final :ipa_inline:`/t/` as :ipa_inline:`[ʔ]` or :ipa_inline:`[ɾ]`. Likewise, the preposition "to" has variants for the initial :ipa_inline:`/t/` and vowel reductions. The definite determiner "the" and distal demonstrative "there" have variants for stopping :ipa_inline:`/ð/` to :ipa_inline:`[d̪]`, along with vowel reductions.

.. csv-table:: English US pronunciation dictionary
:widths: 30, 70
:header: "Word","Pronunciation"

"'s","s"
"'s","z"
"'s","ɪ z"
"always","ɒː ɫ w ej z"
"always","ɑː ɫ w ej z"
"always","ɒː w ej z"
"always","ɑː w ej z"
"books","b ʊ k s"
"but","b ɐ t"
"but","b ɐ ʔ"
"but","b ə ɾ"
"fox","f ɑː k s"
"has","h æ s"
"has","h æ z"
"many","m ɛ ɲ i"
"more","m ɒː ɹ"
"read","ɹ iː d"
"read","ɹ ɛ d"
"red","ɹ ɛ d"
"the","d̪ iː"
"the","d̪ iː ʔ"
"the","d̪ ə"
"the","ð iː"
"the","ð iː ʔ"
"the","ð ə"
"there","d̪ ɚ"
"there","d̪ ɛ ɹ"
"there","ð ɚ"
"there","ð ɛ ɹ"
"to","t ə"
"to","tʰ ʉː"
"to","tʰ ʊ"
"to","ɾ ə"


.. tab-item:: Japanese
:sync: japanese

The main pronunciation variants are in the topic particle "は", the object particle "を", the past tense polite suffix "ました", and the "but" conjunction "けれども". The particles are always pronounced as :ipa_inline:`[w a]` and :ipa_inline:`[o]` and never as their hiragana readings :ipa_inline:`[h a]` and :ipa_inline:`[w o]`, respectively. For "ました", I've included various levels of devoicing for :ipa_inline:`/i/` between the voiceless obstruents, from fully voiced :ipa_inline:`[i]`, to devoiced :ipa_inline:`[i̥]`, to deleted entirely.

.. csv-table:: Japanese pronunciation dictionary
:widths: 30, 70
:header: "Word","Pronunciation"

"アカギツネ","a k a ɟ i ts ɨ n e"
"は","h a"
"は","w a"
"いろんな","i ɾ o nː a"
"本","h o ɴ"
"を","o"
"を","w o"
"読んで","j o n d e"
"き","c i"
"ました","m a ɕ i̥ t a"
"ました","m a ɕ i t a"
"ました","m a ɕ t a"
"けれども","k e ɾ e d o m o"
"けれども","k e d o m o"
"けれども","k e d o"
"読み","j o m i"
"切り","c i ɾ i"
"が","ɡ a"
"あり","a ɾ i"
"ません","m a s e ɴ"

The basic steps for calculating pronunciation and silence probabilities are as follows:

1. Generate word-pronunciation pairs from the alignment lattices

The resulting dictionary can then be used for alignment or transcription.


20 changes: 20 additions & 0 deletions docs/source/user_guide/workflows/transcribing.rst
@@ -4,10 +4,30 @@
Transcribe audio files ``(mfa transcribe)``
===========================================

MFA has some limited ability to use its acoustic and language models for transcription. The intent of this functionality is largely to aid in offline corpus construction, rather than to serve as an online capability like most ASR systems.

.. seealso::

See :ref:`train_acoustic_model` and :ref:`training_lm` for details on training MFA models to use in transcription.

Unlike alignment, transcription does not require transcribed audio files (except when running in :ref:`transcription_evaluation`); instead it uses the combination of acoustic model, language model, and pronunciation dictionary to create a decoding lattice and find the best path through it. When training a language model for transcription, it is recommended to train it on text/speech transcripts from the same domain to minimize errors.

.. warning::

The technology that MFA uses is several years out of date, so if you have other options available, such as :xref:`coqui` or other production systems for :abbr:`STT (Speech to Text)`, we recommend using those. The transcription capabilities are included here more for completeness.

.. _transcription_evaluation:

Evaluation mode
---------------

Transcriptions can be compared to gold-standard references by transcribing a corpus in the same format as for alignment (i.e., each sound file has a corresponding TextGrid or lab file). Transcription proceeds as above, and the resulting transcripts are then aligned with the gold transcriptions using the :mod:`Bio.pairwise2` alignment algorithm. From the aligned transcripts, word error rate and character error rate are calculated for each utterance as follows:

.. math::
Error \: rate = \frac{insertions + deletions + (2 * substitutions)} {length_{ref}}
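
As a small worked illustration of this weighting (hypothetical counts; a sketch rather than MFA's evaluation code):

.. code-block:: python

    def error_rate(insertions: int, deletions: int, substitutions: int, ref_length: int) -> float:
        """Error rate with substitutions weighted twice, per the formula above."""
        return (insertions + deletions + 2 * substitutions) / ref_length

    # A 10-word reference with 1 insertion, 1 deletion, and 2 substitutions
    print(error_rate(1, 1, 2, 10))  # 0.6
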
Command reference
-----------------

20 changes: 16 additions & 4 deletions montreal_forced_aligner/abc.py
@@ -12,6 +12,7 @@
import subprocess
import sys
import time
import traceback
from typing import (
TYPE_CHECKING,
Any,
@@ -29,7 +30,7 @@
import yaml
from sqlalchemy.orm import Session

from montreal_forced_aligner.exceptions import KaldiProcessingError
from montreal_forced_aligner.exceptions import KaldiProcessingError, MultiprocessingError
from montreal_forced_aligner.helper import comma_join, load_configuration

if TYPE_CHECKING:
@@ -65,10 +66,18 @@ def __init__(self, args: MfaArguments):
self.job_name = self.args.job_name
self.log_path = self.args.log_path

@abc.abstractmethod
def run(self):
"""Run the function"""
...
"""Run the function, calls :meth:`~KaldiFunction._run` with error handling"""
try:
yield from self._run()
except Exception:
exc_type, exc_value, exc_traceback = sys.exc_info()
error_text = "\n".join(traceback.format_exception(exc_type, exc_value, exc_traceback))
raise MultiprocessingError(self.job_name, error_text)

def _run(self):
"""Internal logic for running the worker"""
pass
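# Illustrative sketch (not part of this commit): a subclass overrides _run as a
# generator, and the run() wrapper above re-raises any exception from the worker
# as a MultiprocessingError carrying the job name and formatted traceback, e.g.:
#
#     class ExampleFunction(KaldiFunction):
#         def _run(self):
#             with open(self.log_path, "w", encoding="utf8") as log_file:
#                 log_file.write("starting\n")
#                 yield 1  # progress value consumed by the caller iterating run()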

def check_call(self, proc: subprocess.Popen):
"""
@@ -178,6 +187,7 @@ def __init__(
):
super().__init__(**kwargs)
self._db_engine = None
self._db_path = None

def initialize_database(self) -> None:
"""
@@ -198,6 +208,8 @@ def db_engine(self) -> sqlalchemy.engine.Engine:
@property
def db_path(self) -> str:
"""Path to SQLite database file"""
if self._db_path is not None:
return self._db_path
return os.path.join(self.output_directory, f"{self.identifier}.db")

def construct_engine(self, same_thread=True, read_only=False) -> sqlalchemy.engine.Engine: