Fix phonetisaurus training (MontrealCorpusTools#477)

Aditya514 · Jun 27, 2022 · a99c937 · a99c937
1 parent 58e95c9
commit a99c937
Show file tree

Hide file tree

Showing 19 changed files with 1,826 additions and 1,107 deletions.
diff --git a/docs/source/_static/interrogate_badge.svg b/docs/source/_static/interrogate_badge.svg
diff --git a/docs/source/changelog/changelog_2.0.rst b/docs/source/changelog/changelog_2.0.rst
diff --git a/docs/source/changelog/changelog_2.0_pre_release.rst b/docs/source/changelog/changelog_2.0_pre_release.rst
diff --git a/docs/source/changelog/index.rst b/docs/source/changelog/index.rst
@@ -148,4 +148,5 @@ The functionality of :code:`mfa_generate_dictionary` has been expanded.
    :hidden:
 
    changelog_2.0.rst
+   changelog_2.0_pre_release.rst
    changelog_1.0.rst
diff --git a/docs/source/user_guide/concepts/features.rst b/docs/source/user_guide/concepts/features.rst
@@ -0,0 +1,22 @@
+
+
+.. _acoustic_features:
+
+Acoustic features
+=================
+
+.. warning::
+
+   Still under construction, I hope to fill these sections out as I have time.
+
+
+.. _features_mfcc:
+
+Mel-Frequency Cepstrum Coefficients
+-----------------------------------
+
+
+.. _features_pitch:
+
+Pitch
+-----
diff --git a/docs/source/user_guide/concepts/fst.rst b/docs/source/user_guide/concepts/fst.rst
@@ -0,0 +1,41 @@
+
+.. _fst:
+
+Finite State Transducers
+========================
+
+.. warning::
+
+   Still under construction, I hope to fill these sections out as I have time.
+
+.. seealso::
+
+   `OpenFst Quick Tour <https://www.openfst.org/twiki/bin/view/FST/FstQuickTour>`_
+
+.. _acceptor:
+
+Acceptors
+---------
+
+.. _wfst:
+
+Weighted Finite State Transducers
+---------------------------------
+
+
+.. _lexicon_fst:
+
+Lexicon FSTs
+============
+
+MFA compiles input pronunciation dictionaries to a Weighted Finite State Transducer (:term:`WFST`), with phones as input symbols and words as output symbols.  During alignment, the :term:`lexicon FST` is composed with a linear acceptor created from the transcription text of each utterance.
+
+.. _grammar_fst:
+
+Grammar FSTs
+============
+
+.. _g2p_fst:
+
+G2P FSTs
+========
diff --git a/docs/source/user_guide/concepts/hmm.rst b/docs/source/user_guide/concepts/hmm.rst
@@ -0,0 +1,17 @@
+
+
+.. _hmm:
+
+Hidden Markov Models
+====================
+
+.. warning::
+
+   Still under construction, I hope to fill these sections out as I have time.
+
+
+Standard topology
+-----------------
+
+MFA topology
+------------
diff --git a/docs/source/user_guide/concepts/index.rst b/docs/source/user_guide/concepts/index.rst
@@ -0,0 +1,19 @@
+
+.. _concepts:
+
+***************
+Concepts in MFA
+***************
+
+This section will attempt to provide a blend of technical and non-technical overviews of various components and concepts used in MFA. There are much more in-depth resources for learning about various components that will be linked if you are interested in learning more about them.
+
+.. warning::
+
+   Still under construction, I hope to fill these sections out as I have time.
+
+.. toctree::
+   :hidden:
+
+   features
+   fst
+   hmm
diff --git a/docs/source/user_guide/corpus_creation/training_dictionary.rst b/docs/source/user_guide/corpus_creation/training_dictionary.rst
diff --git a/docs/source/user_guide/glossary.rst b/docs/source/user_guide/glossary.rst
@@ -30,6 +30,16 @@ Glossary
    MFCCs
         :abbr:`Mel-frequency cepstrum coefficients (MFCCs)` are the industry standard for acoustic features.  The process involves windowing the acoustic waveform, scaling the frequencies into the Mel space (an auditory representation that gives more weight to lower frequencies over higher frequencies), and then performing a :abbr:`discrete cosine transform (DCT)` on the values in each filter bank to get orthogonal coefficients.  There was a trend around 2015-2018 to use acoustic features that were more raw (i.e., not transformed to the Mel space, or the waveform directly), but in general most recent state of the art systems still use MFCC features.
 
+   WFST
+   FST
+      A :abbr:`Finite State Transducer (FST)` is a graph formalism that can transform a sequence of arbitrary input symbols into arbitrary output symbols.  A :abbr:`Weighted Finite State Transducer (WFST)` is an FST that has costs associated with its various paths, so a single best output string can be selected.  Training graphs are WFSTs of the lexicon WFST composed with linear acceptors of the transcription text.  For transcription, lexicons are composed with language models as well.  MFA's :term:`G2P models` are WFSTs trained using a pair ngram algorithm or the many to many Phonetisaurus algorithm.
+
+   lexicon FST
+      A :term:`WFST` constructed from a pronunciation dictionary that can be composed with a :term:`grammar FST` and an HMM-GMM acoustic model to align and transcribe speech.
+
+   grammar FST
+      A :term:`WFST` compiled from a language model that represents how likely a word is given the previous words (ngram model), or a linear acceptor from a known utterance transcription where there is only one path through the words in the transcript for use in alignment.
+
    Pronunciation probabilities
         Pronunciation probabilities in dictionaries allow for certain spoken forms to be more likely, rather than just assigning equal weight to all pronunciation variants.
 

diff --git a/montreal_forced_aligner/corpus/acoustic_corpus.py b/montreal_forced_aligner/corpus/acoustic_corpus.py
@@ -324,8 +324,6 @@ def construct_feature_proc_strings(
         """
         strings = []
         for j in self.jobs:
-            if not j.has_data:
-                continue
             lda_mat_path = None
             fmllrs = {}
             if self.working_directory is not None:

diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py
@@ -839,42 +839,6 @@ def create_subset(self, subset: int) -> None:
 
             session.commit()
 
-            # Extra check to make sure the randomness didn't end up with 1 or 2 utterances
-            # for a particular job/dictionary combo
-            subset_agg = (
-                session.query(
-                    Speaker.job_id, Speaker.dictionary_id, sqlalchemy.func.count(Utterance.id)
-                )
-                .join(Utterance.speaker)
-                .filter(Utterance.in_subset == True)  # noqa
-                .group_by(Speaker.job_id, Speaker.dictionary_id)
-            )
-            for j_id, d_id, utterance_count in subset_agg:
-                if utterance_count < 20:
-                    larger_subset_query = (
-                        session.query(Utterance.id)
-                        .join(Utterance.speaker)
-                        .filter(Speaker.dictionary_id == d_id)
-                        .filter(Speaker.job_id == j_id)
-                        .filter(Utterance.ignored == False)  # noqa
-                    )
-                    sq = larger_subset_query.subquery()
-                    subset_utts = (
-                        sqlalchemy.select(sq.c.id)
-                        .order_by(sqlalchemy.func.random())
-                        .limit(20)
-                        .scalar_subquery()
-                    )
-                    query = (
-                        sqlalchemy.update(Utterance)
-                        .execution_options(synchronize_session="fetch")
-                        .values(in_subset=True)
-                        .where(Utterance.id.in_(subset_utts))
-                    )
-                    session.execute(query)
-
-            subset_count = session.query(Utterance).filter_by(in_subset=True).count()
-            self.log_debug(f"Total subset utterances is {subset_count}")
             self.log_debug(f"Setting subset flags took {time.time()-begin} seconds")
             log_dir = os.path.join(subset_directory, "log")
             os.makedirs(log_dir, exist_ok=True)
@@ -925,6 +889,18 @@ def subset_directory(self, subset: typing.Optional[int]) -> str:
         directory = os.path.join(self.corpus_output_directory, f"subset_{subset}")
         if not os.path.exists(directory):
             self.create_subset(subset)
+        for j in self.jobs:
+            j.has_data = False
+        with self.session() as session:
+            query = (
+                session.query(Speaker.job_id, sqlalchemy.func.count(Utterance.id))
+                .join(Utterance.speaker)
+                .filter(Utterance.in_subset == True)  # noqa
+                .group_by(Speaker.job_id)
+            )
+            for job_id, utterance_count in query:
+                if utterance_count > 0:
+                    self.jobs[job_id].has_data = True
         return directory
 
     def calculate_word_counts(self) -> None: