2.0.0a22 changes

Aditya514 · Jul 13, 2021 · f602042 · f602042
1 parent 140ef11
commit f602042
Show file tree

Hide file tree

Showing 39 changed files with 1,716 additions and 591 deletions.
diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/publish-to-test-pypi.yml
@@ -0,0 +1,38 @@
+name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI
+
+on: push
+
+jobs:
+  build-n-publish:
+    name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI
+    runs-on: ubuntu-18.04
+    steps:
+      - uses: actions/checkout@main
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.8
+      - name: Install pypa/build
+        run: >-
+          python -m
+          pip install
+          build
+          --user
+      - name: Build a binary wheel and a source tarball
+        run: >-
+          python -m
+          build
+          --sdist
+          --wheel
+          --outdir dist/
+          .
+      - name: Publish distribution 📦 to Test PyPI
+        uses: pypa/gh-action-pypi-publish@main
+        with:
+          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+          repository_url: https://test.pypi.org/legacy/
+      - name: Publish distribution 📦 to PyPI
+        if: startsWith(github.ref, 'refs/tags')
+        uses: pypa/gh-action-pypi-publish@main
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -11,6 +11,14 @@ Changelog
 --------
 
 - Add support for aligning mp3 files
+- Fix for log error in 0 probability entries in probabilistic lexicons
+- Add support for multilingual IPA mode (see :ref:`multilingual_ipa` for more details)
+- Add support for specifying per-speaker pronunciation dictionaries (see :ref:`speaker_dictionaries` for more details)
+- Fix cases where TextGrid parsing errors were misattributed to sound file issues; these should now be properly
+  detected by the validator
+- Add check for system version of libc to provide a more informative error message with next steps for compiling Kaldi on
+  the user's machine
+- Update annotator utility to have autosave on exit
 
 2.0.0a21
 --------

diff --git a/docs/source/configuration_align.rst b/docs/source/configuration_align.rst
@@ -27,6 +27,9 @@ or longer stretches of audio).
    "punctuation", "、。।，@<>'"'(),.:;¿?¡!\\&%#*~【】，…‥「」『』〝〟″⟨⟩♪・‹›«»～′$+=", "Characters to treat as punctuation and strip from around words"
    "clitic_markers", "'''’", "Characters to treat as clitic markers, will be collapsed to the first character in the string"
    "compound_markers", "\-", "Characters to treat as marker in compound words (i.e., doesn't need to be preserved like for clitics)"
+   "multilingual_ipa", False, "Flag for enabling multilingual IPA mode, see :ref:`multilingual_ipa` for more details"
+   "strip_diacritics", "/iː/ /iˑ/ /ĭ/ /i̯/  /t͡s/ /t‿s/ /t͜s/ /n̩/", "IPA diacritics to strip in multilingual IPA mode (phone symbols for proper display, when specifying them just have the diacritic)"
+   "digraphs", "[dt][szʒʃʐʑʂɕç], [aoɔe][ʊɪ]", "Digraphs to split up in multilingual IPA mode"
 
 
 .. _feature_config:

diff --git a/docs/source/dictionary.rst b/docs/source/dictionary.rst
@@ -72,3 +72,69 @@ to align annotations like laughter, coughing, etc.
 
   {LG} spn
   {SL} sil
+
+
+.. _speaker_dictionaries:
+
+Per-speaker dictionaries
+========================
+
+In addition to specifying a single dictionary to use when aligning or transcribing, MFA also supports specifying per-speaker
+dictionaries via a yaml file, like the following.
+
+.. code-block:: yaml
+
+   default: /mnt/d/Data/speech/english_us_ipa.txt
+
+   speaker_a: /mnt/d/Data/speech/english_uk_ipa.txt
+   speaker_b: /mnt/d/Data/speech/english_uk_ipa.txt
+   speaker_c: /mnt/d/Data/speech/english_uk_ipa.txt
+
+What the above yaml file specifies is a "default" dictionary that will be used for any speaker not explicitly listed with
+another dictionary, so it's possible to train/align/transcribe using multiple dialects or languages, provided the model
+specified is compatible with all dictionaries.
+
+To use per-speaker dictionaries, pass the path to this yaml file in place of the usual dictionary argument:
+
+.. code-block::
+
+   mfa align /path/to/corpus /path/to/speaker_dictionaries.yaml /path/to/acoustic_model.zip /path/to/output
+
+
+.. _multilingual_ipa:
+
+Multilingual IPA mode
+=====================
+
+For the purposes of training multilingual models with IPA, there is a training flag that enables this mode:
+:code:`--multilingual_ipa`. In this mode, MFA strips out certain diacritics that are not generally related to the quality
+of the vowel (e.g., diacritics related to length), and splits digraphs (affricates and diphthongs/triphthongs) into
+their component symbols.  The reasoning behind these choices is that length can be modelled through transition probabilities
+and the model can take advantage of a less confusable phone set, and for digraphs, we can model the component parts
+to account better for rarer sounds.  For example, in English /dʒ/ is more common than /ʒ/, so by modelling /dʒ/ as /d ʒ/,
+we have more data for the solo instances of /ʒ/.  The downside of this split is that the minimum duration is increased
+to 6 frames (3 frames per phone); however, in general, diphthongs and affricates should be longer than less complex sounds.
+
+The default configuration for multilingual IPA is as follows:
+
+.. code-block::
+
+   strip_diacritics:
+     - "ː"  # long, i.e. /ɑː/
+     - "ˑ"  # half long, i.e. /ɑˑ/
+     - "̆"  # extra short, i.e. /ĭ/
+     - "̯"   # non syllabic, i.e. /i̯/
+     - "͡"  # linking, i.e. /d͡ʒ/
+     - "‿"  # linking, i.e. /d‿ʒ/
+     - "͜"  # linking, i.e. /d͜ʒ/
+     - "̩"   # syllabic, i.e. /n̩/
+
+   digraphs:
+     - "[dt][szʒʃʐʑʂɕç]" # affricates
+     - "[aoɔe][ʊɪ]" # diphthongs
+
+.. note::
+   Digraphs are specified as a regular expression pattern, where the first set of square brackets (e.g. ``[aoɔe]``)
+   gives the set of characters that match the first element in the digraph, and the second set of square
+   brackets (e.g. ``[ʊɪ]``) gives the characters that match the second element.  Triphthongs or longer sequences can be
+   specified with more sets of square brackets; for example, ``[e][i][u]`` would match just an /eiu/ triphthong.
diff --git a/montreal_forced_aligner/__init__.py b/montreal_forced_aligner/__init__.py
@@ -1,6 +1,6 @@
 __ver_major__ = 2
 __ver_minor__ = 0
-__ver_patch__ = '0a21'
+__ver_patch__ = '0a22'
 __version__ = "{}.{}.{}".format(__ver_major__, __ver_minor__, __ver_patch__)
 
 __all__ = ['aligner', 'command_line', 'models', 'corpus', 'config', 'dictionary', 'exceptions',

diff --git a/montreal_forced_aligner/aligner/base.py b/montreal_forced_aligner/aligner/base.py
@@ -7,6 +7,7 @@
 
 from ..helper import log_kaldi_errors, load_scp
 from ..exceptions import KaldiProcessingError
+from ..dictionary import MultispeakerDictionary
 
 
 class BaseAligner(object):
@@ -59,14 +60,11 @@ def __init__(self, corpus, dictionary, align_config, temp_directory=None,
 
     def setup(self):
         self.dictionary.write()
-        self.corpus.initialize_corpus(self.dictionary)
-        try:
-            self.align_config.feature_config.generate_features(self.corpus, logger=self.logger)
-        except Exception as e:
-            if isinstance(e, KaldiProcessingError):
-                log_kaldi_errors(e.error_logs, self.logger)
-                e.update_log_file(self.logger.handlers[0].baseFilename)
-            raise
+        self.corpus.initialize_corpus(self.dictionary, self.align_config.feature_config)
+
+    @property
+    def use_mp(self):
+        return self.align_config.use_mp
 
     @property
     def meta(self):
@@ -77,8 +75,21 @@ def meta(self):
                 }
         return data
 
+    def dictionaries_for_job(self, job_name):
+        if isinstance(self.dictionary, MultispeakerDictionary):
+            dictionary_names = []
+            for name in self.dictionary.dictionary_mapping.keys():
+                if os.path.exists(os.path.join(self.corpus.split_directory(), 'utt2spk.{}.{}'.format(job_name, name))):
+                    dictionary_names.append(name)
+            return dictionary_names
+        return None
+
     def compile_information(self, model_directory, output_directory):
         issues = compile_information(model_directory, self.corpus, self.corpus.num_jobs, self)
+        errors_path = os.path.join(output_directory, 'output_errors.txt')
+        if os.path.exists(errors_path):
+            self.logger.warning('There were errors when generating the textgrids. See the output_errors.txt in the '
+                                'output directory for more details.')
         if issues:
             issue_path = os.path.join(output_directory, 'unaligned.txt')
             with open(issue_path, 'w', encoding='utf8') as f:

diff --git a/montreal_forced_aligner/aligner/pretrained.py b/montreal_forced_aligner/aligner/pretrained.py
@@ -85,7 +85,7 @@ def align(self):
             return
         try:
             compile_train_graphs(self.align_directory, self.dictionary.output_directory,
-                                 self.align_config.data_directory, self.corpus.num_jobs, self.align_config)
+                                 self.align_config.data_directory, self.corpus.num_jobs, self)
             self.acoustic_model.feature_config.generate_features(self.corpus)
             log_dir = os.path.join(self.align_directory, 'log')
             os.makedirs(log_dir, exist_ok=True)

diff --git a/montreal_forced_aligner/aligner/trainable.py b/montreal_forced_aligner/aligner/trainable.py
@@ -36,16 +36,8 @@ def setup(self):
         if self.dictionary is not None:
             self.dictionary.set_word_set(self.corpus.word_set)
             self.dictionary.write()
-        self.corpus.initialize_corpus(self.dictionary)
-        for identifier, trainer in self.training_config.items():
-            try:
-                trainer.feature_config.generate_features(self.corpus)
-            except Exception as e:
-                if isinstance(e, KaldiProcessingError):
-                    log_kaldi_errors(e.error_logs, self.logger)
-                    e.update_log_file(self.logger.handlers[0].baseFilename)
-                raise
-            break
+        first_trainer = list(self.training_config.items())[0][1]
+        self.corpus.initialize_corpus(self.dictionary, first_trainer.feature_config)
 
     def save(self, path, root_directory=None):
         """

diff --git a/montreal_forced_aligner/command_line/align.py b/montreal_forced_aligner/command_line/align.py
@@ -6,12 +6,12 @@
 
 from montreal_forced_aligner import __version__
 from montreal_forced_aligner.corpus.align_corpus import AlignableCorpus
-from montreal_forced_aligner.dictionary import Dictionary
+from montreal_forced_aligner.dictionary import Dictionary, MultispeakerDictionary
 from montreal_forced_aligner.aligner import PretrainedAligner
 from montreal_forced_aligner.models import AcousticModel
 from montreal_forced_aligner.config import TEMP_DIR, align_yaml_to_config, load_basic_align
 from montreal_forced_aligner.utils import get_available_acoustic_languages, get_pretrained_acoustic_path, \
-    get_available_dict_languages, get_dictionary_path
+    get_available_dict_languages, validate_dictionary_arg
 from montreal_forced_aligner.helper import setup_logger, log_config
 from montreal_forced_aligner.exceptions import ArgumentError
 
@@ -79,6 +79,8 @@ def align_corpus(args, unknown_args=None):
     model_directory = os.path.join(data_directory, 'acoustic_models')
     os.makedirs(model_directory, exist_ok=True)
     os.makedirs(args.output_directory, exist_ok=True)
+    acoustic_model = AcousticModel(args.acoustic_model_path, root_directory=model_directory)
+    acoustic_model.log_details(logger)
     try:
         corpus = AlignableCorpus(args.corpus_directory, data_directory,
                                  speaker_characters=args.speaker_characters,
@@ -87,13 +89,24 @@ def align_corpus(args, unknown_args=None):
                                  clitic_markers=align_config.clitic_markers)
         if corpus.issues_check:
             logger.warning('Some issues parsing the corpus were detected. '
-                  'Please run the validator to get more information.')
+                           'Please run the validator to get more information.')
         logger.info(corpus.speaker_utterance_info())
-        dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set, logger=logger,
-                                punctuation=align_config.punctuation, clitic_markers=align_config.clitic_markers,
-                                compound_markers=align_config.compound_markers)
-        acoustic_model = AcousticModel(args.acoustic_model_path,  root_directory=model_directory)
-        acoustic_model.log_details(logger)
+        if args.dictionary_path.lower().endswith('.yaml'):
+            dictionary = MultispeakerDictionary(args.dictionary_path, data_directory, logger=logger,
+                                                punctuation=align_config.punctuation,
+                                                clitic_markers=align_config.clitic_markers,
+                                                compound_markers=align_config.compound_markers,
+                                                multilingual_ipa=acoustic_model.meta['multilingual_ipa'],
+                                                strip_diacritics=acoustic_model.meta.get('strip_diacritics', None),
+                                                digraphs=acoustic_model.meta.get('digraphs', None))
+        else:
+            dictionary = Dictionary(args.dictionary_path, data_directory, logger=logger,
+                                    punctuation=align_config.punctuation,
+                                    clitic_markers=align_config.clitic_markers,
+                                    compound_markers=align_config.compound_markers,
+                                    multilingual_ipa=acoustic_model.meta['multilingual_ipa'],
+                                    strip_diacritics=acoustic_model.meta.get('strip_diacritics', None),
+                                    digraphs=acoustic_model.meta.get('digraphs', None))
         acoustic_model.validate(dictionary)
 
         begin = time.time()
@@ -133,12 +146,7 @@ def validate_args(args, downloaded_acoustic_models, download_dictionaries):
     if args.corpus_directory == args.output_directory:
         raise ArgumentError('Corpus directory and output directory cannot be the same folder.')
 
-    if args.dictionary_path.lower() in download_dictionaries:
-        args.dictionary_path = get_dictionary_path(args.dictionary_path.lower())
-    if not os.path.exists(args.dictionary_path):
-        raise ArgumentError('Could not find the dictionary file {}'.format(args.dictionary_path))
-    if not os.path.isfile(args.dictionary_path):
-        raise ArgumentError('The specified dictionary path ({}) is not a text file.'.format(args.dictionary_path))
+    validate_dictionary_arg(args.dictionary_path, download_dictionaries)
 
     if args.acoustic_model_path.lower() in downloaded_acoustic_models:
         args.acoustic_model_path = get_pretrained_acoustic_path(args.acoustic_model_path.lower())