From a99c93730e4d9d2f6f5562cfb831c98b96f69c0e Mon Sep 17 00:00:00 2001 From: Michael McAuliffe Date: Sun, 26 Jun 2022 22:37:38 -0700 Subject: [PATCH] Fix phonetisaurus training (#477) --- docs/source/_static/interrogate_badge.svg | 6 +- docs/source/changelog/changelog_2.0.rst | 396 +--- .../changelog/changelog_2.0_pre_release.rst | 394 ++++ docs/source/changelog/index.rst | 1 + docs/source/user_guide/concepts/features.rst | 22 + docs/source/user_guide/concepts/fst.rst | 41 + docs/source/user_guide/concepts/hmm.rst | 17 + docs/source/user_guide/concepts/index.rst | 19 + .../corpus_creation/training_dictionary.rst | 61 +- docs/source/user_guide/glossary.rst | 10 + .../corpus/acoustic_corpus.py | 2 - montreal_forced_aligner/corpus/base.py | 48 +- .../corpus/multiprocessing.py | 172 +- montreal_forced_aligner/db.py | 112 ++ .../dictionary/multispeaker.py | 8 +- montreal_forced_aligner/exceptions.py | 13 + montreal_forced_aligner/g2p/generator.py | 6 +- .../g2p/phonetisaurus_trainer.py | 1597 +++++++++++------ .../dictionaries/mixed_format_dictionary.txt | 8 +- 19 files changed, 1826 insertions(+), 1107 deletions(-) create mode 100644 docs/source/changelog/changelog_2.0_pre_release.rst create mode 100644 docs/source/user_guide/concepts/features.rst create mode 100644 docs/source/user_guide/concepts/fst.rst create mode 100644 docs/source/user_guide/concepts/hmm.rst create mode 100644 docs/source/user_guide/concepts/index.rst diff --git a/docs/source/_static/interrogate_badge.svg b/docs/source/_static/interrogate_badge.svg index 93fc2182..17dfab32 100644 --- a/docs/source/_static/interrogate_badge.svg +++ b/docs/source/_static/interrogate_badge.svg @@ -1,5 +1,5 @@ - interrogate: 99.4% + interrogate: 99.2% @@ -12,8 +12,8 @@ interrogate interrogate - 99.4% - 99.4% + 99.2% + 99.2% diff --git a/docs/source/changelog/changelog_2.0.rst b/docs/source/changelog/changelog_2.0.rst index 2712a959..f9069059 100644 --- a/docs/source/changelog/changelog_2.0.rst +++ b/docs/source/changelog/changelog_2.0.rst @@ -5,6 +5,14 @@ 2.0 Changelog ************* +2.0.2 +===== + +- Optimized Phonetisaurus training regime for phone and grapheme orders greater than 1 +- Fixed a bug in parsing dictionaries that included whitespace as part of the word +- Fixed a bug in Phonetisaurus generation where insertions and deletions were not being properly generated +- Changed the default alignment separator for Phonetisaurus to ``;`` instead of ``}`` (shouldn't conflict with most phone sets) and added extra validation to ensure special symbols are not present in the dictionary + 2.0.1 ===== @@ -20,391 +28,3 @@ - Added support for exporting alignments in CSV format - Updated JSON export format to be more idiomatic JSON :github_issue:`453` - Fixed a crash where initial training rounds with many jobs would result in jobs that had no utterances :github_issue:`468` - -.. 
_2.0r: - -Release candidates -================== - -2.0.0rc9 --------- - -- Fixed a bug where exporting TextGrids would fail if any were empty :github_issue:`459` - -2.0.0rc8 --------- - -- Fixed a bug where G2P output was not correctly converted to strings :github_issue:`448` -- Fixed a bug where specifying conda or temporary directories with spaces would cause crashes :github_issue:`450` -- Fixed a crash with unspecified github_token values for ``mfa model`` commands -- Added a utility function for :ref:`validating_dictionaries` -- Fixed a bug where errors in multiprocessing workers were not properly raised by the main thread, obscuring the source of errors :github_issue:`452` -- Fixed an error in parsing text files in corpora for G2P generation or language model training :github_issue:`449` -- Added an experimental training flag for training a G2P model as part of the acoustic model training -- Fixed a bug where models trained in 1.0 would not use speaker adaptation during alignment or transcription -- Add support for exporting original text alongside word and phone alignments :github_issue:`414` -- Fixed an issue with transcribing using multiple dictionaries - -2.0.0rc7 --------- - -- Fixed a bug where silence correction was not being calculated correctly -- Fixed a bug where sample rate could not be specified when not using multiprocessing :github_pr:`444` -- Fixed an incompatibility with the Kaldi version 1016 where BLAS libraries were not operating in single-threaded mode -- Further optimized large multispeaker dictionary loading -- Fixed a bug where subsets were not properly generated when multiple dictionaries were used - -2.0.0rc6 --------- - -- Reverted the default export type to ``long_textgrid``, which can be changed to ``short_textgrid`` or ``json`` via the ``--output_format`` flag for commands that export TextGrids :github_issue:`434` -- Added more information for when malformed dictionary lines fail to parse (i.e., lines with just tabs on them) :github_issue:`411` -- Fixed a bug where phones with underscores in them would cause export to crash :github_issue:`432` -- Changed the overwrite behavior in export to specifically avoid overwriting input files, rather than testing the existence of the overall output directory :github_issue:`431` -- Added additional initial check to ensure that Kaldi and OpenFst binaries can be successfully invoked, rather than throwing an unintuitive error during feature creation -- Optimized initial load and TextGrid export :github_issue:`437` and :github_issue:`249` -- Allow for dictionaries with the same base name in different locations to be used side-by-side :github_issue:`417` -- Fixed a bug where initial silence was not being properly handled if silence probability training had not been done -- Removed PronunciationDictionaryMixin and PronunciationDictionary classes and refactored functionality into :class:`~montreal_forced_aligner.dictionary.multispeaker.MultispeakerDictionaryMixin` and :class:`~montreal_forced_aligner.db.Dictionary` -- Fixed a bug where alignment models would not be adapted during adaptation :github_issue:`421` - -2.0.0rc5 --------- - -- Fixed a bug where a list of downloadable models wasn't getting output for commands like ``mfa models download acoustic`` -- Added option to specify ``--output_format`` for exporting alignments for ``short_textgrids`` (the default to save space), ``long_textgrids`` (original behavior), or ``json`` - -2.0.0rc4 --------- - -- Added ``--quiet`` flag to suppress printing output to the console -- 
Added ability to specify ``pronunciation_probabilities`` in training blocks where probabilities of pronunciation variants and their probabilities of appearing before/after silence will be calculated based on alignment at that stage. The lexicon files will be regenerated and use these probabilities for later training blocks -- Added a flag to export per-pronunciation silence probabilities to :ref:`training_dictionary` -- Added a flag to :ref:`transcribing` for specifying the language model weight and word insertion penalties to speed up evaluation of transcripts -- Added a final SAT training block equivalent to the :kaldi_steps:`train_quick` script -- Added early stopping of SAT training blocks if the corpus size is below the specified subset (at least two rounds of SAT training will be performed) -- Refactored how transcription parsing is done, so that you can specify word break characters other than whitespace (i.e., instances of ``.`` or ``?`` in embedded in words that are typos in the corpus) -- Refactored quotations and clitic markers, so if there happens to be a word like ``kid'``, MFA can recover the word ``kid`` from it. If there is no word entry for ``kid`` or ``kid'`` is in the dictionary, the apostrophe will be kept. -- Refactored the ``--test_transcription`` functionality of :ref:`validating_data` to use small language models built from all transcripts of a speaker, mixed with an even smaller language model per utterance, following :kaldi_steps:`cleanup/make_biased_lm_graphs`. -- Refactored how internal storage is done to use a sqlite database rather than having everything in memory. Bigger corpora should not need as much memory when aligning/training. -- Fixed an issue in lexicon construction where explicit silences were not being respected (:github_issue:`392`) -- Fixed an issue in training where initial gaussians were not being properly used -- Changed the behavior of assigning speakers to jobs, so that it now tries to balance the number of utterances across jobs -- Changed the default topology to allow for more variable length phones (minimum duration is now one frame, 10ms by default) -- Changed how models and dictionaries are downloaded with the changes to the `MFA Models `_ -- Added the ability to use pitch features for models, with the ``--use_pitch`` flag or configuration option -- Added a ``[bracketed]`` word that will capture any transcriptions like ``[wor-]`` or ````, as these are typically restarts, hesitations, speech errors, etc that have separate characteristics compared to a word that happen to not be in the dictionary. The same phone is used for both, but having a separate word symbol allows silence probabilities to be modelled separately. -- Added words for ``[laugh]`` and ``[laughter]`` to capture laughter annotations as separate from both OOV ```` items and ``[bracketed]`` words. As with ``[bracketed]``, the laughter words use the same ``spn`` phone, but allow for separate silence probabilities. 
-- Fixed a bug where models trained in earlier version were not correctly reporting their phone set (:github_issue:`422`) -- Fixed a bug where TextGrids were never exported to the specified output directory with out ``--overwrite`` (:github_issue:`408`) -- Fixed a bug where spaces in sound file names would throw an error for that file (:github_issue:`407`) - - -2.0.0rc3 --------- -- Fixed a bug where textgrids weren't being properly generated following training -- Fixed a bug where commands were not always respecting ``--overwrite`` -- Fixed a bug where not all words in multispeaker dictionaries would be parsed -- Improved transcription accuracy calculation to account for compounds and clitics -- Fixed a crash when subsetting corpora that did not all have transcriptions - -2.0.0rc2 --------- -- Added configuration parameter (``ignore_case=False``) to allow for disabling the default behavior of making all text and lexicon entries lower case -- Added some metadata about training data to acoustic models - -2.0.0rc1 --------- - -- Getting closer to stable release! -- Fixed some bugs in how transcription and alignment accuracy were calculated -- Added additional information to evaluation output files -- Added file listing average per-frame log-likelihoods by utterance for alignment -- Fixed a bug where having "" in a transcript would cause MFA to crash - -.. _2.0b: - -Beta releases -============= - -2.0.0b11 --------- - -- Re-optimized corpus loading following the switch to a more class-based API. -- Optimized validation, particularly when acoustics are being ignored -- Added better progress bars for corpus loading, acoustic modeling, G2P training, transcription and alignment -- Changed the default behavior of G2P generation to use a threshold system rather than returning a single top pronunciation. The threshold defaults to 0.99, but can be specified through ``--g2p_threshold``. Specifying number of pronunciations will override this behavior (use ``--num_pronunciation 1`` for the old behavior). -- Changed the behavior of G2P evaluation to check whether the generated hypothesis is in the golden pronunciation set, so languages with pronunciation variation will be less penalized in evaluation -- Added :class:`~montreal_forced_aligner.data.WordData` and :class:`~montreal_forced_aligner.db.Pronunciation` data classes -- Refactored and simplified TextGrid export process -- Removed the ``multilingual_ipa`` mode in favor of a more general approach to better modeling phones -- Added functionality to evaluate alignments against golden alignment set -- Added the ability to compare alignments to a reference aligned, such as human annotated data. The evaluation will compute overlap score (sum of difference in aligned phone boundaries versus the reference phone boundaries) and overall phone error rate for each utterance. - -2.0.0b10 --------- - -- Changed the functionality of validating dictionary phones and acoustic model phones so that the aligner will simply ignore pronunciations containing phones not in the acoustic model (and print a warning). The validator utility will provide further detail on what was ignored. -- Fixed a bug where evaluation of training G2P models was not actually triggered -- Refactored PairNGramAligner into the :class:`~montreal_forced_aligner.g2p.trainer.PyniniTrainer` class to improve logging output -- Changed the starting index of training blocks with the same name. Old behavior was ``sat``, ``sat1``, ``sat2``, etc. The new behavior is ``sat``, ``sat2``, ``sat3``, etc. 
-- Revert a change with how sets, roots and extra questions are handled - -2.0.0b9 -------- - -- Fixed a bug where unknown word phones were showing up as blank -- Fixed a bug where TextGrid export would hang -- Fixed compatibility issues with Python 3.8 -- Added logging for when configuration parameters are ignored -- Added some functionality from the LibriSpeech recipe for triphone training with Arpabet - - - Not sure if it'll improve anything, but I'll run some tests and maybe extend it to other phone sets - -- Added better logging to TextGrid export -- Added new classes for managing collections of utterances, speakers, and files -- Fixed a bug where oovs were not being properly reported by the validation tool - -2.0.0b8 -------- - -- Refactored internal organization to rely on mixins more than monolithic classes, and moved internal functions to be organized by what they're used for instead of the general type. - - - For instance, there used to be a ``montreal_forced_aligner.multiprocessing`` module with ``alignment.py``, ``transcription.py``, etc that all did multiprocessing for various workers. Now that functionality is located closer to where it's used, i.e. ``montreal_forced_aligner.transcription.multiprocessing``. - - Mixins should allow for more easy extension to new use cases and allow for better configuration - -- Updated documentation to reflect the refactoring and did a pass over the User Guide -- Added the ability to change the location of root MFA directory based on the ``MFA_ROOT_DIR`` environment variable -- Fixed an issue where the version was incorrectly reported as "2.0.0" - -2.0.0b5 -------- - -- Documentation refresh! Docs now use the :xref:`pydata_sphinx_theme` and should have a better landing page and flow, as well as up to date API reference -- Some refactoring to use type hinting and abstract class interfaces (still a work in progress) - - -2.0.0b4 -------- - -- Massive refactor to a proper class-based API for interacting with MFA corpora - - - Sorry, I really do hope this is the last big refactor of 2.0 - - montreal_forced_aligner.corpus.classes.Speaker, :class:`~montreal_forced_aligner.corpus.classes.FileData`, and :class:`~montreal_forced_aligner.corpus.classes.UtteranceData` have dedicated classes rather than having their information split across dictionaries mimicking Kaldi files, so they should be more useful for interacting with outside of MFA - - Added :class:`~montreal_forced_aligner.corpus.multiprocessing.Job` class as well to make it easier to generate and keep track of information about different processes -- Updated installation style to be more dependent on conda-forge packages - - - Kaldi and MFA are now on conda-forge! |:tada:| - -- Added a :code:`mfa model` command for inspecting, listing, downloading, and saving pretrained models, see :ref:`pretrained_models` for more information. -- Fixed a bug where saving command history with errors would throw an error of its own -- Fixed an issue where one Job could process another Job's data, result in an error -- Updated API documentation to reflect refactor changes - - -2.0.0b3 -------- - -- Fixed a bug involving non-escaped orthographic characters -- Improved SAT alignment with speaker-independent alignment model -- Fixed a bug where models would not function properly if they were renamed -- Added a history subcommand to list previous commands - -2.0.0b1 -------- - -- Fixed bug in training (:mfa_pr:`337`) -- Fixed bug when using Ctrl-C in loading - -2.0.0b0 -------- - -Beta release! 
- -- Fixed an issue in transcription when using a .ARPA language model rather than one built in MFA -- Fixed an issue in parsing filenames containing spaces -- Added a ``mfa configure`` command to set global options. Users can now specify a new default for arguments like ``--num_jobs``, ``--clean`` or ``--temp_directory``, see :ref:`configuration` for more details. -- Added a new flag for overwriting output files. By default now, MFA will not output files if the path already exists, and will instead write to a directory in the temporary directory. You can revert this change by running ``mfa configure --always_overwrite`` -- Added a ``--disable_textgrid_cleanup`` flag to disable for post-processing that MFA has implemented recently (not outputting silence labels and recombining subwords that got split up as part of dictionary look up). You can set this to be the default by running ``mfa configure --disable_textgrid_cleanup`` -- Refactored and optimized the TextGrid export process to use multiple processes by default, you should be significant speed ups. -- Removed shorthand flags for ``-c`` and ``-d`` since they could represent multiple different flags/arguments. - -.. _2.0a: - -2.0 alpha releases -================== - -2.0.0a24 --------- - -- Fixed some miscellaneous bugs and cleaned up old and unused code - -2.0.0a23 --------- - -- Fix bugs in transcription and aligning with using multiple dictionaries -- Fixed an issue where filenames were output with ``-`` rather than ``_`` if they originally had them -- Changed how output text different from input text when there was a compound marker (i.e., ``-``), these should now - have a single interval for the whole compound rather than two intervals for each subword -- Changed how OOV items are output, so they will be present in the output rather than ```` - -2.0.0a22 --------- - -- Add support for aligning mp3 files -- Fix for log error in 0 probability entries in probabilistic lexicons -- Add support for multilingual IPA mode -- Add support for specifying per-speaker pronunciation dictionaries (see :ref:`speaker_dictionaries` for more details) -- Fixed cases where TextGrid parsing errors were misattributed to sound file issues, and these should be properly detected - by the validator now -- Add check for system version of libc to provide a more informative error message with next steps for compiling Kaldi on - the user's machine -- Update annotator utility to have autosave on exit -- Fixed cases where not all phones in a dictionary were present in phone_mapping -- Changed TextGrid export to not put "sp" or "sil" in the phone tier - -2.0.0a21 --------- - -- Fixed a memory leak in corpus parsing introduced by 2.0.0a20 - -2.0.0a20 --------- - -- Fixed an issue with :code:`create_segments` where it would assue singular speakers -- Fixed a race condition in multiprocessing where the queue could finish with the jobs still running and unable to join -- Updated transcription to use a small language model for first pass decoding followed by LM rescoring in line with Kaldi recipes -- Added an optional :code:`--audio_directory` argument for finding sound files in a directory separate from the transcriptions -- Added perplexity calculations for language model training -- Updated annotator GUI to support new improvements, mainly playback of :code:`.flac` files -- Added annotator GUI functionality for showing all speaker tiers -- Added annotator GUI functionality for changing speakers of utterances by clicking and dragging them -- Updated annotator GUI to no 
longer aggressively zoom when selecting, merging, or splitting utterances, instead zoom - functionality is achieved through double clicks - - -2.0.0a19 --------- - -- Fixed a bug where command line arguments were not being correctly passed to ``train`` and other commands - -2.0.0a18 --------- - -- Changes G2P model training validation to not do a full round of training after the validation model is trained -- Adds the ability to change in alignment config yamls the punctuation, clitic, and compound marker sets used in - sanitizing words in dictionary and corpus uses -- Changed configuration in G2P to fit with the model used in alignment, allow for configuration yamls to be passed, as - well as arguments from command line -- Fix a bug where floating point wav files could not be parsed - -2.0.0a17 --------- - -- Optimizes G2P model training for 0.3.6 and exposes :code:`--batch_size`, :code:`--max_iterations`, and :code:`--learning_rate` - from the command line -- Changes where models are stored to make them specific to the alignment run rather than storing them globally in the temporary - directory - -2.0.0a16 --------- - -- Changed how punctuation is stripped from beginning/end of words (:mfa_pr:`288`) -- Added more logging for alignment (validating acoustic models and generating overall log-likelihood of the alignment) -- Changed subsetting features prior to initializing monophone trainer to prevent erroneous error detection -- Fixed parsing of boolean arguments on command line to be passed to aligners - -2.0.0a15 --------- - -- Fixed a bug with dictionary parsing that misparsed clitics as words -- Added a :code:`--clean` flag for :code:`mfa g2p` and :code:`mfa train_g2p` to remove temporary files from - previous runs -- Added support for using :code:`sox` in feature generation, allowing for use of audio files other than WAV -- Switched library for TextGrid parsing from :code:`textgrid` to :code:`praatio`, allowing support for TextGrid files in - the short format. - -2.0.0a14 --------- - -- Fixed a bug in running fMMLR for speaker adaptation where utterances were not properly sorted (MFA now uses dashes to - separate elements in utterance names rather than underscores) - -2.0.0a13 --------- - -- Updated how sample rates are handled. MFA now generates features between 80 Hz and 7800 Hz and allows downsampling and - upsampling, so there will be no more errors or warnings about unsupported sample rates or speakers with different sample - rates -- Fixed a bug where some options for generating MFCCs weren't properly getting picked up (e.g., snip-edges) -- (EXPERIMENTAL) Added better support for varying frame shift. In :code:`mfa align`, you can now add a flag of :code:`--frame_shift 1` to align - with millisecond shifts between frames. Please note this is more on the experimental side, as it increases computational - time significantly and I don't know fully the correct options to use for :code:`self_loop_scale`, :code:`transition_scale`, - and :code:`acoustic_scale` to generate good alignments. -- Fixed a bug in G2P training with relative paths for output model -- Cleaned up validator output - -2.0.0a11 --------- - -- Fixed a bug in analyzing unaligned utterances introduced by changes in segment representation - -2.0.0a9 -------- - -- Fixed a bug when loading :code:`utterance_lengths.scp` from previous failed runs -- Added the ability to generate multiple pronunciations per word when running G2P, see the extra options in - :ref:`g2p_dictionary_generating` for more details. 
- -2.0.0a8 -------- - -- Fixed a bug in generating alignments for TextGrid corpora - -2.0.0a7 -------- - -- Upgraded dependency of Pynini version to 2.1.4, please update package versions via :code:`conda upgrade -c conda-forge openfst pynini ngram baumwelch` - if you had previously installed MFA. -- Allowed for splitting clitics on multiple apostrophes -- Fixed bug in checking for brackets in G2P (:mfa_pr:`235`) -- Updated Annotator utility (:ref:`anchor` for more details) to be generally more usable for TextGrid use cases and - adjusting segments and their transcriptions -- Improved handling of stereo files with TextGrids so that MFA doesn't need to generate temporary files for each channel - -2.0.0a5 -------- - -- Fixed a bug in feature where sorting was not correct due to lack of speakers at the beginnings - of utterances -- Fixed a bug where alignment was not performing speaker adaptation correctly -- Added a flag to :code:`align` command to disable speaker adaptation if desired -- Fixed a bug where the aligner was not properly ignored short utterances (< 0.13 seconds) -- Changed the internal handling of stereo files to use :code:`_channelX` rather than :code:`_A` and :code:`_B` -- Add a :code:`version` subcommand to output the version - -2.0.0a4 -------- - -- Fixed a corpus parsing bug introduced by new optimized parsing system in 2.0.0a3 - -2.0.0a3 -------- - -- Further optimized corpus parsing algorithm to use multiprocessing and to load from saved files in temporary directories -- Revamped and fixed training using subsets of the corpora -- Fixed issue with training LDA systems -- Fixed a long-standing issue with words being marked as OOV due to improperly parsing clitics -- Updated logging to better capture when errors occur due to Kaldi binaries to better locate sources of issues - -2.0.0 ------ - -Currently under development with major changes, see :ref:`whats_new_2_0`. - -- Fixed a bug in dictionary parsing that caused initial numbers in pronunciations to be misparsed and ignored -- Updated sound file parsing to use PySoundFile rather than inbuilt wave module, which should lead to more informative error - messages for files that do not meet Kaldi's input requirements -- Removed multiprocessing from speaker adaptation, as the executables use multiple threads leading to a bottleneck in - performance. This change should result in faster speaker adaptation. -- Optimized corpus parsing algorithm to be O(n log n) instead of O(n^2) (:mfa_pr:`194`) diff --git a/docs/source/changelog/changelog_2.0_pre_release.rst b/docs/source/changelog/changelog_2.0_pre_release.rst new file mode 100644 index 00000000..83bb1bdc --- /dev/null +++ b/docs/source/changelog/changelog_2.0_pre_release.rst @@ -0,0 +1,394 @@ + +.. _changelog_2.0_pre: + +************************* +2.0 Pre-release Changelog +************************* + +.. 
_2.0r: + +Release candidates +================== + +2.0.0rc9 +-------- + +- Fixed a bug where exporting TextGrids would fail if any were empty :github_issue:`459` + +2.0.0rc8 +-------- + +- Fixed a bug where G2P output was not correctly converted to strings :github_issue:`448` +- Fixed a bug where specifying conda or temporary directories with spaces would cause crashes :github_issue:`450` +- Fixed a crash with unspecified github_token values for ``mfa model`` commands +- Added a utility function for :ref:`validating_dictionaries` +- Fixed a bug where errors in multiprocessing workers were not properly raised by the main thread, obscuring the source of errors :github_issue:`452` +- Fixed an error in parsing text files in corpora for G2P generation or language model training :github_issue:`449` +- Added an experimental training flag for training a G2P model as part of the acoustic model training +- Fixed a bug where models trained in 1.0 would not use speaker adaptation during alignment or transcription +- Add support for exporting original text alongside word and phone alignments :github_issue:`414` +- Fixed an issue with transcribing using multiple dictionaries + +2.0.0rc7 +-------- + +- Fixed a bug where silence correction was not being calculated correctly +- Fixed a bug where sample rate could not be specified when not using multiprocessing :github_pr:`444` +- Fixed an incompatibility with the Kaldi version 1016 where BLAS libraries were not operating in single-threaded mode +- Further optimized large multispeaker dictionary loading +- Fixed a bug where subsets were not properly generated when multiple dictionaries were used + +2.0.0rc6 +-------- + +- Reverted the default export type to ``long_textgrid``, which can be changed to ``short_textgrid`` or ``json`` via the ``--output_format`` flag for commands that export TextGrids :github_issue:`434` +- Added more information for when malformed dictionary lines fail to parse (i.e., lines with just tabs on them) :github_issue:`411` +- Fixed a bug where phones with underscores in them would cause export to crash :github_issue:`432` +- Changed the overwrite behavior in export to specifically avoid overwriting input files, rather than testing the existence of the overall output directory :github_issue:`431` +- Added additional initial check to ensure that Kaldi and OpenFst binaries can be successfully invoked, rather than throwing an unintuitive error during feature creation +- Optimized initial load and TextGrid export :github_issue:`437` and :github_issue:`249` +- Allow for dictionaries with the same base name in different locations to be used side-by-side :github_issue:`417` +- Fixed a bug where initial silence was not being properly handled if silence probability training had not been done +- Removed PronunciationDictionaryMixin and PronunciationDictionary classes and refactored functionality into :class:`~montreal_forced_aligner.dictionary.multispeaker.MultispeakerDictionaryMixin` and :class:`~montreal_forced_aligner.db.Dictionary` +- Fixed a bug where alignment models would not be adapted during adaptation :github_issue:`421` + +2.0.0rc5 +-------- + +- Fixed a bug where a list of downloadable models wasn't getting output for commands like ``mfa models download acoustic`` +- Added option to specify ``--output_format`` for exporting alignments for ``short_textgrids`` (the default to save space), ``long_textgrids`` (original behavior), or ``json`` + +2.0.0rc4 +-------- + +- Added ``--quiet`` flag to suppress printing output to the console +- 
Added ability to specify ``pronunciation_probabilities`` in training blocks where probabilities of pronunciation variants and their probabilities of appearing before/after silence will be calculated based on alignment at that stage. The lexicon files will be regenerated and use these probabilities for later training blocks
+- Added a flag to export per-pronunciation silence probabilities to :ref:`training_dictionary`
+- Added a flag to :ref:`transcribing` for specifying the language model weight and word insertion penalties to speed up evaluation of transcripts
+- Added a final SAT training block equivalent to the :kaldi_steps:`train_quick` script
+- Added early stopping of SAT training blocks if the corpus size is below the specified subset (at least two rounds of SAT training will be performed)
+- Refactored how transcription parsing is done, so that you can specify word break characters other than whitespace (i.e., instances of ``.`` or ``?`` embedded in words that are typos in the corpus)
+- Refactored quotations and clitic markers, so if there happens to be a word like ``kid'``, MFA can recover the word ``kid`` from it. If there is no word entry for ``kid``, or if ``kid'`` is in the dictionary, the apostrophe will be kept.
+- Refactored the ``--test_transcription`` functionality of :ref:`validating_data` to use small language models built from all transcripts of a speaker, mixed with an even smaller language model per utterance, following :kaldi_steps:`cleanup/make_biased_lm_graphs`.
+- Refactored how internal storage is done to use a sqlite database rather than having everything in memory. Bigger corpora should not need as much memory when aligning/training.
+- Fixed an issue in lexicon construction where explicit silences were not being respected (:github_issue:`392`)
+- Fixed an issue in training where initial gaussians were not being properly used
+- Changed the behavior of assigning speakers to jobs, so that it now tries to balance the number of utterances across jobs
+- Changed the default topology to allow for more variable length phones (minimum duration is now one frame, 10ms by default)
+- Changed how models and dictionaries are downloaded with the changes to the `MFA Models `_
+- Added the ability to use pitch features for models, with the ``--use_pitch`` flag or configuration option
+- Added a ``[bracketed]`` word that will capture any transcriptions like ``[wor-]`` or ````, as these are typically restarts, hesitations, speech errors, etc. that have separate characteristics compared to a word that happens to not be in the dictionary. The same phone is used for both, but having a separate word symbol allows silence probabilities to be modelled separately.
+- Added words for ``[laugh]`` and ``[laughter]`` to capture laughter annotations as separate from both OOV ```` items and ``[bracketed]`` words. As with ``[bracketed]``, the laughter words use the same ``spn`` phone, but allow for separate silence probabilities.
+- Fixed a bug where models trained in earlier versions were not correctly reporting their phone set (:github_issue:`422`)
+- Fixed a bug where TextGrids were never exported to the specified output directory without ``--overwrite`` (:github_issue:`408`)
+- Fixed a bug where spaces in sound file names would throw an error for that file (:github_issue:`407`)
+
+
+2.0.0rc3
+--------
+- Fixed a bug where textgrids weren't being properly generated following training
+- Fixed a bug where commands were not always respecting ``--overwrite``
+- Fixed a bug where not all words in multispeaker dictionaries would be parsed
+- Improved transcription accuracy calculation to account for compounds and clitics
+- Fixed a crash when subsetting corpora that did not all have transcriptions
+
+2.0.0rc2
+--------
+- Added configuration parameter (``ignore_case=False``) to allow for disabling the default behavior of making all text and lexicon entries lower case
+- Added some metadata about training data to acoustic models
+
+2.0.0rc1
+--------
+
+- Getting closer to stable release!
+- Fixed some bugs in how transcription and alignment accuracy were calculated
+- Added additional information to evaluation output files
+- Added file listing average per-frame log-likelihoods by utterance for alignment
+- Fixed a bug where having "" in a transcript would cause MFA to crash
+
+.. _2.0b:
+
+Beta releases
+=============
+
+2.0.0b11
+--------
+
+- Re-optimized corpus loading following the switch to a more class-based API.
+- Optimized validation, particularly when acoustics are being ignored
+- Added better progress bars for corpus loading, acoustic modeling, G2P training, transcription and alignment
+- Changed the default behavior of G2P generation to use a threshold system rather than returning a single top pronunciation. The threshold defaults to 0.99, but can be specified through ``--g2p_threshold``. Specifying a number of pronunciations will override this behavior (use ``--num_pronunciation 1`` for the old behavior).
+- Changed the behavior of G2P evaluation to check whether the generated hypothesis is in the golden pronunciation set, so languages with pronunciation variation will be less penalized in evaluation
+- Added :class:`~montreal_forced_aligner.data.WordData` and :class:`~montreal_forced_aligner.db.Pronunciation` data classes
+- Refactored and simplified TextGrid export process
+- Removed the ``multilingual_ipa`` mode in favor of a more general approach to better modeling phones
+- Added functionality to evaluate alignments against a golden alignment set
+- Added the ability to compare alignments to a reference alignment, such as human annotated data. The evaluation will compute an overlap score (sum of differences in aligned phone boundaries versus the reference phone boundaries) and overall phone error rate for each utterance.
+
+2.0.0b10
+--------
+
+- Changed the functionality of validating dictionary phones and acoustic model phones so that the aligner will simply ignore pronunciations containing phones not in the acoustic model (and print a warning). The validator utility will provide further detail on what was ignored.
+- Fixed a bug where evaluation of training G2P models was not actually triggered
+- Refactored PairNGramAligner into the :class:`~montreal_forced_aligner.g2p.trainer.PyniniTrainer` class to improve logging output
+- Changed the starting index of training blocks with the same name. Old behavior was ``sat``, ``sat1``, ``sat2``, etc. The new behavior is ``sat``, ``sat2``, ``sat3``, etc.
+- Revert a change with how sets, roots and extra questions are handled
+
+2.0.0b9
+-------
+
+- Fixed a bug where unknown word phones were showing up as blank
+- Fixed a bug where TextGrid export would hang
+- Fixed compatibility issues with Python 3.8
+- Added logging for when configuration parameters are ignored
+- Added some functionality from the LibriSpeech recipe for triphone training with Arpabet
+
+  - Not sure if it'll improve anything, but I'll run some tests and maybe extend it to other phone sets
+
+- Added better logging to TextGrid export
+- Added new classes for managing collections of utterances, speakers, and files
+- Fixed a bug where OOVs were not being properly reported by the validation tool
+
+2.0.0b8
+-------
+
+- Refactored internal organization to rely on mixins more than monolithic classes, and moved internal functions to be organized by what they're used for instead of the general type.
+
+  - For instance, there used to be a ``montreal_forced_aligner.multiprocessing`` module with ``alignment.py``, ``transcription.py``, etc. that all did multiprocessing for various workers. Now that functionality is located closer to where it's used, i.e. ``montreal_forced_aligner.transcription.multiprocessing``.
+  - Mixins should allow for easier extension to new use cases and allow for better configuration
+
+- Updated documentation to reflect the refactoring and did a pass over the User Guide
+- Added the ability to change the location of the root MFA directory based on the ``MFA_ROOT_DIR`` environment variable
+- Fixed an issue where the version was incorrectly reported as "2.0.0"
+
+2.0.0b5
+-------
+
+- Documentation refresh! Docs now use the :xref:`pydata_sphinx_theme` and should have a better landing page and flow, as well as an up to date API reference
+- Some refactoring to use type hinting and abstract class interfaces (still a work in progress)
+
+
+2.0.0b4
+-------
+
+- Massive refactor to a proper class-based API for interacting with MFA corpora
+
+  - Sorry, I really do hope this is the last big refactor of 2.0
+  - montreal_forced_aligner.corpus.classes.Speaker, :class:`~montreal_forced_aligner.corpus.classes.FileData`, and :class:`~montreal_forced_aligner.corpus.classes.UtteranceData` have dedicated classes rather than having their information split across dictionaries mimicking Kaldi files, so they should be more useful for interacting with outside of MFA
+  - Added :class:`~montreal_forced_aligner.corpus.multiprocessing.Job` class as well to make it easier to generate and keep track of information about different processes
+- Updated installation style to be more dependent on conda-forge packages
+
+  - Kaldi and MFA are now on conda-forge! |:tada:|
+
+- Added a :code:`mfa model` command for inspecting, listing, downloading, and saving pretrained models; see :ref:`pretrained_models` for more information.
+- Fixed a bug where saving command history with errors would throw an error of its own
+- Fixed an issue where one Job could process another Job's data, resulting in an error
+- Updated API documentation to reflect refactor changes
+
+
+2.0.0b3
+-------
+
+- Fixed a bug involving non-escaped orthographic characters
+- Improved SAT alignment with speaker-independent alignment model
+- Fixed a bug where models would not function properly if they were renamed
+- Added a history subcommand to list previous commands
+
+2.0.0b1
+-------
+
+- Fixed bug in training (:mfa_pr:`337`)
+- Fixed bug when using Ctrl-C in loading
+
+2.0.0b0
+-------
+
+Beta release!
+
+- Fixed an issue in transcription when using a .ARPA language model rather than one built in MFA
+- Fixed an issue in parsing filenames containing spaces
+- Added a ``mfa configure`` command to set global options. Users can now specify a new default for arguments like ``--num_jobs``, ``--clean`` or ``--temp_directory``; see :ref:`configuration` for more details.
+- Added a new flag for overwriting output files. By default now, MFA will not output files if the path already exists, and will instead write to a directory in the temporary directory. You can revert this change by running ``mfa configure --always_overwrite``
+- Added a ``--disable_textgrid_cleanup`` flag to disable post-processing that MFA has implemented recently (not outputting silence labels and recombining subwords that got split up as part of dictionary look up). You can set this to be the default by running ``mfa configure --disable_textgrid_cleanup``
+- Refactored and optimized the TextGrid export process to use multiple processes by default; you should see significant speed ups.
+- Removed shorthand flags for ``-c`` and ``-d`` since they could represent multiple different flags/arguments.
+
+.. _2.0a:
+
+2.0 alpha releases
+==================
+
+2.0.0a24
+--------
+
+- Fixed some miscellaneous bugs and cleaned up old and unused code
+
+2.0.0a23
+--------
+
+- Fix bugs in transcription and alignment when using multiple dictionaries
+- Fixed an issue where filenames were output with ``-`` rather than ``_`` if they originally had them
+- Changed how output text differed from input text when there was a compound marker (i.e., ``-``); these should now
+  have a single interval for the whole compound rather than two intervals for each subword
+- Changed how OOV items are output, so they will be present in the output rather than ````
+
+2.0.0a22
+--------
+
+- Add support for aligning mp3 files
+- Fix for log error in 0 probability entries in probabilistic lexicons
+- Add support for multilingual IPA mode
+- Add support for specifying per-speaker pronunciation dictionaries (see :ref:`speaker_dictionaries` for more details)
+- Fixed cases where TextGrid parsing errors were misattributed to sound file issues, and these should be properly detected
+  by the validator now
+- Add check for system version of libc to provide a more informative error message with next steps for compiling Kaldi on
+  the user's machine
+- Update annotator utility to have autosave on exit
+- Fixed cases where not all phones in a dictionary were present in phone_mapping
+- Changed TextGrid export to not put "sp" or "sil" in the phone tier
+
+2.0.0a21
+--------
+
+- Fixed a memory leak in corpus parsing introduced by 2.0.0a20
+
+2.0.0a20
+--------
+
+- Fixed an issue with :code:`create_segments` where it would assume singular speakers
+- Fixed a race condition in multiprocessing where the queue could finish with the jobs still running and unable to join
+- Updated transcription to use a small language model for first pass decoding followed by LM rescoring in line with Kaldi recipes
+- Added an optional :code:`--audio_directory` argument for finding sound files in a directory separate from the transcriptions
+- Added perplexity calculations for language model training
+- Updated annotator GUI to support new improvements, mainly playback of :code:`.flac` files
+- Added annotator GUI functionality for showing all speaker tiers
+- Added annotator GUI functionality for changing speakers of utterances by clicking and dragging them
+- Updated annotator GUI to no longer aggressively zoom when selecting, merging, or splitting utterances; instead, zoom
+  functionality is achieved through double clicks
+
+
+2.0.0a19
+--------
+
+- Fixed a bug where command line arguments were not being correctly passed to ``train`` and other commands
+
+2.0.0a18
+--------
+
+- Changes G2P model training validation to not do a full round of training after the validation model is trained
+- Adds the ability to change, in alignment config yamls, the punctuation, clitic, and compound marker sets used in
+  sanitizing words in the dictionary and corpus
+- Changed configuration in G2P to fit with the model used in alignment, allowing for configuration yamls to be passed, as
+  well as arguments from the command line
+- Fix a bug where floating point wav files could not be parsed
+
+2.0.0a17
+--------
+
+- Optimizes G2P model training for 0.3.6 and exposes :code:`--batch_size`, :code:`--max_iterations`, and :code:`--learning_rate`
+  from the command line
+- Changes where models are stored to make them specific to the alignment run rather than storing them globally in the temporary
+  directory
+
+2.0.0a16
+--------
+
+- Changed how punctuation is stripped from beginning/end of words (:mfa_pr:`288`)
+- Added more logging for alignment (validating acoustic models and generating overall log-likelihood of the alignment)
+- Changed subsetting features prior to initializing monophone trainer to prevent erroneous error detection
+- Fixed parsing of boolean arguments on command line to be passed to aligners
+
+2.0.0a15
+--------
+
+- Fixed a bug with dictionary parsing that misparsed clitics as words
+- Added a :code:`--clean` flag for :code:`mfa g2p` and :code:`mfa train_g2p` to remove temporary files from
+  previous runs
+- Added support for using :code:`sox` in feature generation, allowing for use of audio files other than WAV
+- Switched library for TextGrid parsing from :code:`textgrid` to :code:`praatio`, allowing support for TextGrid files in
+  the short format.
+
+2.0.0a14
+--------
+
+- Fixed a bug in running fMLLR for speaker adaptation where utterances were not properly sorted (MFA now uses dashes to
+  separate elements in utterance names rather than underscores)
+
+2.0.0a13
+--------
+
+- Updated how sample rates are handled. MFA now generates features between 80 Hz and 7800 Hz and allows downsampling and
+  upsampling, so there will be no more errors or warnings about unsupported sample rates or speakers with different sample
+  rates
+- Fixed a bug where some options for generating MFCCs weren't properly getting picked up (e.g., snip-edges)
+- (EXPERIMENTAL) Added better support for varying frame shift. In :code:`mfa align`, you can now add a flag of :code:`--frame_shift 1` to align
+  with millisecond shifts between frames. Please note this is more on the experimental side, as it increases computational
+  time significantly and I don't fully know the correct options to use for :code:`self_loop_scale`, :code:`transition_scale`,
+  and :code:`acoustic_scale` to generate good alignments.
+- Fixed a bug in G2P training with relative paths for output model
+- Cleaned up validator output
+
+2.0.0a11
+--------
+
+- Fixed a bug in analyzing unaligned utterances introduced by changes in segment representation
+
+2.0.0a9
+-------
+
+- Fixed a bug when loading :code:`utterance_lengths.scp` from previous failed runs
+- Added the ability to generate multiple pronunciations per word when running G2P; see the extra options in
+  :ref:`g2p_dictionary_generating` for more details.
+ +2.0.0a8 +------- + +- Fixed a bug in generating alignments for TextGrid corpora + +2.0.0a7 +------- + +- Upgraded dependency of Pynini version to 2.1.4, please update package versions via :code:`conda upgrade -c conda-forge openfst pynini ngram baumwelch` + if you had previously installed MFA. +- Allowed for splitting clitics on multiple apostrophes +- Fixed bug in checking for brackets in G2P (:mfa_pr:`235`) +- Updated Annotator utility (:ref:`anchor` for more details) to be generally more usable for TextGrid use cases and + adjusting segments and their transcriptions +- Improved handling of stereo files with TextGrids so that MFA doesn't need to generate temporary files for each channel + +2.0.0a5 +------- + +- Fixed a bug in feature where sorting was not correct due to lack of speakers at the beginnings + of utterances +- Fixed a bug where alignment was not performing speaker adaptation correctly +- Added a flag to :code:`align` command to disable speaker adaptation if desired +- Fixed a bug where the aligner was not properly ignored short utterances (< 0.13 seconds) +- Changed the internal handling of stereo files to use :code:`_channelX` rather than :code:`_A` and :code:`_B` +- Add a :code:`version` subcommand to output the version + +2.0.0a4 +------- + +- Fixed a corpus parsing bug introduced by new optimized parsing system in 2.0.0a3 + +2.0.0a3 +------- + +- Further optimized corpus parsing algorithm to use multiprocessing and to load from saved files in temporary directories +- Revamped and fixed training using subsets of the corpora +- Fixed issue with training LDA systems +- Fixed a long-standing issue with words being marked as OOV due to improperly parsing clitics +- Updated logging to better capture when errors occur due to Kaldi binaries to better locate sources of issues + +2.0.0 +----- + +Currently under development with major changes, see :ref:`whats_new_2_0`. + +- Fixed a bug in dictionary parsing that caused initial numbers in pronunciations to be misparsed and ignored +- Updated sound file parsing to use PySoundFile rather than inbuilt wave module, which should lead to more informative error + messages for files that do not meet Kaldi's input requirements +- Removed multiprocessing from speaker adaptation, as the executables use multiple threads leading to a bottleneck in + performance. This change should result in faster speaker adaptation. +- Optimized corpus parsing algorithm to be O(n log n) instead of O(n^2) (:mfa_pr:`194`) diff --git a/docs/source/changelog/index.rst b/docs/source/changelog/index.rst index 9ad58a96..babf197c 100644 --- a/docs/source/changelog/index.rst +++ b/docs/source/changelog/index.rst @@ -148,4 +148,5 @@ The functionality of :code:`mfa_generate_dictionary` has been expanded. :hidden: changelog_2.0.rst + changelog_2.0_pre_release.rst changelog_1.0.rst diff --git a/docs/source/user_guide/concepts/features.rst b/docs/source/user_guide/concepts/features.rst new file mode 100644 index 00000000..37e5c935 --- /dev/null +++ b/docs/source/user_guide/concepts/features.rst @@ -0,0 +1,22 @@ + + +.. _acoustic_features: + +Acoustic features +================= + +.. warning:: + + Still under construction, I hope to fill these sections out as I have time. + + +.. _features_mfcc: + +Mel-Frequency Cepstrum Coefficients +----------------------------------- + + +.. 
_features_pitch: + +Pitch +----- diff --git a/docs/source/user_guide/concepts/fst.rst b/docs/source/user_guide/concepts/fst.rst new file mode 100644 index 00000000..275b3f64 --- /dev/null +++ b/docs/source/user_guide/concepts/fst.rst @@ -0,0 +1,41 @@ + +.. _fst: + +Finite State Transducers +======================== + +.. warning:: + + Still under construction, I hope to fill these sections out as I have time. + +.. seealso:: + + `OpenFst Quick Tour `_ + +.. _acceptor: + +Acceptors +--------- + +.. _wfst: + +Weighted Finite State Transducers +--------------------------------- + + +.. _lexicon_fst: + +Lexicon FSTs +============ + +MFA compiles input pronunciation dictionaries to a Weighted Finite State Transducer (:term:`WFST`), with phones as input symbols and words as output symbols. During alignment, the :term:`lexicon FST` is composed with a linear acceptor created from the + +.. _grammar_fst: + +Grammar FSTs +============ + +.. _g2p_fst: + +G2P FSTs +======== diff --git a/docs/source/user_guide/concepts/hmm.rst b/docs/source/user_guide/concepts/hmm.rst new file mode 100644 index 00000000..e9da1e4a --- /dev/null +++ b/docs/source/user_guide/concepts/hmm.rst @@ -0,0 +1,17 @@ + + +.. _hmm: + +Hidden Markov Models +==================== + +.. warning:: + + Still under construction, I hope to fill these sections out as I have time. + + +Standard topology +----------------- + +MFA topology +------------ diff --git a/docs/source/user_guide/concepts/index.rst b/docs/source/user_guide/concepts/index.rst new file mode 100644 index 00000000..f925bca0 --- /dev/null +++ b/docs/source/user_guide/concepts/index.rst @@ -0,0 +1,19 @@ + +.. _concepts: + +*************** +Concepts in MFA +*************** + +This section will attempt to provide a blend of technical and non-technical overviews of various components and concepts used in MFA. There are much more in-depth resources for learning about various components that will be linked if you are interested in learning more about them. + +.. warning:: + + Still under construction, I hope to fill these sections out as I have time. + +.. toctree:: + :hidden: + + features + fst + hmm diff --git a/docs/source/user_guide/corpus_creation/training_dictionary.rst b/docs/source/user_guide/corpus_creation/training_dictionary.rst index 9093d21f..0472a625 100644 --- a/docs/source/user_guide/corpus_creation/training_dictionary.rst +++ b/docs/source/user_guide/corpus_creation/training_dictionary.rst @@ -1,6 +1,7 @@ .. _`Chen et al (2015)`: https://www.danielpovey.com/files/2015_interspeech_silprob.pdf .. _`English US MFA dictionary`: https://mfa-models.readthedocs.io/en/latest/dictionary/English/English%20%28US%29%20MFA%20dictionary%20v2_0_0a.html#English%20(US)%20MFA%20dictionary%20v2_0_0a +.. _`Japanese MFA dictionary`: https://mfa-models.readthedocs.io/en/latest/dictionary/Japanese/Japanese%20MFA%20dictionary%20v2_0_0.html#Japanese%20MFA%20dictionary%20v2_0_0 .. _training_dictionary: @@ -13,7 +14,16 @@ The implementation used here follow Kaldi's :kaldi_steps:`get_prons`, :kaldi_uti .. seealso:: - For a more in depth description of the algorithm, see the `Chen et al (2015)`_. + Refer to the :ref:`lexicon FST concept section ` for an introduction and overview of how MFA compiles pronunciation dictionaries to a :term:`WFST`. The algorithm and calculations below are based on `Chen et al (2015)`_. + +Consider the following :term:`WFST` with two pronunciations of "because" from the trained `English US MFA dictionary`_. + + + .. 
figure:: ../../_static/because.svg + :align: center + :alt: :term:`FST` for two pronunciations of "the" in the English US dictionary + +In the above figure, there are are two final states, with 0 corresponding to a word preceded by ``non-silence`` and 1 corresponding to a word preceded by ``silence``. The costs associated with each transition are negative log-probabilities, so that less likely paths cost more. The state 0 refers to the beginning of speech, so the paths to the silence and non silence state are equal in this case. The cost for ending on silence is lower at -0.77 than ending on non-silence with a cost of 1.66, meaning that most utterances in the training data had trailing silence at the end of the recordings. .. _train_pronunciation_probability: @@ -26,7 +36,7 @@ Pronunciation probabilities are estimated based on the counts of a specific pron p(w.p_{i} | w) = \frac{c(w.p_{i} | w)}{max_{1\le i \le N_{w}}c(w.p_{i} | w)} -The reason for using max normalization is to not penalize words with many pronunciations. Even though the probabilities no longer sum to 1, the log of the probabilities is used as summed costs in the lexicon FST, so summing to 1 within a word is not problematic. +The reason for using max normalization is to not penalize words with many pronunciations. Even though the probabilities no longer sum to 1, the log of the probabilities is used as summed costs in the :term:`lexicon FST`, so summing to 1 within a word is not problematic. If a word is not seen in the training data, pronunciation probabilities are not estimated for its pronunciations. @@ -35,18 +45,15 @@ If a word is not seen in the training data, pronunciation probabilities are not Silence probability and correction factors ------------------------------------------ -Words different in their likelihood to appear before or after silence. In English, a word like "the" is more likely to appear after silence than a word like "us". An pronoun in the accusative case like "us" is not grammatical as the start of a sentence or phrase, whereas "the" starts sentences and phrases regularly. That is not to say that a speaker would not pause before saying "us" for paralinguistic effect or due to a disfluency or simple pause, it's just less likely than for "the". +Words differ in their likelihood to appear before or after silence. In English, a word like "the" is more likely to appear after silence than a word like "us". An pronoun in the accusative case like "us" is not grammatical at the start of a sentence or phrase, whereas "the" starts sentences and phrases regularly. That is not to say that a speaker would not pause before saying "us" for paralinguistic effect or due to a disfluency or simple pause, it's just less likely to occur after silence than "the". -By the same token, silence following "the" is also less likely than for "us" due to syntax, but pauses are more likely to follow some pronunciations of "the" than others. For instance, if a speaker produces a full vowel variant like :ipa_inline:`[ð i]`, a pause is more likely to follow than a reduced variant like :ipa_line:`[ð ə]`. The reduced variant will be more likely overall, but it often occurs in running connected speech at normal speech rates. The full vowel variant is more likely to occur in less connected speech, such as when the speaker is planning upcoming speech or speaking more slowly. 
 
 If a word is not seen in the training data, pronunciation probabilities are not estimated for its pronunciations.
 
@@ -35,18 +45,15 @@
 
 Silence probability and correction factors
 ------------------------------------------
 
-Words different in their likelihood to appear before or after silence. In English, a word like "the" is more likely to appear after silence than a word like "us". An pronoun in the accusative case like "us" is not grammatical as the start of a sentence or phrase, whereas "the" starts sentences and phrases regularly. That is not to say that a speaker would not pause before saying "us" for paralinguistic effect or due to a disfluency or simple pause, it's just less likely than for "the".
+Words differ in their likelihood to appear before or after silence. In English, a word like "the" is more likely to appear after silence than a word like "us". A pronoun in the accusative case like "us" is not grammatical at the start of a sentence or phrase, whereas "the" starts sentences and phrases regularly. That is not to say that a speaker would not pause before saying "us" for paralinguistic effect or due to a disfluency or simple pause, it's just less likely to occur after silence than "the".
 
-By the same token, silence following "the" is also less likely than for "us" due to syntax, but pauses are more likely to follow some pronunciations of "the" than others. For instance, if a speaker produces a full vowel variant like :ipa_inline:`[ð i]`, a pause is more likely to follow than a reduced variant like :ipa_line:`[ð ə]`. The reduced variant will be more likely overall, but it often occurs in running connected speech at normal speech rates. The full vowel variant is more likely to occur in less connected speech, such as when the speaker is planning upcoming speech or speaking more slowly. Accounting for the likelihood of silence before and after a variant allows the model to output a variant that is less likely overall, but more likely given the context.
+By the same token, silence following "the" is also less likely than for "us" due to syntax, but pauses are more likely to follow some pronunciations of "the" than others. For instance, if a speaker produces a full vowel variant like :ipa_inline:`[ð i]`, a pause is more likely to follow than a reduced variant like :ipa_inline:`[ð ə]`. The reduced variant will be more likely overall, but it often occurs in running connected speech at normal speech rates. The full vowel variant is more likely to occur in less connected speech, such as when the speaker is planning upcoming speech or speaking more slowly. Accounting for the likelihood of silence before and after a variant allows the model to output a variant that is less likely overall, but more likely given the context.
 
-Consider the following FST with two pronunciations of "because" from the `English US MFA dictionary`_.
+However, when we take into account more context, it is not just the single word that determines the likelihood of silence, but also the preceding/following words. The difficulty in estimating the overall likelihood of silence is that the lexicon FST is predicated on each word being independent and composable with any preceding/following word. Thus, for each word, we estimate a probability of silence following (independent of the following words), and two correction factors for silence and non-silence before. The two correction factors take into account the general likelihood of silence following each of the preceding words and give two factors that represent "is silence or not silence more likely than we would expect given the previous word". These factors are only an approximation, but they do help in alignment.
+
+.. note::
 
- .. figure:: ../../_static/because.svg
-    :align: center
-    :alt: FST for two pronunciations of "the" in the English US dictionary
-
-In the above, there are are two start states, with 0 corresponding to a word preceded by ``non-silence`` and 1 corresponding to a word preceded by ``silence``.
+    Probabilities of multi-word sequences are the domain of the :term:`grammar FST`; please refer to the :ref:`grammar FST concept section `.
 
 MFA uses three variables to capture the probabilities of silence before and after a pronunciation. The most straightforward is ``probability of silence following``, which is calculated as the count of instances where the word was followed by silence divided by the overall count of that pronunciation, with a smoothing factor. Reproducing equation 3 of `Chen et al (2015)`_:
 
 .. math::
 
     P(s_{r} | w.p) = \frac{C(w.p \: s) + \lambda_{2}P(s)}{C(w.p) + \lambda_{2}}
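+
+In code form, the smoothed estimate is a one-liner; the smoothing weight :math:`\lambda_{2}` and global silence rate :math:`P(s)` below are hypothetical values for illustration:
+
+.. code-block:: python
+
+   lambda_2 = 2.0  # hypothetical smoothing weight
+   p_s = 0.1       # hypothetical overall rate of silence following words
+   c_wp = 4        # count of the pronunciation w.p
+   c_wp_sil = 1    # count of w.p followed by silence
+   p_sil_following = (c_wp_sil + lambda_2 * p_s) / (c_wp + lambda_2)
+   # (1 + 0.2) / 6 = 0.2, pulled toward the low global silence rate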
 
-Given that we're using a lexicon where words are completely independent, modelling the silence before the pronunciation is a little tricky. The approach used in silprob is to estimate two correction factors for silence and non-silence before the pronunciation. These correction factors capture that for a given pronunciation, it is more or less likely than average to have silence. The factors are estimated as follows, reproducing equations 4-6 from `Chen et al (2015)`_:
+Given that we're using a lexicon where words are assumed to be completely independent, modelling the silence before the pronunciation is a little tricky. The approach used in `Chen et al (2015)`_ is to estimate two correction factors for silence and non-silence before the pronunciation. These correction factors capture whether, for a given pronunciation, silence is more or less likely than average. The factors are estimated as follows, reproducing equations 4-6 from `Chen et al (2015)`_:
 
 .. math::
 
@@ -67,12 +74,12 @@
 
 The estimate count :math:`\tilde{C}` represents a "mean" count of silence or non-silence preceding a given pronunciation, taking into account the likelihood of silence from the preceding pronunciation. The correction factors are weights on the FST transitions from silence and non-silence state.
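+
+To make the effect of the correction factors concrete, here is a sketch of how such factors translate into transition costs; the base cost is made up, and the factor values mirror the "the" example later on this page:
+
+.. code-block:: python
+
+   import math
+
+   base_cost = 2.0        # hypothetical -log probability of a pronunciation
+   sil_factor = 1.49      # correction factor for silence preceding
+   non_sil_factor = 0.67  # correction factor for non-silence preceding
+   # A factor above 1 lowers the cost of that transition; below 1 raises it
+   cost_from_silence = base_cost - math.log(sil_factor)          # ~1.60
+   cost_from_non_silence = base_cost - math.log(non_sil_factor)  # ~2.40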
 
-Consider the following FST with three pronunciations of "lot" from the `English US MFA dictionary`_.
+Consider the following :term:`FST` with three pronunciations of "lot" from the `English US MFA dictionary`_.
 
 .. figure:: ../../_static/lot.svg
     :align: center
-    :alt: FST for three pronunciations of "lot" in the English US dictionary
+    :alt: :term:`FST` for three pronunciations of "lot" in the English US dictionary
 
 
@@ -102,7 +109,7 @@ As an example, consider the following English and Japanese sentences:
 
     アカギツネ さん は 本 を 読んだ こと が たくさん あり ます けれども 読む べき 本 は まだまだ いっぱい 残って い ます
 
-A couple recordings of each language is below at different speaking rates
+For each of the above sentences (please pardon my Japanese), I recorded a normal speaking rate version and a fast speaking rate version. The two speech rates induce variation in pronunciation, as well as different pause placement. We'll then walk through the calculations that result in the final trained lexicon.
 
 .. tab-set::
 
@@ -162,7 +169,7 @@
        :align: center
        :alt: Waveform, spectrogram, and aligned labels for the fast reading of the Japanese text
 
-The following pronunciation dictionaries:
+For alignment, we use the following pronunciation dictionaries, taking pronunciation variants from the `English US MFA dictionary`_ and the `Japanese MFA dictionary`_.
 
 .. tab-set::
 
@@ -249,7 +256,7 @@ The following pronunciation dictionaries:
 
 The basic steps to calculating pronunciation and silence probabilities is as follows:
 
 1. Generate word-pronunciation pairs (along with silence labels) from the alignment lattices
-2. Use these pairs as input to :ref:`calculating pronunciation probability ` and :ref:`calculating silence probability `. See the results below for walk-throughs of results for various words.
+2. Use these pairs as input to :ref:`calculating pronunciation probability ` and :ref:`calculating silence probability `. See the results table below for a walk-through of results for various words across the two reading passage styles.
 
 .. tab-set::
 
@@ -304,23 +311,29 @@ The basic steps to calculating pronunciation and silence probabilities is as fol
 
     **Pronunciation probabilities**
 
-    Using the alignments above for the two speech rates, the word "red" has 0.99 pronunciation probability as that's the only pronunciation variant. The word "read" pronounced as :ipa_inline:`[ɹ ɛ d]` has 0.99 probability, as will the pronunciation as :ipa_inline:`[ɹ iː d]`, as they both appeared once in the sentence (and twice across the two speech rates), but note that it is not 0.5, as the probabilities are max-normalized. All other words will have one pronunciation with 0.99, if the have one realized pronunciation, unrealized pronunciations will have a smoothed probability close to 0, based on the number of pronunciations.
+    Using the alignments above for the two speech rates, the word "red" has 0.99 pronunciation probability as that's the only pronunciation variant. The word "read" pronounced as :ipa_inline:`[ɹ ɛ d]` has 0.99 probability, as does the pronunciation :ipa_inline:`[ɹ iː d]`, as they both appeared once in the sentence (and twice across the two speech rates), but note that it is not 0.5, as the probabilities are max-normalized. Both full and reduced forms of "but" (:ipa_inline:`[b ɐ t]` and :ipa_inline:`[b ɐ ʔ]`) have pronunciation probability of 0.99, as they each occur once across the passages.
 
-    **Probabilities of having silence following**
+    .. note::
 
-    The word "books" has a silence following probability of 0.34, as it only occurs before silence in the slower speech rate sentence. You might expect it to have a silence probability of 0.5, but recall from the equation of :math:`P(s_{r} | w.p)`, the smoothing factor is influenced by the overall rate of silence following words, which is quite low for the sentences with connected speech.
+        I'm not sure why the :ipa_inline:`[b ɐ ʔ]` variant is chosen over the :ipa_inline:`[b ə ɾ]`; this will require future investigation to determine a root cause.
 
-    The pronunciation of "read" as :ipa_inline:`[ɹ iː d]` has a higher probability of following silence of 0.59, as both instances of that pronunciation are followed by silence at the end of the sentence. The pronunciation of "read" as :ipa_inline:`[ɹ ɛ d]` will have a probability of following silence of 0.09, as the only instances are in the middle of speech in the first clause. Likewise, both full and reduced forms of "but" (:ipa_inline:`[b ɐ t]` and :ipa_inline:`[b ɐ ʔ]`) have pronunciation probability of 0.99.
+    All other words with one realized pronunciation will have that pronunciation at 0.99; unrealized pronunciations will have a smoothed probability close to 0, based on the number of pronunciations.
 
     .. note::
 
-    I'm not sure why the :ipa_inline:`[b ɐ ʔ]` variant is chosen over the :ipa_inline:`[b ə ɾ]`, but this could be an issue with the multi-dialectal model training having glottal stops more predicted for actual realizations of flap, or the English US MFA dictionary could benefit from more words ending in flap instead of just :ipa_inline:`[ʔ]`, :ipa_inline:`[t]`, and :ipa_inline:`[d]`, since only certain function/common words have final flapps.
+        "Unrealized pronunciations" refer to pronunciation variants that are not represented in the training data, i.e., for the word "to", only the :ipa_inline:`[t ə]` was used, so :ipa_inline:`[tʰ ʉː]`, :ipa_inline:`[tʰ ʊ]`, and :ipa_inline:`[ɾ ə]` are unrealized.
+
+    **Probabilities of having silence following**
+
+    The word "books" has a probability of silence following of 0.34, as it only occurs before silence in the slower speech rate sentence. You might expect it to have a silence probability of 0.5, but recall from the equation of :math:`P(s_{r} | w.p)`, the smoothing factor is influenced by the overall rate of silence following words, which is quite low for the sentences with connected speech.
+
+    The pronunciation of "read" as :ipa_inline:`[ɹ iː d]` has a higher probability of silence following of 0.59, as both instances of that pronunciation are followed by silence at the end of the sentence. The pronunciation of "read" as :ipa_inline:`[ɹ ɛ d]` will have a probability of following silence of 0.09, as the only instances are in the middle of speech in the first clause.
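+
+    As a rough check on the "books" number (with a hypothetical :math:`\lambda_{2} = 2` and global silence rate :math:`P(s) = 0.1`; the actual constants differ), one occurrence followed by silence out of two total gives:
+
+    .. math::
+
+        P(s_{r} | w.p) = \frac{1 + 2 \times 0.1}{2 + 2} = 0.3
+
+    which lands near the observed 0.34 rather than at the unsmoothed 0.5.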
 
-    **Probabilities of having silence before**
+    **Probabilities of having silence preceding**
 
-    The both pronunciations present of word "the" (:ipa_inline:`[ð iː]` and :ipa_inline:`[ð ə]`) has a silence before correction factor (1.49) greater than the non-silence correction factor (0.67), as it only appears after silence in both speech rates. With the non-silence correction factor below 1, the cost in the FST of transitioning out of the non-silence state will be much higher than transitioning out of the silence state. When the silence correction factor is greater than 1, the pronunciation is more likely following silence than you would expect given all the previous words, which will reduce the cost of transitioning out of the silence state.
+    Both attested pronunciations of the word "the" (:ipa_inline:`[ð iː]` and :ipa_inline:`[ð ə]`) have a silence preceding correction factor (1.49) greater than the non-silence correction factor (0.67), as the word only appears after silence in both speech rates. With the non-silence correction factor below 1, the cost in the FST of transitioning out of the non-silence state will be much higher than transitioning out of the silence state. When the silence correction factor is greater than 1, the pronunciation is more likely following silence than you would expect given all the previous words, which will reduce the cost of transitioning out of the silence state.
 
-    The fuller form of the word "but" (:ipa_inline:`[b ɐ t]`) has a silence before correction factor (1.28) greater than the non-silence correction factor (0.75), so the full form will have lower cost transitioning out of the silence state and than the non-silence state. On the other hand, the more reduced form :ipa_inline:`[b ɐ ʔ]` has the opposite patten, with a silence before correction factor (0.85) greater than the non-silence correction factor (1.13), so the reduced form will have a lower cost transitioning out of the non-silence state than the silence state.
+    The fuller form of the word "but" (:ipa_inline:`[b ɐ t]`) has a silence preceding correction factor (1.28) greater than the non-silence correction factor (0.75), so the full form will have a lower cost transitioning out of the silence state than the non-silence state. On the other hand, the more reduced form :ipa_inline:`[b ɐ ʔ]` has the opposite pattern, with a silence preceding correction factor (0.85) lower than the non-silence correction factor (1.13), so the reduced form will have a lower cost transitioning out of the non-silence state than the silence state.
 
     .. tab-item:: Japanese
diff --git a/docs/source/user_guide/glossary.rst b/docs/source/user_guide/glossary.rst
index 93b8899f..e489f646 100644
--- a/docs/source/user_guide/glossary.rst
+++ b/docs/source/user_guide/glossary.rst
@@ -30,6 +30,16 @@ Glossary
 
     MFCCs
         :abbr:`Mel-frequency cepstrum coefficients (MFCCs)` are the industry standard for acoustic features. The process involves windowing the acoustic waveform, scaling the frequencies into the Mel space (an auditory representation that gives more weight to lower frequencies over higher frequencies), and then performs a :abbr:`discrete cosine transform (DCT)` on the values in each filter bank to get orthogonal coefficients. There was a trend around 2015-2018 to use acoustic features that were more raw (i.e., not transformed to the Mel space, or the waveform directly), but in general most recent state of the art systems still use MFCC features.
 
+    WFST
+    FST
+        A :abbr:`Finite State Transducer (FST)` is a graph formalism that can transform a sequence of arbitrary input symbols into arbitrary output symbols. A :abbr:`Weighted Finite State Transducer (WFST)` is an FST that has costs associated with its various paths, so that a single best output string can be selected. Training graphs are WFSTs of the lexicon WFST composed with linear acceptors of the transcription text. For transcription, lexicons are composed with language models as well. MFA's :term:`G2P models` are WFSTs trained using a pair ngram algorithm or the many-to-many Phonetisaurus algorithm.
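+
+        As a minimal sketch in `pynini` (hypothetical pronunciations and costs), weights let ``shortestpath`` select a single best path:
+
+        .. code-block:: python
+
+            import pynini
+
+            # Two weighted paths for "read"; costs act like negative log probabilities
+            fst = pynini.union(
+                pynini.cross("read", "r iy d") + pynini.accep("", weight=0.5),
+                pynini.cross("read", "r eh d") + pynini.accep("", weight=1.5),
+            )
+            print(pynini.shortestpath(fst).project("output").string())
+            # -> "r iy d" (cost 0.5 beats 1.5)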
+
+    lexicon FST
+        A :term:`WFST` constructed from a pronunciation dictionary that can be composed with a :term:`grammar FST` and an HMM-GMM acoustic model to align and transcribe speech.
+
+    grammar FST
+        A :term:`WFST` compiled from a language model that represents how likely a word is given the previous words (ngram model), or a linear acceptor from a known utterance transcription where there is only one path through the words in the transcript for use in alignment.
+
     Pronunciation probabilities
         Pronunciation probabilities in dictionaries allow for certain spoken forms to be more likely, rather than just assigning equal weight to all pronunciation variants.
diff --git a/montreal_forced_aligner/corpus/acoustic_corpus.py b/montreal_forced_aligner/corpus/acoustic_corpus.py
index f7d0394d..09cca4c8 100644
--- a/montreal_forced_aligner/corpus/acoustic_corpus.py
+++ b/montreal_forced_aligner/corpus/acoustic_corpus.py
@@ -324,8 +324,6 @@ def construct_feature_proc_strings(
         """
         strings = []
         for j in self.jobs:
-            if not j.has_data:
-                continue
             lda_mat_path = None
             fmllrs = {}
             if self.working_directory is not None:
diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py
index 7ec32e1e..cd9d9480 100644
--- a/montreal_forced_aligner/corpus/base.py
+++ b/montreal_forced_aligner/corpus/base.py
@@ -839,42 +839,6 @@ def create_subset(self, subset: int) -> None:
 
         session.commit()
 
-        # Extra check to make sure the randomness didn't end up with 1 or 2 utterances
-        # for a particular job/dictionary combo
-        subset_agg = (
-            session.query(
-                Speaker.job_id, Speaker.dictionary_id, sqlalchemy.func.count(Utterance.id)
-            )
-            .join(Utterance.speaker)
-            .filter(Utterance.in_subset == True)  # noqa
-            .group_by(Speaker.job_id, Speaker.dictionary_id)
-        )
-        for j_id, d_id, utterance_count in subset_agg:
-            if utterance_count < 20:
-                larger_subset_query = (
-                    session.query(Utterance.id)
-                    .join(Utterance.speaker)
-                    .filter(Speaker.dictionary_id == d_id)
-                    .filter(Speaker.job_id == j_id)
-                    .filter(Utterance.ignored == False)  # noqa
-                )
-                sq = larger_subset_query.subquery()
-                subset_utts = (
-                    sqlalchemy.select(sq.c.id)
-                    .order_by(sqlalchemy.func.random())
-                    .limit(20)
-                    .scalar_subquery()
-                )
-                query = (
-                    sqlalchemy.update(Utterance)
-                    .execution_options(synchronize_session="fetch")
-                    .values(in_subset=True)
-                    .where(Utterance.id.in_(subset_utts))
-                )
-                session.execute(query)
-
-        subset_count = session.query(Utterance).filter_by(in_subset=True).count()
-        self.log_debug(f"Total subset utterances is {subset_count}")
         self.log_debug(f"Setting subset flags took {time.time()-begin} seconds")
         log_dir = os.path.join(subset_directory, "log")
        os.makedirs(log_dir, exist_ok=True)
@@ 
-925,6 +889,18 @@ def subset_directory(self, subset: typing.Optional[int]) -> str: directory = os.path.join(self.corpus_output_directory, f"subset_{subset}") if not os.path.exists(directory): self.create_subset(subset) + for j in self.jobs: + j.has_data = False + with self.session() as session: + query = ( + session.query(Speaker.job_id, sqlalchemy.func.count(Utterance.id)) + .join(Utterance.speaker) + .filter(Utterance.in_subset == True) # noqa + .group_by(Speaker.job_id) + ) + for job_id, utterance_count in query: + if utterance_count > 0: + self.jobs[job_id].has_data = True return directory def calculate_word_counts(self) -> None: diff --git a/montreal_forced_aligner/corpus/multiprocessing.py b/montreal_forced_aligner/corpus/multiprocessing.py index 70c5d291..c6efadb1 100644 --- a/montreal_forced_aligner/corpus/multiprocessing.py +++ b/montreal_forced_aligner/corpus/multiprocessing.py @@ -380,51 +380,7 @@ def output_to_directory(self, split_directory: str, session, subset=False) -> No break else: return - no_data = [] - - def _write_current() -> None: - """Write the current data to disk""" - if not utt2spk: - if _current_dict_id is not None: - no_data.append(_current_dict_id) - return - dict_pattern = f"{self.name}" - if _current_dict_id is not None: - dict_pattern = f"{_current_dict_id}.{self.name}" - scp_path = os.path.join(split_directory, f"spk2utt.{dict_pattern}.scp") - with open(scp_path, "w", encoding="utf8") as f: - for speaker in sorted(spk2utt.keys()): - utts = " ".join(sorted(spk2utt[speaker])) - f.write(f"{speaker} {utts}\n") - scp_path = os.path.join(split_directory, f"cmvn.{dict_pattern}.scp") - with open(scp_path, "w", encoding="utf8") as f: - for speaker in sorted(cmvns.keys()): - f.write(f"{speaker} {cmvns[speaker]}\n") - - scp_path = os.path.join(split_directory, f"utt2spk.{dict_pattern}.scp") - with open(scp_path, "w", encoding="utf8") as f: - for utt in sorted(utt2spk.keys()): - f.write(f"{utt} {utt2spk[utt]}\n") - scp_path = os.path.join(split_directory, f"feats.{dict_pattern}.scp") - with open(scp_path, "w", encoding="utf8") as f: - for utt in sorted(feats.keys()): - f.write(f"{utt} {feats[utt]}\n") - scp_path = os.path.join(split_directory, f"text.{dict_pattern}.int.scp") - with open(scp_path, "w", encoding="utf8") as f: - for utt in sorted(text_ints.keys()): - f.write(f"{utt} {text_ints[utt]}\n") - scp_path = os.path.join(split_directory, f"text.{dict_pattern}.scp") - with open(scp_path, "w", encoding="utf8") as f: - for utt in sorted(texts.keys()): - f.write(f"{utt} {texts[utt]}\n") - - spk2utt = {} - feats = {} - cmvns = {} - utt2spk = {} - text_ints = {} - texts = {} - _current_dict_id = None + data = {} utterances = ( session.query( Utterance.id, @@ -433,6 +389,7 @@ def _write_current() -> None: Utterance.normalized_text, Utterance.normalized_text_int, Speaker.cmvn, + Speaker.dictionary_id, ) .join(Utterance.speaker) .filter(Speaker.job_id == self.name) @@ -442,64 +399,69 @@ def _write_current() -> None: if subset: utterances = utterances.filter(Utterance.in_subset == True) # noqa if utterances.count() == 0: - self.has_data = False - return - if not self.dictionary_ids: - for u_id, s_id, features, normalized_text, normalized_text_int, cmvn in utterances: - utterance = str(u_id) - speaker = str(s_id) - utterance = f"{speaker}-{utterance}" - if speaker not in spk2utt: - spk2utt[speaker] = [] - spk2utt[speaker].append(utterance) - utt2spk[utterance] = speaker - feats[utterance] = features - cmvns[speaker] = cmvn - text_ints[utterance] = normalized_text_int - 
texts[utterance] = normalized_text - _write_current() return - for _current_dict_id in self.dictionary_ids: - spk2utt = {} - feats = {} - cmvns = {} - utt2spk = {} - text_ints = {} - utterances = ( - session.query( - Utterance.kaldi_id, - Utterance.speaker_id, - Utterance.features, - Utterance.normalized_text, - Utterance.normalized_text_int, - Speaker.cmvn, - ) - .join(Utterance.speaker) - .filter(Speaker.job_id == self.name) - .filter(Speaker.dictionary_id == _current_dict_id) - .filter(Utterance.ignored == False) # noqa - .order_by(Utterance.kaldi_id) - ) - if subset: - utterances = utterances.filter(Utterance.in_subset == True) # noqa - for ( - utterance, - s_id, - features, - normalized_text, - normalized_text_int, - cmvn, - ) in utterances: - speaker = str(s_id) - if speaker not in spk2utt: - spk2utt[speaker] = [] - spk2utt[speaker].append(utterance) - utt2spk[utterance] = speaker - feats[utterance] = features - cmvns[speaker] = cmvn - text_ints[utterance] = normalized_text_int - texts[utterance] = normalized_text - _write_current() - for d in no_data: - ind = self.dictionary_ids.index(d) - self.dictionary_ids.pop(ind) + for ( + u_id, + s_id, + features, + normalized_text, + normalized_text_int, + cmvn, + dictionary_id, + ) in utterances: + if dictionary_id not in data: + data[dictionary_id] = { + "spk2utt": {}, + "feats": {}, + "cmvns": {}, + "utt2spk": {}, + "text_ints": {}, + "texts": {}, + } + utterance = str(u_id) + speaker = str(s_id) + utterance = f"{speaker}-{utterance}" + if speaker not in data[dictionary_id]["spk2utt"]: + data[dictionary_id]["spk2utt"][speaker] = [] + data[dictionary_id]["spk2utt"][speaker].append(utterance) + data[dictionary_id]["utt2spk"][utterance] = speaker + data[dictionary_id]["feats"][utterance] = features + data[dictionary_id]["cmvns"][speaker] = cmvn + data[dictionary_id]["text_ints"][utterance] = normalized_text_int + data[dictionary_id]["texts"][utterance] = normalized_text + + for dict_id, d in data.items(): + dict_pattern = f"{self.name}" + if dict_id is not None: + dict_pattern = f"{dict_id}.{self.name}" + + scp_path = os.path.join(split_directory, f"spk2utt.{dict_pattern}.scp") + with open(scp_path, "w", encoding="utf8") as f: + for speaker in sorted(d["spk2utt"].keys()): + utts = " ".join(sorted(d["spk2utt"][speaker])) + f.write(f"{speaker} {utts}\n") + + scp_path = os.path.join(split_directory, f"cmvn.{dict_pattern}.scp") + with open(scp_path, "w", encoding="utf8") as f: + for speaker in sorted(d["cmvns"].keys()): + f.write(f"{speaker} {d['cmvns'][speaker]}\n") + + scp_path = os.path.join(split_directory, f"utt2spk.{dict_pattern}.scp") + with open(scp_path, "w", encoding="utf8") as f: + for utt in sorted(d["utt2spk"].keys()): + f.write(f"{utt} {d['utt2spk'][utt]}\n") + + scp_path = os.path.join(split_directory, f"feats.{dict_pattern}.scp") + with open(scp_path, "w", encoding="utf8") as f: + for utt in sorted(d["feats"].keys()): + f.write(f"{utt} {d['feats'][utt]}\n") + + scp_path = os.path.join(split_directory, f"text.{dict_pattern}.int.scp") + with open(scp_path, "w", encoding="utf8") as f: + for utt in sorted(d["text_ints"].keys()): + f.write(f"{utt} {d['text_ints'][utt]}\n") + + scp_path = os.path.join(split_directory, f"text.{dict_pattern}.scp") + with open(scp_path, "w", encoding="utf8") as f: + for utt in sorted(d["texts"].keys()): + f.write(f"{utt} {d['texts'][utt]}\n") diff --git a/montreal_forced_aligner/db.py b/montreal_forced_aligner/db.py index f8ac91cf..13051587 100644 --- a/montreal_forced_aligner/db.py +++ 
b/montreal_forced_aligner/db.py
@@ -356,6 +356,12 @@ class Word(MfaSqlBase):
 
     dictionary: Dictionary = relationship("Dictionary", back_populates="words")
     pronunciations = relationship("Pronunciation", back_populates="word")
 
+    job = relationship(
+        "Word2Job",
+        back_populates="word",
+        uselist=False,
+    )
+
     __table_args__ = (
         sqlalchemy.Index("dictionary_word_type_index", "dictionary_id", "word_type"),
         sqlalchemy.Index("word_dictionary_index", "word", "dictionary_id"),
@@ -1212,3 +1218,109 @@ def as_ctm(self) -> CtmInterval:
             CTM interval object
         """
         return CtmInterval(self.begin, self.end, self.label, self.utterance_id)
+
+
+class Job(MfaSqlBase):
+
+    __tablename__ = "job"
+
+    id = Column(Integer, primary_key=True)
+
+    symbols = relationship(
+        "M2M2Job",
+        back_populates="job",
+    )
+
+    words = relationship(
+        "Word2Job",
+        back_populates="job",
+    )
+
+
+class M2MSymbol(MfaSqlBase):
+    """
+    Database class for storing many-to-many G2P training information
+
+    Parameters
+    ----------
+    id: int
+        Primary key
+    symbol: str
+        Symbol
+    total_order: int
+        Summed order of graphemes and phones
+    max_order: int
+        Maximum order between graphemes and phones
+    grapheme_order: int
+        Grapheme order
+    phone_order: int
+        Phone order
+    weight: float
+        Weight of arcs
+    """
+
+    __tablename__ = "m2m_symbol"
+
+    id = Column(Integer, primary_key=True)
+    symbol = Column(String, nullable=False, index=True, unique=True)
+    total_order = Column(Integer, nullable=False)
+    max_order = Column(Integer, nullable=False)
+    grapheme_order = Column(Integer, nullable=False)
+    phone_order = Column(Integer, nullable=False)
+    weight = Column(Float, nullable=False)
+
+    jobs = relationship(
+        "M2M2Job",
+        back_populates="m2m_symbol",
+    )
+
+
+class M2M2Job(MfaSqlBase):
+    """
+    Mapping class between :class:`~montreal_forced_aligner.db.M2MSymbol`
+    and :class:`~montreal_forced_aligner.db.Job`
+
+    Parameters
+    ----------
+    m2m_id: int
+        Foreign key to :class:`~montreal_forced_aligner.db.M2MSymbol`
+    job_id: int
+        Foreign key to :class:`~montreal_forced_aligner.db.Job`
+    m2m_symbol: :class:`~montreal_forced_aligner.db.M2MSymbol`
+        M2MSymbol object
+    job: :class:`~montreal_forced_aligner.db.Job`
+        Job object
+    """
+
+    __tablename__ = "m2m_job"
+    m2m_id = Column(ForeignKey("m2m_symbol.id"), primary_key=True)
+    job_id = Column(ForeignKey("job.id"), primary_key=True)
+    m2m_symbol: M2MSymbol = relationship("M2MSymbol", back_populates="jobs")
+    job: Job = relationship("Job", back_populates="symbols")
+
+
+class Word2Job(MfaSqlBase):
+    """
+    Mapping class between :class:`~montreal_forced_aligner.db.Word`
+    and :class:`~montreal_forced_aligner.db.Job`
+
+    Parameters
+    ----------
+    word_id: int
+        Foreign key to :class:`~montreal_forced_aligner.db.Word`
+    job_id: int
+        Foreign key to :class:`~montreal_forced_aligner.db.Job`
+    word: :class:`~montreal_forced_aligner.db.Word`
+        Word object
+    job: :class:`~montreal_forced_aligner.db.Job`
+        Job object
+    """
+
+    __tablename__ = "word_job"
+
+    word_id = Column(ForeignKey("word.id"), primary_key=True)
+    job_id = Column(ForeignKey("job.id"), primary_key=True)
+    training = Column(Boolean, index=True)
+    word: Word = relationship("Word", back_populates="job")
+    job: Job = relationship("Job", back_populates="words")
diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py
index 32031487..59491820 100644
--- a/montreal_forced_aligner/dictionary/multispeaker.py
+++ b/montreal_forced_aligner/dictionary/multispeaker.py
@@ -433,8 +433,12 
@@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]:
                 line = line.strip()
                 if not line:
                     continue
-                line = line.split()
-                word = line.pop(0)
+                if "\t" in line:
+                    word, line = line.split("\t", maxsplit=1)
+                    line = line.split()
+                else:
+                    line = line.split()
+                    word = line.pop(0)
                 if len(line) == 0:
                     raise DictionaryError(
                         f'Error parsing line {i} of {dictionary_model.path}: "{line}" did not have a pronunciation'
diff --git a/montreal_forced_aligner/exceptions.py b/montreal_forced_aligner/exceptions.py
index 9366182e..e6d23979 100644
--- a/montreal_forced_aligner/exceptions.py
+++ b/montreal_forced_aligner/exceptions.py
@@ -751,6 +751,19 @@ def __init__(self, error_dict: Dict[str, Exception]):
         )
 
 
+class PhonetisaurusSymbolError(G2PError):
+    """
+    Exception class for when a reserved symbol is found in the graphemes or phonemes of a dictionary
+    """
+
+    def __init__(self, symbol, variable):
+        super().__init__("")
+        self.message_lines = [
+            f'The symbol "{symbol}" is reserved for "{variable}", but is found in the graphemes or phonemes of your dictionary.',
+            f'Please re-run and specify another symbol that is not used in your dictionary with the "--{variable}" flag.',
+        ]
+
+
 class LMError(MFAError):
     """
     Exception class for errors in language models
diff --git a/montreal_forced_aligner/g2p/generator.py b/montreal_forced_aligner/g2p/generator.py
index 762786a0..658b82f8 100644
--- a/montreal_forced_aligner/g2p/generator.py
+++ b/montreal_forced_aligner/g2p/generator.py
@@ -216,9 +216,9 @@ def __call__(self, graphemes: str) -> List[Tuple[str, ...]]:  # pragma: no cover
             for j in range(1, self.grapheme_order + 1):
                 if i + j <= len(graphemes):
                     substring = self.seq_sep.join(graphemes[i : i + j])
-                    state = self.input_token_type.find(substring)
-                    if state != pynini.NO_SYMBOL:
-                        fst.add_arc(start_state, pynini.Arc(state, state, one, i + j))
+                    ilabel = self.input_token_type.find(substring)
+                    if ilabel != pynini.NO_LABEL:
+                        fst.add_arc(start_state, pynini.Arc(ilabel, ilabel, one, i + j))
                 if i + j >= max_state:
                     max_state = i + j
         for _ in range(fst.num_states(), max_state + 1):
diff --git a/montreal_forced_aligner/g2p/phonetisaurus_trainer.py b/montreal_forced_aligner/g2p/phonetisaurus_trainer.py
index bb15de59..1397fe96 100644
--- a/montreal_forced_aligner/g2p/phonetisaurus_trainer.py
+++ b/montreal_forced_aligner/g2p/phonetisaurus_trainer.py
@@ -3,19 +3,20 @@
 import multiprocessing as mp
 import os
 import queue
-import random
 import subprocess
 import time
-from typing import Dict
 
 import dataclassy
 import numpy
+import sqlalchemy
 import tqdm
+from sqlalchemy.orm import scoped_session, sessionmaker
 
 from montreal_forced_aligner.abc import MetaDict, TopLevelMfaWorker
 from montreal_forced_aligner.data import WordType
-from montreal_forced_aligner.db import Pronunciation, Word
+from montreal_forced_aligner.db import Job, M2M2Job, M2MSymbol, Pronunciation, Word, Word2Job
 from montreal_forced_aligner.dictionary.multispeaker import MultispeakerDictionaryMixin
+from montreal_forced_aligner.exceptions import PhonetisaurusSymbolError
 from montreal_forced_aligner.g2p.generator import PyniniValidator
 from montreal_forced_aligner.g2p.trainer import G2PTrainer
 from montreal_forced_aligner.models import G2PModel
@@ -40,40 +41,61 @@
 
 
 @dataclassy.dataclass(slots=True)
-class LabelData:
-    """Label data class for penalizing alignments based on the size of their left/right side"""
+class MaximizationArguments:
+    """Arguments for the MaximizationWorker"""
 
-    max: int
-    tot: int
-    lhs: int
-    rhs: int
-    lhsE: bool
-    rhsE: bool
+    db_path: str
+    
far_path: str
+    penalize_em: bool
+    batch_size: int
 
 
 @dataclassy.dataclass(slots=True)
-class MaximizationArguments:
-    """Arguments for the maximization worker"""
+class ExpectationArguments:
+    """Arguments for the ExpectationWorker"""
 
+    db_path: str
     far_path: str
-    alignment_model: Dict[int, pynini.Weight]
-    penalize_em: bool
-    penalties: Dict[int, LabelData]
+    batch_size: int
+
+
+@dataclassy.dataclass(slots=True)
+class AlignmentExportArguments:
+    """Arguments for the AlignmentExporter"""
+
+    db_path: str
+    log_path: str
+    far_path: str
+    penalize: bool
+
+
+@dataclassy.dataclass(slots=True)
+class NgramCountArguments:
+    """Arguments for the NgramCountWorker"""
+
+    log_path: str
+    far_path: str
+    alignment_symbols_path: str
+    order: int
 
 
 @dataclassy.dataclass(slots=True)
 class AlignmentInitArguments:
     """Arguments for the alignment initialization worker"""
 
+    db_path: str
+    log_path: str
     far_path: str
     deletions: bool
     insertions: bool
     restrict: bool
     phone_order: int
     grapheme_order: int
+    eps: str
     s1s2_sep: str
     seq_sep: str
     skip: str
+    batch_size: int
 
 
 class AlignmentInitWorker(mp.Process):
@@ -82,45 +104,30 @@ class AlignmentInitWorker(mp.Process):
 
     Parameters
     ----------
-    job_q: mp.Queue
-        Queue of grapheme-phoneme transcriptions to process
-    return_queue: mp.Queue
+    job_name: int
+        Integer ID for the job
+    return_queue: :class:`multiprocessing.Queue`
         Queue to return data
-    stopped: Stopped
+    stopped: :class:`~montreal_forced_aligner.utils.Stopped`
         Stop check
-    finished_adding: Stopped
+    finished_adding: :class:`~montreal_forced_aligner.utils.Stopped`
         Check for whether the job queue is done
-    symbol_dict: dict
-        Symbol to integer ID mapping dictionary
-    next_symbol: mp.Value
-        Integer value to use for the next symbol
-    lock: mp.Lock
-        Lock to use for data shared across processes
     args: :class:`~montreal_forced_aligner.g2p.phonetisaurus_trainer.AlignmentInitArguments`
         Arguments for initialization
     """
 
     def __init__(
         self,
-        job_q: mp.Queue,
+        job_name: int,
         return_queue: mp.Queue,
         stopped: Stopped,
         finished_adding: Stopped,
-        symbol_dict: dict,
-        reverse_symbol_dict: dict,
-        next_symbol: mp.Value,
-        lock: mp.Lock,
         args: AlignmentInitArguments,
     ):
         mp.Process.__init__(self)
-        self.job_q = job_q
+        self.job_name = job_name
         self.return_queue = return_queue
         self.stopped = stopped
-        self.symbol_cache = {}
-        self.symbol_dict = symbol_dict
-        self.reverse_symbol_dict = reverse_symbol_dict
-        self.next_symbol: mp.Value = next_symbol
-        self.lock = lock
         self.finished = Stopped()
         self.finished_adding = finished_adding
         self.deletions = args.deletions
@@ -128,222 +135,233 @@ def __init__(
         self.restrict = args.restrict
         self.phone_order = args.phone_order
         self.grapheme_order = args.grapheme_order
+        self.eps = args.eps
         self.s1s2_sep = args.s1s2_sep
         self.seq_sep = args.seq_sep
         self.skip = args.skip
         self.far_path = args.far_path
-
-    def look_up_symbol(self, symbol: str) -> int:
-        """
-        Look up a symbol based on the process's symbol cache and symbol table
-
-        Parameters
-        ----------
-        symbol: str
-            Symbol to look up
-
-        Returns
-        -------
-        int
-            Symbol ID in table
-        """
-        if symbol not in self.symbol_cache:
-            with self.lock:
-                if symbol not in self.symbol_dict:
-                    id = self.next_symbol.value
-                    self.symbol_dict[symbol] = id
-                    self.reverse_symbol_dict[id] = symbol
-                    self.next_symbol.value += 1
-            self.symbol_cache[symbol] = self.symbol_dict[symbol]
-        return self.symbol_cache[symbol]
+        self.sym_path = self.far_path.replace(".far", ".syms")
+        self.log_path = args.log_path
+        self.db_path = args.db_path
+        self.batch_size = 
args.batch_size def run(self) -> None: """Run the function""" - current_index = 0 - far_writer = pywrapfst.FarWriter.create(self.far_path, arc_type="log") - while True: - try: - graphemes, phones = self.job_q.get(timeout=1) - except queue.Empty: - if self.finished_adding.stop_check(): - break - continue - if self.stopped.stop_check(): - continue - try: - key = f"{current_index:08x}" - fst = pynini.Fst(arc_type="log") - initial_weight = pywrapfst.Weight(fst.weight_type(), 99) - final_state = ((len(graphemes) + 1) * (len(phones) + 1)) - 1 - for _ in range(final_state + 1): - fst.add_state() - for i in range(len(graphemes) + 1): - for j in range(len(phones) + 1): - istate = i * (len(phones) + 1) + j - if self.deletions: - for phone_range in range(1, self.phone_order + 1): - if j + phone_range <= len(phones): - subseq_phones = phones[j : j + phone_range] - symbol = self.look_up_symbol( - self.s1s2_sep.join( - [self.skip, self.seq_sep.join(subseq_phones)] + symbol_table = pynini.SymbolTable() + symbol_table.add_symbol(self.eps) + engine = sqlalchemy.create_engine( + f"sqlite:///file:{self.db_path}?mode=ro&nolock=1&uri=true" + ) + Session = scoped_session(sessionmaker(bind=engine, autoflush=False, autocommit=False)) + valid_phone_ngrams = set() + base_dir = os.path.dirname(self.far_path) + with open(os.path.join(base_dir, "phone_ngram.ngrams"), "r", encoding="utf8") as f: + for line in f: + line = line.strip() + valid_phone_ngrams.add(line) + valid_grapheme_ngrams = set() + with open(os.path.join(base_dir, "grapheme_ngram.ngrams"), "r", encoding="utf8") as f: + for line in f: + line = line.strip() + valid_grapheme_ngrams.add(line) + count = 0 + data = {} + with open(self.log_path, "w", encoding="utf8") as log_file, Session() as session: + far_writer = pywrapfst.FarWriter.create(self.far_path, arc_type="log") + query = ( + session.query(Pronunciation.pronunciation, Word.word) + .join(Pronunciation.word) + .join(Word.job) + .filter(Word2Job.training == True) # noqa + .filter(Word2Job.job_id == self.job_name) + ) + for current_index, (phones, graphemes) in enumerate(query): + graphemes = list(graphemes) + phones = phones.split() + if self.stopped.stop_check(): + continue + try: + key = f"{current_index:08x}" + fst = pynini.Fst(arc_type="log") + final_state = ((len(graphemes) + 1) * (len(phones) + 1)) - 1 + + for _ in range(final_state + 1): + fst.add_state() + for i in range(len(graphemes) + 1): + for j in range(len(phones) + 1): + istate = i * (len(phones) + 1) + j + if self.deletions: + for phone_range in range(1, self.phone_order + 1): + if j + phone_range <= len(phones): + subseq_phones = phones[j : j + phone_range] + phone_string = self.seq_sep.join(subseq_phones) + if ( + phone_range > 1 + and phone_string not in valid_phone_ngrams + ): + continue + symbol = self.s1s2_sep.join([self.skip, phone_string]) + ilabel = symbol_table.find(symbol) + if ilabel == pynini.NO_LABEL: + ilabel = symbol_table.add_symbol(symbol) + ostate = i * (len(phones) + 1) + (j + phone_range) + fst.add_arc( + istate, + pywrapfst.Arc( + ilabel, ilabel, pynini.Weight("log", 99.0), ostate + ), ) - ) - ostate = i * (len(phones) + 1) + (j + phone_range) - fst.add_arc( - istate, - pywrapfst.Arc(symbol, symbol, initial_weight, ostate), - ) - if self.insertions: - for k in range(1, self.grapheme_order + 1): - if i + k <= len(graphemes): - subseq_graphemes = graphemes[i : i + k] - symbol = self.look_up_symbol( - self.s1s2_sep.join( - [self.seq_sep.join(subseq_graphemes), self.skip] + if self.insertions: + for grapheme_range in 
range(1, self.grapheme_order + 1): + if i + grapheme_range <= len(graphemes): + subseq_graphemes = graphemes[i : i + grapheme_range] + grapheme_string = self.seq_sep.join(subseq_graphemes) + if ( + grapheme_range > 1 + and grapheme_string not in valid_grapheme_ngrams + ): + continue + symbol = self.s1s2_sep.join([grapheme_string, self.skip]) + ilabel = symbol_table.find(symbol) + if ilabel == pynini.NO_LABEL: + ilabel = symbol_table.add_symbol(symbol) + ostate = (i + grapheme_range) * (len(phones) + 1) + j + fst.add_arc( + istate, + pywrapfst.Arc( + ilabel, ilabel, pynini.Weight("log", 99.0), ostate + ), ) - ) - ostate = (i + k) * (len(phones) + 1) + j - fst.add_arc( - istate, - pywrapfst.Arc(symbol, symbol, initial_weight, ostate), - ) - - for grapheme_range in range(1, self.grapheme_order + 1): - for phone_range in range(1, self.phone_order + 1): - if i + grapheme_range <= len(graphemes) and j + phone_range <= len( - phones - ): - if self.restrict and grapheme_range > 1 and phone_range > 1: - continue - subseq_phones = phones[j : j + phone_range] - phone_string = self.seq_sep.join(subseq_phones) - subseq_graphemes = graphemes[i : i + grapheme_range] - grapheme_string = self.seq_sep.join(subseq_graphemes) - symbol = self.look_up_symbol( - self.s1s2_sep.join([grapheme_string, phone_string]) - ) - ostate = (i + grapheme_range) * (len(phones) + 1) + ( - j + phone_range - ) - weight = pywrapfst.Weight( - fst.weight_type(), float(grapheme_range * phone_range) - ) - fst.add_arc( - istate, pywrapfst.Arc(symbol, symbol, weight, ostate) - ) - fst.set_start(0) - fst.set_final(final_state, pywrapfst.Weight.one(fst.weight_type())) - if not self.insertions or not self.deletions: + for grapheme_range in range(1, self.grapheme_order + 1): + for phone_range in range(1, self.phone_order + 1): + if i + grapheme_range <= len( + graphemes + ) and j + phone_range <= len(phones): + if ( + self.restrict + and grapheme_range > 1 + and phone_range > 1 + ): + continue + subseq_phones = phones[j : j + phone_range] + phone_string = self.seq_sep.join(subseq_phones) + if ( + phone_range > 1 + and phone_string not in valid_phone_ngrams + ): + continue + subseq_graphemes = graphemes[i : i + grapheme_range] + grapheme_string = self.seq_sep.join(subseq_graphemes) + if ( + grapheme_range > 1 + and grapheme_string not in valid_grapheme_ngrams + ): + continue + symbol = self.s1s2_sep.join( + [grapheme_string, phone_string] + ) + ilabel = symbol_table.find(symbol) + if ilabel == pynini.NO_LABEL: + ilabel = symbol_table.add_symbol(symbol) + ostate = (i + grapheme_range) * (len(phones) + 1) + ( + j + phone_range + ) + fst.add_arc( + istate, + pywrapfst.Arc( + ilabel, + ilabel, + pynini.Weight( + "log", float(grapheme_range * phone_range) + ), + ostate, + ), + ) + fst.set_start(0) + fst.set_final(final_state, pywrapfst.Weight.one(fst.weight_type())) fst = pynini.connect(fst) - far_writer[key] = fst - self.return_queue.put(fst) - current_index += 1 - except Exception as e: # noqa - self.stopped.stop() - self.return_queue.put(e) - raise + for state in fst.states(): + for arc in fst.arcs(state): + sym = symbol_table.find(arc.ilabel) + if sym not in data: + data[sym] = arc.weight + else: + data[sym] = pynini.plus(data[sym], arc.weight) + if count >= self.batch_size: + data = {k: float(v) for k, v in data.items()} + self.return_queue.put((self.job_name, data, count)) + data = {} + count = 0 + log_file.flush() + far_writer[key] = fst + del fst + count += 1 + except Exception as e: # noqa + print(e) + self.stopped.stop() + 
self.return_queue.put(e) + if data: + data = {k: float(v) for k, v in data.items()} + self.return_queue.put((self.job_name, data, count)) self.finished.stop() + del far_writer + symbol_table.write_text(self.far_path.replace(".far", ".syms")) return -class MaximizationWorker(mp.Process): - """ - Multiprocessing worker that runs the maximization step of training for a subset of the data - - Parameters - ---------- - return_queue: mp.Queue - Queue to return data - stopped: Stopped - Stop check - args: :class:`~montreal_forced_aligner.g2p.phonetisaurus_trainer.MaximizationArguments` - Arguments for maximization - """ - - def __init__(self, return_queue: mp.Queue, stopped: Stopped, args: MaximizationArguments): - mp.Process.__init__(self) - self.return_queue = return_queue - self.stopped = stopped - self.finished = Stopped() - self.penalize_em = args.penalize_em - self.alignment_model = args.alignment_model - self.penalties = args.penalties - self.far_path = args.far_path - - def run(self) -> None: - """Run the function""" - zero = pynini.Weight.zero("log") - far_reader = pywrapfst.FarReader.open(self.far_path) - far_writer = pywrapfst.FarWriter.create(self.far_path + ".temp", arc_type="log") - while not far_reader.done(): - if self.stopped.stop_check(): - break - key = far_reader.get_key() - fst = far_reader.get_fst() - for state_id in fst.states(): - maiter = fst.mutable_arcs(state_id) - while not maiter.done(): - arc = maiter.value() - if not self.penalize_em: - arc.weight = self.alignment_model[arc.ilabel] - else: - label_data = self.penalties[arc.ilabel] - if label_data.lhs > 1 and label_data.rhs > 1: - arc.weight = pynini.Weight(fst.weight_type(), 99) - elif not label_data.lhsE and not label_data.rhsE: - arc.weight = pynini.Weight( - fst.weight_type(), float(arc.weight) * label_data.tot - ) - if arc.weight == zero: - arc.weight = pynini.Weight(fst.weight_type(), 99) - arc = pywrapfst.Arc(arc.ilabel, arc.olabel, arc.weight, arc.nextstate) - maiter.set_value(arc) - next(maiter) - far_writer[key] = fst - next(far_reader) - self.return_queue.put(1) - os.remove(self.far_path) - os.rename(self.far_path + ".temp", self.far_path) - self.finished.stop() - - class ExpectationWorker(mp.Process): """ Multiprocessing worker that runs the expectation step of training for a subset of the data Parameters ---------- - far_path: str - Path to FST archive file - return_queue: mp.Queue + job_name: int + Integer ID for the job + return_queue: :class:`multiprocessing.Queue` Queue to return data - stopped: Stopped + stopped: :class:`~montreal_forced_aligner.utils.Stopped` Stop check + args: :class:`~montreal_forced_aligner.g2p.phonetisaurus_trainer.ExpectationArguments` + Arguments for the function """ def __init__( - self, - far_path: str, - return_queue: mp.Queue, - stopped: Stopped, + self, job_name: int, return_queue: mp.Queue, stopped: Stopped, args: ExpectationArguments ): mp.Process.__init__(self) - self.far_path = far_path + self.job_name = job_name + self.db_path = args.db_path + self.far_path = args.far_path + self.batch_size = args.batch_size self.return_queue = return_queue self.stopped = stopped self.finished = Stopped() def run(self) -> None: """Run the function""" + engine = sqlalchemy.create_engine( + f"sqlite:///file:{self.db_path}?mode=ro&nolock=1&uri=true" + ) + Session = scoped_session(sessionmaker(bind=engine, autoflush=False, autocommit=False)) far_reader = pywrapfst.FarReader.open(self.far_path) + symbol_table = pynini.SymbolTable.read_text(self.far_path.replace(".far", ".syms")) + 
symbol_mapper = {} + data = {} + count = 0 + with Session() as session: + query = ( + session.query(M2MSymbol.symbol, M2MSymbol.id) + .join(M2MSymbol.jobs) + .filter(M2M2Job.job_id == self.job_name) + ) + for symbol, sym_id in query: + symbol_mapper[symbol_table.find(symbol)] = sym_id while not far_reader.done(): if self.stopped.stop_check(): break - key = far_reader.get_key() fst = far_reader.get_fst() - data = {} + zero = pynini.Weight.zero("log") try: fst = pynini.Fst.read_from_string(fst.write_to_string()) alpha = pynini.shortestdistance(fst) @@ -357,19 +375,264 @@ def run(self) -> None: beta[0], ) if float(gamma) != numpy.inf: - if arc.ilabel not in data: - data[arc.ilabel] = 0 - data[arc.ilabel] += float(gamma) - self.return_queue.put((key, data)) + sym_id = symbol_mapper[arc.ilabel] + if sym_id not in data: + data[sym_id] = zero + data[sym_id] = pynini.plus(data[sym_id], gamma) + if count >= self.batch_size: + data = {k: float(v) for k, v in data.items()} + self.return_queue.put((data, count)) + data = {} + count = 0 next(far_reader) + del alpha + del beta + del fst + count += 1 except Exception as e: # noqa self.stopped.stop() self.return_queue.put(e) raise + if data: + data = {k: float(v) for k, v in data.items()} + self.return_queue.put((data, count)) self.finished.stop() + del far_reader return +class MaximizationWorker(mp.Process): + """ + Multiprocessing worker that runs the maximization step of training for a subset of the data + + Parameters + ---------- + job_name: int + Integer ID for the job + return_queue: :class:`multiprocessing.Queue` + Queue to return data + stopped: :class:`~montreal_forced_aligner.utils.Stopped` + Stop check + args: :class:`~montreal_forced_aligner.g2p.phonetisaurus_trainer.MaximizationArguments` + Arguments for maximization + """ + + def __init__( + self, job_name: int, return_queue: mp.Queue, stopped: Stopped, args: MaximizationArguments + ): + mp.Process.__init__(self) + self.job_name = job_name + self.return_queue = return_queue + self.stopped = stopped + self.finished = Stopped() + self.db_path = args.db_path + self.penalize_em = args.penalize_em + self.far_path = args.far_path + self.batch_size = args.batch_size + + def run(self) -> None: + """Run the function""" + symbol_table = pynini.SymbolTable.read_text(self.far_path.replace(".far", ".syms")) + count = 0 + try: + engine = sqlalchemy.create_engine( + f"sqlite:///file:{self.db_path}?mode=ro&nolock=1&uri=true" + ) + Session = scoped_session(sessionmaker(bind=engine, autoflush=False, autocommit=False)) + alignment_model = {} + with Session() as session: + query = ( + session.query(M2MSymbol) + .join(M2MSymbol.jobs) + .filter(M2M2Job.job_id == self.job_name) + ) + for m2m in query: + weight = pynini.Weight("log", m2m.weight) + if self.penalize_em: + if m2m.grapheme_order > 1 or m2m.phone_order > 1: + weight = pynini.Weight("log", float(weight) * m2m.total_order) + if weight == pynini.Weight.zero("log") or float(weight) == numpy.inf: + weight = pynini.Weight("log", 99) + alignment_model[symbol_table.find(m2m.symbol)] = weight + far_reader = pywrapfst.FarReader.open(self.far_path) + far_writer = pywrapfst.FarWriter.create(self.far_path + ".temp", arc_type="log") + while not far_reader.done(): + if self.stopped.stop_check(): + break + key = far_reader.get_key() + fst = far_reader.get_fst() + for state_id in fst.states(): + maiter = fst.mutable_arcs(state_id) + while not maiter.done(): + arc = maiter.value() + arc.weight = alignment_model[arc.ilabel] + arc = pywrapfst.Arc(arc.ilabel, arc.olabel, 
arc.weight, arc.nextstate) + maiter.set_value(arc) + next(maiter) + del maiter + far_writer[key] = fst + next(far_reader) + if count >= self.batch_size: + self.return_queue.put(count) + count = 0 + del fst + count += 1 + del far_reader + del far_writer + os.remove(self.far_path) + os.rename(self.far_path + ".temp", self.far_path) + except Exception as e: + self.stopped.stop() + self.return_queue.put(e) + raise + finally: + if count >= 1: + self.return_queue.put(count) + self.finished.stop() + + +class AlignmentExporter(mp.Process): + """ + Multiprocessing worker to generate Ngram counts for aligned FST archives + + Parameters + ---------- + return_queue: :class:`multiprocessing.Queue` + Queue to return data + stopped: :class:`~montreal_forced_aligner.utils.Stopped` + Stop check + args: :class:`~montreal_forced_aligner.g2p.phonetisaurus_trainer.AlignmentExportArguments` + Arguments for maximization + """ + + def __init__(self, return_queue: mp.Queue, stopped: Stopped, args: AlignmentExportArguments): + mp.Process.__init__(self) + self.return_queue = return_queue + self.stopped = stopped + self.finished = Stopped() + self.penalize = args.penalize + self.far_path = args.far_path + self.log_path = args.log_path + self.db_path = args.db_path + + def run(self) -> None: + """Run the function""" + symbol_table = pynini.SymbolTable.read_text(self.far_path.replace(".far", ".syms")) + with open(self.log_path, "w", encoding="utf8") as log_file: + far_reader = pywrapfst.FarReader.open(self.far_path) + one_best_path = self.far_path + ".strings" + no_alignment_count = 0 + total = 0 + with open(one_best_path, "w", encoding="utf8") as f: + while not far_reader.done(): + fst = far_reader.get_fst() + total += 1 + if fst.num_states() == 0: + next(far_reader) + no_alignment_count += 1 + self.return_queue.put(1) + continue + tfst = pynini.arcmap( + pynini.Fst.read_from_string(fst.write_to_string()), map_type="to_std" + ) + if self.penalize: + for state in tfst.states(): + maiter = tfst.mutable_arcs(state) + while not maiter.done(): + arc = maiter.value() + sym = symbol_table.find(arc.ilabel) + ld = self.penalties[sym] + if ld.lhs > 1 and ld.rhs > 1: + arc.weight = pynini.Weight(tfst.weight_type(), 999) + else: + arc.weight = pynini.Weight( + tfst.weight_type(), float(arc.weight) * ld.max + ) + maiter.set_value(arc) + next(maiter) + del maiter + pfst = rewrite.lattice_to_dfa(tfst, True, 8).project("output").rmepsilon() + + if pfst.start() != pynini.NO_SYMBOL: + path = pynini.shortestpath(pfst) + else: + pfst = rewrite.lattice_to_dfa(tfst, False, 8).project("output").rmepsilon() + path = pynini.shortestpath(pfst) + string = path.string(symbol_table) + f.write(f"{string}\n") + log_file.flush() + next(far_reader) + self.return_queue.put(1) + del fst + del pfst + del path + del tfst + log_file.write( + f"Done {total - no_alignment_count}, no alignment for {no_alignment_count}" + ) + log_file.flush() + self.finished.stop() + del far_reader + + +class NgramCountWorker(mp.Process): + """ + Multiprocessing worker to generate Ngram counts for aligned FST archives + + Parameters + ---------- + return_queue: :class:`multiprocessing.Queue` + Queue to return data + stopped: :class:`~montreal_forced_aligner.utils.Stopped` + Stop check + args: :class:`~montreal_forced_aligner.g2p.phonetisaurus_trainer.NgramCountArguments` + Arguments for maximization + """ + + def __init__(self, return_queue: mp.Queue, stopped: Stopped, args: NgramCountArguments): + mp.Process.__init__(self) + self.return_queue = return_queue + self.stopped = 
stopped
+        self.finished = Stopped()
+        self.order = args.order
+        self.far_path = args.far_path
+        self.log_path = args.log_path
+        self.alignment_symbols_path = args.alignment_symbols_path
+
+    def run(self) -> None:
+        """Run the function"""
+        with open(self.log_path, "w", encoding="utf8") as log_file:
+            one_best_path = self.far_path + ".strings"
+            ngram_count_path = self.far_path.replace(".far", ".cnts")
+            farcompile_proc = subprocess.Popen(
+                [
+                    thirdparty_binary("farcompilestrings"),
+                    "--token_type=symbol",
+                    f"--symbols={self.alignment_symbols_path}",
+                    one_best_path,
+                ],
+                stderr=log_file,
+                stdout=subprocess.PIPE,
+                env=os.environ,
+            )
+            ngramcount_proc = subprocess.Popen(
+                [
+                    thirdparty_binary("ngramcount"),
+                    "--require_symbols=false",
+                    "--round_to_int",
+                    f"--order={self.order}",
+                    "-",
+                    ngram_count_path,
+                ],
+                stderr=log_file,
+                stdin=farcompile_proc.stdout,
+                # stdout=subprocess.PIPE,
+                env=os.environ,
+            )
+            ngramcount_proc.communicate()
+        self.finished.stop()
+
+
 class PhonetisaurusTrainerMixin:
     """
     Mixin class for training Phonetisaurus style models
@@ -377,18 +640,9 @@ class PhonetisaurusTrainerMixin:
 
     Parameters
     ----------
     order: int
-        Order of the ngram model, defaults to 7
-    random_starts: int
-        Number of random starts to use in initialization, defaults to 25
-    seed: int
-        Seed for randomization, defaults to 1917
-    delta: float
-        Comparison/quantization delta for Baum-Welch training, defaults to 1/1024
-    alpha: float
-        Step size reduction power parameter for Baum-Welch training;
-        full standard batch EM is run (not stepwise) if set to 0, defaults to 1.0
+        Order of the ngram model, defaults to 8
     batch_size:int
-        Batch size for Baum-Welch training, defaults to 200
+        Batch size for training, defaults to 1000
     num_iterations:int
         Maximum number of iterations to use in Baum-Welch training, defaults to 10
     smoothing_method:str
@@ -397,58 +651,83 @@ class PhonetisaurusTrainerMixin:
         Pruning method for pruning the ngram model, defaults to "relative_entropy"
     model_size: int
         Target number of ngrams for pruning, defaults to 1000000
+    initial_prune_threshold: float
+        Pruning threshold for calculating the multiple phone/grapheme strings that are to be allowed, defaults to 0.0001
     insertions: bool
         Flag for whether to allow for insertions, default True
     deletions: bool
         Flag for whether to allow for deletions, default True
+    restrict_m2m: bool
+        Flag for whether to restrict possible alignments to one-to-many and disable many-to-many alignments, default False
+    penalize_em: bool
+        Flag for whether many-to-many and one-to-many mappings are penalized over one-to-one mappings during training, default False
+    penalize: bool
+        Flag for whether many-to-many and one-to-many mappings are penalized over one-to-one mappings during export, default False
+    sequence_separator: str
+        Character to use for concatenating and aligning multiple phones or graphemes, defaults to "|"
+    skip: str
+        Character to use to represent deletions or insertions, defaults to "_"
+    alignment_separator: str
+        Character to use for concatenating grapheme strings and phone strings, defaults to ";"
     grapheme_order: int
         Maximum number of graphemes to map to single phones
     phone_order: int
         Maximum number of phones to map to single graphemes
-    fst_default_cache_gc: str
-        String to pass to OpenFst binaries for GC behavior
-    fst_default_cache_gc_limit: str
-        String to pass to OpenFst binaries for GC behavior
+    em_threshold: float
+        Threshold of minimum change for early stopping of EM training
     """
 
     def __init__(
         self,
         order: int = 8,
+        batch_size: int 
= 1000, num_iterations: int = 10, smoothing_method: str = "kneser_ney", pruning_method: str = "relative_entropy", model_size: int = 1000000, + initial_prune_threshold: float = 0.0001, insertions: bool = True, deletions: bool = True, + restrict_m2m: bool = False, + penalize_em: bool = False, + penalize: bool = False, + sequence_separator: str = "|", + skip: str = "_", + alignment_separator: str = ";", grapheme_order: int = 2, phone_order: int = 2, - fst_default_cache_gc="", - fst_default_cache_gc_limit="", + em_threshold: float = 1e-5, **kwargs, ): super().__init__(**kwargs) if not hasattr(self, "_data_source"): self._data_source = None self.order = order + self.batch_size = batch_size self.num_iterations = num_iterations self.smoothing_method = smoothing_method self.pruning_method = pruning_method self.model_size = model_size + self.initial_prune_threshold = initial_prune_threshold self.insertions = insertions self.deletions = deletions - self.fst_default_cache_gc = fst_default_cache_gc - self.fst_default_cache_gc_limit = fst_default_cache_gc_limit self.grapheme_order = grapheme_order self.phone_order = phone_order - self.seq_sep = "|" - self.s1s2_sep = "}" - self.skip = "_" + self.sequence_separator = sequence_separator + self.alignment_separator = alignment_separator + self.skip = skip self.eps = "" - self.restrict = True - self.penalize_em = True - self.penalize = True + self.restrict_m2m = restrict_m2m + self.penalize_em = penalize_em + self.penalize = penalize + self.em_threshold = em_threshold self.g2p_num_training_pronunciations = 0 + self.symbol_table = pynini.SymbolTable() + self.symbol_table.add_symbol(self.eps) + self.total = pynini.Weight.zero("log") + self.prev_total = pynini.Weight.zero("log") + @property def architecture(self) -> str: """Phonetisaurus""" @@ -459,137 +738,111 @@ def initialize_alignments(self) -> None: Initialize alignment FSTs for training """ - self.symbol_table = pynini.SymbolTable() - self.symbol_table.add_symbol(self.eps) - self.symbol_table.add_symbol(self.skip) - self.symbol_table.add_symbol(f"{self.seq_sep}_{self.seq_sep}") - self.symbol_table.add_symbol(self.s1s2_sep) - model_params = [ - "true" if self.deletions else "false", - "true" if self.insertions else "false", - str(self.grapheme_order), - str(self.phone_order), - ] - self.symbol_table.add_symbol("_".join(model_params)) - self.alignment_model: Dict[int, pynini.Weight] = {} - self.prev_alignment_model: Dict[int, pynini.Weight] = {} - self.penalties: Dict[int, LabelData] = {} - self.total = pynini.Weight.zero("log") - self.prev_total = pynini.Weight.zero("log") - self.fsas = [] self.log_info("Creating alignment FSTs...") - with mp.Manager() as manager: - mp_symbol_dict = manager.dict() - mp_reverse_symbol_dict = manager.dict() - lock = mp.Lock() - next_symbol = mp.Value("i", self.symbol_table.num_symbols()) - for i in range(self.symbol_table.num_symbols()): - sym = self.symbol_table.find(i) - mp_symbol_dict[sym] = i - mp_symbol_dict[i] = sym - job_queue = mp.Queue() - return_queue = mp.Queue() - stopped = Stopped() - finished_adding = Stopped() - procs = [] - for i in range(self.num_jobs): - args = AlignmentInitArguments( - os.path.join(self.working_directory, f"{i}.far"), - self.deletions, - self.insertions, - self.restrict, - self.phone_order, - self.grapheme_order, - self.s1s2_sep, - self.seq_sep, - self.skip, - ) - procs.append( - AlignmentInitWorker( - job_queue, - return_queue, - stopped, - finished_adding, - mp_symbol_dict, - mp_reverse_symbol_dict, - next_symbol, - lock, - args, - ) + 
return_queue = mp.Queue() + stopped = Stopped() + finished_adding = Stopped() + procs = [] + for i in range(self.num_jobs): + args = AlignmentInitArguments( + self.db_path, + os.path.join(self.working_log_directory, f"alignment_init.{i}.log"), + os.path.join(self.working_directory, f"{i}.far"), + self.deletions, + self.insertions, + self.restrict_m2m, + self.phone_order, + self.grapheme_order, + self.eps, + self.alignment_separator, + self.sequence_separator, + self.skip, + self.batch_size, + ) + procs.append( + AlignmentInitWorker( + i, + return_queue, + stopped, + finished_adding, + args, ) - procs[i].start() - self.g2p_num_training_pronunciations = 0 - for word, pronunciations in self.g2p_training_dictionary.items(): - graphemes = list(word) - for p in pronunciations: - phones = p.split() - job_queue.put((graphemes, phones)) - self.g2p_num_training_pronunciations += 1 - finished_adding.stop() - error_list = [] - reverse_look_up = {} - with tqdm.tqdm( - total=self.g2p_num_training_pronunciations, disable=getattr(self, "quiet", False) - ) as pbar: - while True: - try: - fst = return_queue.get(timeout=1) - if isinstance(fst, Exception): - error_list.append(fst) - continue - if stopped.stop_check(): - continue - except queue.Empty: - for p in procs: - if not p.finished.stop_check(): - break - else: - break + ) + procs[i].start() + + finished_adding.stop() + error_list = [] + symbols = {} + job_symbols = {} + symbol_id = 1 + with tqdm.tqdm( + total=self.g2p_num_training_pronunciations, disable=getattr(self, "quiet", False) + ) as pbar, self.session(autoflush=False, autocommit=False) as session: + while True: + try: + result = return_queue.get(timeout=2) + if isinstance(result, Exception): + error_list.append(result) continue - for state_id in fst.states(): - for arc in fst.arcs(state_id): - if arc.ilabel not in self.prev_alignment_model: - self.prev_alignment_model[arc.ilabel] = arc.weight - if arc.ilabel not in reverse_look_up: - reverse_look_up[arc.ilabel] = mp_reverse_symbol_dict[ - arc.ilabel - ] - sym = reverse_look_up[arc.ilabel] - d = sym.find("}") - c = sym.find("|") - left_side_count = 1 - right_side_count = 1 - if c != -1: - if c < d: - left_side_count += 1 - else: - right_side_count += 1 - max_count = max(left_side_count, right_side_count) - self.penalties[arc.ilabel] = LabelData( - tot=left_side_count + right_side_count, - max=max_count, - lhs=left_side_count, - rhs=right_side_count, - lhsE=False, - rhsE=False, - ) - else: - self.prev_alignment_model[arc.ilabel] = pynini.plus( - self.prev_alignment_model[arc.ilabel], arc.weight - ) - self.total = pynini.plus(self.total, arc.weight) - pbar.update(1) + if stopped.stop_check(): + continue + except queue.Empty: + for p in procs: + if not p.finished.stop_check(): + break + else: + break + continue + job_name, weights, count = result + for symbol, weight in weights.items(): + weight = pynini.Weight("log", weight) + if symbol not in symbols: + left_side, right_side = symbol.split(self.alignment_separator) + if left_side == self.skip: + left_side_order = 0 + else: + left_side_order = 1 + left_side.count(self.sequence_separator) + if right_side == self.skip: + right_side_order = 0 + else: + right_side_order = 1 + right_side.count(self.sequence_separator) + max_order = max(left_side_order, right_side_order) + total_order = left_side_order + right_side_order + symbols[symbol] = { + "symbol": symbol, + "id": symbol_id, + "total_order": total_order, + "max_order": max_order, + "grapheme_order": left_side_order, + "phone_order": 
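
# --- Illustrative sketch: decomposing an alignment symbol, as in the loop above ---
# (hypothetical symbols; ";" is the alignment separator, "|" the sequence
# separator, and "_" the skip marker, matching the defaults in this patch).
def side_order(side: str, skip: str = "_", sequence_separator: str = "|") -> int:
    # A skip contributes order 0; otherwise order is 1 + number of separators.
    if side == skip:
        return 0
    return 1 + side.count(sequence_separator)

for symbol in ("s;s", "s|h;sh", "_;ah", "e;_"):
    grapheme_side, phone_side = symbol.split(";")
    print(symbol, side_order(grapheme_side), side_order(phone_side))
# "s|h;sh": graphemes s+h map to one phone -> grapheme order 2, phone order 1
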
right_side_order, + "weight": weight, + } + symbol_id += 1 + else: + symbols[symbol]["weight"] = pynini.plus(symbols[symbol]["weight"], weight) + self.total = pynini.plus(self.total, weight) + if job_name not in job_symbols: + job_symbols[job_name] = set() + job_symbols[job_name].add(symbols[symbol]["id"]) + pbar.update(count) for p in procs: p.join() - for sym, key in mp_symbol_dict.items(): - if self.symbol_table.find(sym) == pynini.NO_SYMBOL: - self.symbol_table.add_symbol(sym, key=key) - if error_list: - for v in error_list: - raise v - - def maximization(self) -> float: + if error_list: + for v in error_list: + raise v + self.log_debug(f"Total of {len(symbols)} symbols, initial total: {self.total}") + session.bulk_insert_mappings(M2MSymbol, [x for x in symbols.values()]) + session.flush() + del symbols + mappings = [] + for j, sym_ids in job_symbols.items(): + mappings.extend({"m2m_id": x, "job_id": j} for x in sym_ids) + session.bulk_insert_mappings(M2M2Job, mappings) + + session.commit() + + def maximization(self, last_iteration=False) -> float: """ Run the maximization step for training @@ -599,25 +852,28 @@ def maximization(self) -> float: Current iteration's score """ self.log_info("Performing maximization step...") - cond = False change = abs(float(self.total) - float(self.prev_total)) - zero = pynini.Weight.zero("log") - if not cond: - self.prev_total = self.total - for ilabel, weight in self.prev_alignment_model.items(): - self.alignment_model[ilabel] = pynini.divide(weight, self.total) - self.prev_alignment_model[ilabel] = zero + self.log_debug(f"Previous total: {float(self.prev_total)}") + self.log_debug(f"Current total: {float(self.total)}") + self.log_debug(f"Change: {change}") + + self.prev_total = self.total + with self.session(autoflush=False, autocommit=False) as session: + session.query(M2MSymbol).update( + {"weight": M2MSymbol.weight - float(self.total)}, synchronize_session=False + ) + session.commit() return_queue = mp.Queue() stopped = Stopped() procs = [] for i in range(self.num_jobs): args = MaximizationArguments( + self.db_path, os.path.join(self.working_directory, f"{i}.far"), - self.alignment_model, self.penalize_em, - self.penalties, + self.batch_size, ) - procs.append(MaximizationWorker(return_queue, stopped, args)) + procs.append(MaximizationWorker(i, return_queue, stopped, args)) procs[i].start() error_list = [] @@ -639,14 +895,19 @@ def maximization(self) -> float: else: break continue - pbar.update(1) + + pbar.update(result) for p in procs: p.join() if error_list: for v in error_list: raise v - self.total = zero + if not last_iteration and change >= self.em_threshold: # we're still converging + self.total = pynini.Weight.zero("log") + with self.session(autoflush=False, autocommit=False) as session: + session.query(M2MSymbol).update({"weight": 0.0}) + session.commit() self.log_info(f"Maximization done! 
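
# --- Illustrative sketch: why the maximization update subtracts float(self.total) ---
# (made-up numbers).  Weights are stored as negative logs, so dividing every
# expected count by the grand total is a per-row subtraction of the total's
# log value, which is what the SQL UPDATE above performs in bulk.
import math

total = -math.log(4.0)  # -log of the summed expected counts
raw = {"s;s": -math.log(3.0), "s;z": -math.log(1.0)}
normalized = {sym: w - total for sym, w in raw.items()}  # mirrors the UPDATE
assert abs(math.exp(-normalized["s;s"]) - 0.75) < 1e-9  # P = 3/4
assert abs(math.exp(-normalized["s;z"]) - 0.25) < 1e-9  # P = 1/4
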
Change from last iteration was {change:.3f}") return change @@ -660,13 +921,13 @@ def expectation(self) -> None: error_list = [] procs = [] for i in range(self.num_jobs): - procs.append( - ExpectationWorker( - os.path.join(self.working_directory, f"{i}.far"), return_queue, stopped - ) + args = ExpectationArguments( + self.db_path, os.path.join(self.working_directory, f"{i}.far"), self.batch_size ) + procs.append(ExpectationWorker(i, return_queue, stopped, args)) procs[i].start() - + mappings = {} + zero = pynini.Weight.zero("log") with tqdm.tqdm( total=self.g2p_num_training_pronunciations, disable=getattr(self, "quiet", False) ) as pbar: @@ -678,7 +939,6 @@ def expectation(self) -> None: continue if stopped.stop_check(): continue - index, data = result except queue.Empty: for p in procs: if not p.finished.stop_check(): @@ -686,19 +946,25 @@ def expectation(self) -> None: else: break continue - for ilabel, gamma in data.items(): + result, count = result + for sym_id, gamma in result.items(): gamma = pynini.Weight("log", gamma) - self.prev_alignment_model[ilabel] = pynini.plus( - self.prev_alignment_model[ilabel], gamma - ) + if sym_id not in mappings: + mappings[sym_id] = zero + mappings[sym_id] = pynini.plus(mappings[sym_id], gamma) self.total = pynini.plus(self.total, gamma) - pbar.update(1) + pbar.update(count) for p in procs: p.join() if error_list: for v in error_list: raise v + with self.session() as session: + session.bulk_update_mappings( + M2MSymbol, [{"id": k, "weight": v} for k, v in mappings.items()] + ) + session.commit() self.log_info("Expectation done!") def train_ngram_model(self) -> None: @@ -708,29 +974,66 @@ def train_ngram_model(self) -> None: if os.path.exists(self.fst_path): self.log_info("Ngram model already exists.") return + self.log_info("Generating ngram counts...") + return_queue = mp.Queue() + stopped = Stopped() + error_list = [] + procs = [] + count_paths = [] + for i in range(self.num_jobs): + args = NgramCountArguments( + os.path.join(self.working_log_directory, f"ngram_count.{i}.log"), + os.path.join(self.working_directory, f"{i}.far"), + self.alignment_symbols_path, + self.order, + ) + procs.append(NgramCountWorker(return_queue, stopped, args)) + count_paths.append(args.far_path.replace(".far", ".cnts")) + procs[i].start() + + with tqdm.tqdm( + total=self.g2p_num_training_pronunciations, disable=getattr(self, "quiet", False) + ) as pbar: + while True: + try: + result = return_queue.get(timeout=1) + if isinstance(result, Exception): + error_list.append(result) + continue + if stopped.stop_check(): + continue + except queue.Empty: + for p in procs: + if not p.finished.stop_check(): + break + else: + break + continue + pbar.update(1) + for p in procs: + p.join() + + if error_list: + for v in error_list: + raise v + self.log_info("Done counting ngrams!") + self.log_info("Training ngram model...") with open( os.path.join(self.working_log_directory, "model.log"), "w", encoding="utf8" ) as logf: - ngramcount_proc = subprocess.Popen( - [ - thirdparty_binary("ngramcount"), - "--require_symbols=false", - "--round_to_int", - f"--order={self.order}", - self.far_path, - ], + ngrammerge_proc = subprocess.Popen( + [thirdparty_binary("ngrammerge"), *count_paths], stderr=logf, stdout=subprocess.PIPE, env=os.environ, ) - ngramcount_proc.communicate() ngrammake_proc = subprocess.Popen( [ thirdparty_binary("ngrammake"), f"--method={self.smoothing_method}", ], - stdin=ngramcount_proc.stdout, + stdin=ngrammerge_proc.stdout, stderr=logf, stdout=subprocess.PIPE, env=os.environ, 
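
# --- Illustrative sketch: the count-merge-then-smooth pipeline built above ---
# (assumes the OpenGrm ngram binaries are on PATH and that per-job count
# files like 0.cnts/1.cnts already exist; the real code resolves binaries via
# thirdparty_binary and collects the count paths from its workers).
import subprocess

count_paths = ["0.cnts", "1.cnts"]
merge_proc = subprocess.Popen(
    ["ngrammerge", *count_paths],
    stdout=subprocess.PIPE,
)
with open("ngram_model.fst", "wb") as out_f:
    make_proc = subprocess.Popen(
        ["ngrammake", "--method=kneser_ney"],
        stdin=merge_proc.stdout,
        stdout=out_f,
    )
    merge_proc.stdout.close()  # let SIGPIPE reach ngrammerge if ngrammake exits
    make_proc.communicate()
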
@@ -754,11 +1057,11 @@ def train_ngram_model(self) -> None: ngram_fst = pynini.Fst.read(self.ngram_path) grapheme_symbols = pynini.SymbolTable() grapheme_symbols.add_symbol(self.eps) - grapheme_symbols.add_symbol(self.seq_sep) + grapheme_symbols.add_symbol(self.sequence_separator) grapheme_symbols.add_symbol(self.skip) phone_symbols = pynini.SymbolTable() phone_symbols.add_symbol(self.eps) - phone_symbols.add_symbol(self.seq_sep) + phone_symbols.add_symbol(self.sequence_separator) phone_symbols.add_symbol(self.skip) single_phone_symbols = pynini.SymbolTable() single_phone_symbols.add_symbol(self.eps) @@ -774,50 +1077,57 @@ def train_ngram_model(self) -> None: arc = maiter.value() symbol = self.symbol_table.find(arc.ilabel) try: - grapheme, phone = symbol.split(self.s1s2_sep) - g_symbol = grapheme_symbols.find(grapheme) - if g_symbol == pynini.NO_SYMBOL: - g_symbol = grapheme_symbols.add_symbol(grapheme) - p_symbol = phone_symbols.find(phone) - if p_symbol == pynini.NO_SYMBOL: - p_symbol = phone_symbols.add_symbol(phone) - singles = phone.split(self.seq_sep) - for i, s in enumerate(singles): - s_symbol = single_phone_symbols.find(s) - - if s_symbol == pynini.NO_SYMBOL: - s_symbol = single_phone_symbols.add_symbol(s) - if i == 0: - single_start = start_state - else: - single_start = current_ind - if i < len(singles) - 1: - current_ind = single_phone_fst.add_state() - end_state = current_ind - else: - end_state = start_state - single_phone_fst.add_arc( - single_start, - pywrapfst.Arc( - p_symbol if i == 0 else 0, s_symbol, one, end_state - ), - ) + grapheme, phone = symbol.split(self.alignment_separator) + if grapheme == self.skip: + g_symbol = grapheme_symbols.find(self.eps) + else: + g_symbol = grapheme_symbols.find(grapheme) + if g_symbol == pynini.NO_SYMBOL: + g_symbol = grapheme_symbols.add_symbol(grapheme) + if phone == self.skip: + p_symbol = phone_symbols.find(self.eps) + else: + p_symbol = phone_symbols.find(phone) + if p_symbol == pynini.NO_SYMBOL: + p_symbol = phone_symbols.add_symbol(phone) + singles = phone.split(self.sequence_separator) + for i, s in enumerate(singles): + s_symbol = single_phone_symbols.find(s) + + if s_symbol == pynini.NO_SYMBOL: + s_symbol = single_phone_symbols.add_symbol(s) + if i == 0: + single_start = start_state + else: + single_start = current_ind + if i < len(singles) - 1: + current_ind = single_phone_fst.add_state() + end_state = current_ind + else: + end_state = start_state + single_phone_fst.add_arc( + single_start, + pywrapfst.Arc( + p_symbol if i == 0 else 0, s_symbol, one, end_state + ), + ) arc = pywrapfst.Arc(g_symbol, p_symbol, arc.weight, arc.nextstate) maiter.set_value(arc) except ValueError: - if symbol == "": + if symbol in {"", "", ""}: arc = pywrapfst.Arc(0, 0, arc.weight, arc.nextstate) maiter.set_value(arc) else: + print(symbol) raise - pass - next(maiter) + finally: + next(maiter) for i in range(grapheme_symbols.num_symbols()): sym = grapheme_symbols.find(i) - if sym in {self.eps, self.seq_sep, self.skip}: + if sym in {self.eps, self.sequence_separator, self.skip}: continue - parts = sym.split(self.seq_sep) + parts = sym.split(self.sequence_separator) if len(parts) > 1: for s in parts: if grapheme_symbols.find(s) == pynini.NO_SYMBOL: @@ -826,9 +1136,9 @@ def train_ngram_model(self) -> None: for i in range(phone_symbols.num_symbols()): sym = phone_symbols.find(i) - if sym in {self.eps, self.seq_sep, self.skip}: + if sym in {self.eps, self.sequence_separator, self.skip}: continue - parts = sym.split(self.seq_sep) + parts = 
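
# --- Illustrative sketch: the arc-relabeling pattern used in the loop above ---
# (a toy one-arc machine with hypothetical symbols; assumes pynini/pywrapfst).
# Each arc label is looked up in the alignment symbol table, split into its
# grapheme and phone halves, and rewritten against per-side symbol tables.
import pynini
import pywrapfst

align_syms = pynini.SymbolTable()
align_syms.add_symbol("<eps>")
a = align_syms.add_symbol("a;x")  # grapheme "a" aligned to phone "x"

fst = pynini.Fst()
s0, s1 = fst.add_state(), fst.add_state()
fst.set_start(s0)
fst.set_final(s1)
fst.add_arc(s0, pywrapfst.Arc(a, a, pynini.Weight.one(fst.weight_type()), s1))

graphemes, phones = pynini.SymbolTable(), pynini.SymbolTable()
graphemes.add_symbol("<eps>")
phones.add_symbol("<eps>")
for state in fst.states():
    maiter = fst.mutable_arcs(state)
    while not maiter.done():
        arc = maiter.value()
        g, p = align_syms.find(arc.ilabel).split(";")
        maiter.set_value(
            pywrapfst.Arc(
                graphemes.add_symbol(g), phones.add_symbol(p), arc.weight, arc.nextstate
            )
        )
        next(maiter)
fst.set_input_symbols(graphemes)
fst.set_output_symbols(phones)
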
sym.split(self.sequence_separator) if len(parts) > 1: for s in parts: if phone_symbols.find(s) == pynini.NO_SYMBOL: @@ -849,7 +1159,7 @@ def train_alignments(self) -> None: """ Run an Expectation-Maximization (EM) training on alignment FSTs to generate well-aligned FSTs for ngram modeling """ - if os.path.exists(self.far_path): + if os.path.exists(self.alignment_model_path): self.log_info("Using existing alignments.") self.symbol_table = pynini.SymbolTable.read_text(self.alignment_symbols_path) return @@ -860,10 +1170,9 @@ def train_alignments(self) -> None: for i in range(self.num_iterations): self.log_info(f"Iteration {i}") self.expectation() - change = self.maximization() - if change < 1e-10: + change = self.maximization(last_iteration=i == self.num_iterations - 1) + if change < self.em_threshold: break - self.export_alignments() @property def data_directory(self) -> str: @@ -905,7 +1214,6 @@ def export_model(self, output_model_path: str) -> None: basename, _ = os.path.splitext(output_model_path) model.dump(basename) model.clean_up() - # self.clean_up() self.log_info(f"Saved model to {output_model_path}") @property @@ -948,63 +1256,71 @@ def export_alignments(self) -> None: Combine alignment training archives to a final combined FST archive to train the ngram model """ self.log_info("Exporting final alignments...") - model = pynini.Fst(arc_type="log") - model.add_state() - model.set_start(0) - model.set_final(0, pynini.Weight.one(model.arc_type())) - for ilabel, weight in self.alignment_model.items(): - model.add_arc(0, pywrapfst.Arc(ilabel, ilabel, weight, 0)) - model.set_input_symbols(self.symbol_table) - model.write(self.alignment_model_path) - set_symbols = False - self.symbol_table.write_text(self.alignment_symbols_path) - far_writer = pywrapfst.FarWriter.create(self.far_path) - zero = pynini.Weight.zero("log") - one = pynini.Weight.one("log") - index = 0 + + return_queue = mp.Queue() + stopped = Stopped() + error_list = [] + procs = [] + count_paths = [] + for i in range(self.num_jobs): + args = AlignmentExportArguments( + self.db_path, + os.path.join(self.working_log_directory, f"ngram_count.{i}.log"), + os.path.join(self.working_directory, f"{i}.far"), + self.penalize, + ) + procs.append(AlignmentExporter(return_queue, stopped, args)) + count_paths.append(args.far_path.replace(".far", ".cnts")) + procs[i].start() + with tqdm.tqdm( total=self.g2p_num_training_pronunciations, disable=getattr(self, "quiet", False) ) as pbar: - for i in range(self.num_jobs): - far_reader = pywrapfst.FarReader.open( - os.path.join(self.working_directory, f"{i}.far") - ) - while not far_reader.done(): - fst = far_reader.get_fst() - tfst = pynini.arcmap( - pynini.Fst.read_from_string(fst.write_to_string()), map_type="to_std" - ) - if self.penalize: - for state in tfst.states(): - maiter = tfst.mutable_arcs(state) - while not maiter.done(): - arc = maiter.value() - ld = self.penalties[arc.ilabel] - if ld.lhs > 1 and ld.rhs > 1: - arc.weight = pynini.Weight(tfst.weight_type(), 999) - else: - arc.weight = pynini.Weight( - tfst.weight_type(), float(arc.weight) * ld.max - ) - maiter.set_value(arc) - next(maiter) - tfst = rewrite.lattice_to_dfa(tfst, True, 4).project("output").rmepsilon() - lfst = pynini.arcmap(tfst, map_type="to_log") - pfst = pynini.push(lfst, reweight_type="to_final", push_weights=True) - for state in pfst.states(): - if pfst.final(state) != zero: - pfst.set_final(state, one) - lattice = pynini.arcmap(pfst, map_type="to_std") - - if not set_symbols: - 
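
# --- Illustrative sketch: the EM driver shape used by train_alignments above ---
# (stand-in step functions; the real expectation/maximization spin up worker
# pools and accumulate pynini log weights).
def run_em(expectation, maximization, num_iterations=10, em_threshold=1e-5):
    for i in range(num_iterations):
        expectation()
        change = maximization(last_iteration=i == num_iterations - 1)
        if change < em_threshold:  # converged early
            break

run_em(lambda: None, lambda last_iteration: 0.0)  # trivially converges at once
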
lattice.set_input_symbols(self.symbol_table) - lattice.set_output_symbols(self.symbol_table) - set_symbols = True - key = f"{index:08x}" - far_writer[key] = lattice - pbar.update(1) - index += 1 - next(far_reader) + while True: + try: + result = return_queue.get(timeout=1) + if isinstance(result, Exception): + error_list.append(result) + continue + if stopped.stop_check(): + continue + except queue.Empty: + for p in procs: + if not p.finished.stop_check(): + break + else: + break + continue + pbar.update(1) + for p in procs: + p.join() + + if error_list: + for v in error_list: + raise v + + symbols_proc = subprocess.Popen( + [ + thirdparty_binary("ngramsymbols"), + "--OOV_symbol=", + "--epsilon_symbol=", + "-", + self.alignment_symbols_path, + ], + encoding="utf8", + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + for j in range(self.num_jobs): + text_path = os.path.join(self.working_directory, f"{j}.far.strings") + with open(text_path, "r", encoding="utf8") as f: + for line in f: + symbols_proc.stdin.write(line) + symbols_proc.stdin.flush() + symbols_proc.stdin.close() + symbols_proc.wait() + self.symbol_table = pynini.SymbolTable.read_text(self.alignment_symbols_path) + self.log_info("Done exporting alignments!") class PhonetisaurusTrainer( @@ -1060,6 +1376,7 @@ def train(self) -> None: self.log_debug( f"Aligning {len(self.g2p_training_dictionary)} words took {time.time() - begin} seconds" ) + self.export_alignments() begin = time.time() self.train_ngram_model() self.log_debug( @@ -1087,7 +1404,7 @@ def meta(self) -> MetaDict: "graphemes": self.g2p_training_graphemes, "grapheme_order": self.grapheme_order, "phone_order": self.phone_order, - "seq_sep": self.seq_sep, + "sequence_separator": self.sequence_separator, "evaluation": {}, "training": { "num_words": len(self.g2p_training_dictionary), @@ -1127,60 +1444,259 @@ def evaluate_g2p_model(self) -> None: f.write(f"{orthography}\t{p}\n") self.compute_validation_errors(output) + def compute_initial_ngrams(self): + word_path = os.path.join(self.working_directory, "words.txt") + word_ngram_path = os.path.join(self.working_directory, "grapheme_ngram.fst") + word_symbols_path = os.path.join(self.working_directory, "grapheme_ngram.syms") + symbols_proc = subprocess.Popen( + [ + thirdparty_binary("ngramsymbols"), + "--OOV_symbol=", + "--epsilon_symbol=", + word_path, + word_symbols_path, + ], + encoding="utf8", + ) + symbols_proc.communicate() + farcompile_proc = subprocess.Popen( + [ + thirdparty_binary("farcompilestrings"), + "--token_type=symbol", + f"--symbols={word_symbols_path}", + word_path, + ], + stdout=subprocess.PIPE, + env=os.environ, + ) + ngramcount_proc = subprocess.Popen( + [ + thirdparty_binary("ngramcount"), + "--require_symbols=false", + "--round_to_int", + f"--order={self.grapheme_order}", + ], + stdin=farcompile_proc.stdout, + stdout=subprocess.PIPE, + env=os.environ, + ) + ngrammake_proc = subprocess.Popen( + [ + thirdparty_binary("ngrammake"), + f"--method={self.smoothing_method}", + ], + stdin=ngramcount_proc.stdout, + stdout=subprocess.PIPE, + env=os.environ, + ) + + ngramshrink_proc = subprocess.Popen( + [ + thirdparty_binary("ngramshrink"), + f"--method={self.pruning_method}", + f"--theta={self.initial_prune_threshold}", + ], + stdin=ngrammake_proc.stdout, + stdout=subprocess.PIPE, + env=os.environ, + ) + print_proc = subprocess.Popen( + [ + thirdparty_binary("ngramprint"), + f"--symbols={word_symbols_path}", + ], + env=os.environ, + stdin=ngramshrink_proc.stdout, + stdout=subprocess.PIPE, + encoding="utf8", + ) 
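
# --- Illustrative sketch: what the parsing loop below extracts from ngramprint ---
# (a made-up output line; the trailing field is the ngram's score, and the
# filtered bookkeeping tokens are assumed here to be <s> and </s>).
line = "s h\t-0.6931"
tokens = line.strip().split()[:-1]  # drop the score
ngram = "|".join(t for t in tokens if t not in {"<s>", "</s>"})
assert ngram == "s|h"  # an order-2 grapheme cluster worth keeping
# Unigrams are skipped below because they contain no "|" separator.
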
+ ngrams = set() + for line in print_proc.stdout: + line = line.strip().split()[:-1] + ngram = self.sequence_separator.join(x for x in line if x not in {"", ""}) + if self.sequence_separator not in ngram: + continue + ngrams.add(ngram) + + print_proc.wait() + with open(word_ngram_path.replace(".fst", ".ngrams"), "w", encoding="utf8") as f: + for ngram in sorted(ngrams): + f.write(f"{ngram}\n") + + phone_path = os.path.join(self.working_directory, "pronunciations.txt") + phone_ngram_path = os.path.join(self.working_directory, "phone_ngram.fst") + phone_symbols_path = os.path.join(self.working_directory, "phone_ngram.syms") + symbols_proc = subprocess.Popen( + [ + thirdparty_binary("ngramsymbols"), + "--OOV_symbol=", + "--epsilon_symbol=", + phone_path, + phone_symbols_path, + ], + encoding="utf8", + ) + symbols_proc.communicate() + farcompile_proc = subprocess.Popen( + [ + thirdparty_binary("farcompilestrings"), + "--token_type=symbol", + f"--symbols={phone_symbols_path}", + phone_path, + ], + stdout=subprocess.PIPE, + env=os.environ, + ) + ngramcount_proc = subprocess.Popen( + [ + thirdparty_binary("ngramcount"), + "--require_symbols=false", + "--round_to_int", + f"--order={self.phone_order}", + ], + stdin=farcompile_proc.stdout, + stdout=subprocess.PIPE, + env=os.environ, + ) + ngrammake_proc = subprocess.Popen( + [ + thirdparty_binary("ngrammake"), + f"--method={self.smoothing_method}", + ], + stdin=ngramcount_proc.stdout, + stdout=subprocess.PIPE, + env=os.environ, + ) + + ngramshrink_proc = subprocess.Popen( + [ + thirdparty_binary("ngramshrink"), + f"--method={self.pruning_method}", + f"--theta={self.initial_prune_threshold}", + ], + stdin=ngrammake_proc.stdout, + stdout=subprocess.PIPE, + env=os.environ, + ) + print_proc = subprocess.Popen( + [thirdparty_binary("ngramprint"), f"--symbols={phone_symbols_path}"], + env=os.environ, + stdin=ngramshrink_proc.stdout, + stdout=subprocess.PIPE, + encoding="utf8", + ) + ngrams = set() + for line in print_proc.stdout: + line = line.strip().split()[:-1] + ngram = self.sequence_separator.join(x for x in line if x not in {"", ""}) + if self.sequence_separator not in ngram: + continue + ngrams.add(ngram) + + print_proc.wait() + with open(phone_ngram_path.replace(".fst", ".ngrams"), "w", encoding="utf8") as f: + for ngram in sorted(ngrams): + f.write(f"{ngram}\n") + def initialize_training(self) -> None: """Initialize training G2P model""" with self.session() as session: - self.g2p_training_dictionary = {} - pronunciations = ( - session.query(Word.word, Pronunciation.pronunciation) - .join(Pronunciation.word) - .filter(Word.word_type.in_([WordType.speech, WordType.clitic])) + session.query(Job).delete() + session.commit() + + job_objs = [{"id": j} for j in range(self.num_jobs)] + self.g2p_num_training_pronunciations = 0 + self.g2p_num_validation_pronunciations = 0 + self.g2p_num_training_words = 0 + self.g2p_num_validation_words = 0 + # Below we partition sorted list of words to try to have each process handling different symbol tables + # so they're not completely overlapping and using more memory + num_words = session.query(Word.id).count() + words_per_job = int(num_words / self.num_jobs) + current_job = 0 + words = session.query(Word.id).filter( + Word.word_type.in_([WordType.speech, WordType.clitic]) ) - for w, p in pronunciations: - if w not in self.g2p_training_dictionary: - self.g2p_training_dictionary[w] = set() - self.g2p_training_dictionary[w].add(p) + mappings = [] + for i, (w,) in enumerate(words): + if i >= (current_job + 1) * 
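
# --- Illustrative sketch: contiguous word partitioning like the surrounding loop ---
# (toy data; the guard here keeps the final job id in range, so the last job
# simply absorbs any remainder of the integer division).
num_jobs = 3
word_ids = list(range(10))
words_per_job = int(len(word_ids) / num_jobs)
current_job = 0
assignment = {}
for i, w in enumerate(word_ids):
    if i >= (current_job + 1) * words_per_job and current_job != num_jobs - 1:
        current_job += 1
    assignment[w] = current_job
# jobs 0 and 1 take 3 words each; job 2 takes the remaining 4
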
words_per_job and current_job != self.num_jobs: + current_job += 1 + mappings.append({"word_id": w, "job_id": current_job, "training": 1}) + with session.bind.begin() as conn: + conn.execute(sqlalchemy.insert(Job.__table__), job_objs) + conn.execute(sqlalchemy.insert(Word2Job.__table__), mappings) + + session.commit() if self.evaluation_mode: - word_dict = self.g2p_training_dictionary - words = sorted(word_dict.keys()) - total_items = len(words) - validation_items = int(total_items * self.validation_proportion) - validation_words = set(random.sample(words, validation_items)) - self.g2p_training_dictionary = { - k: v for k, v in word_dict.items() if k not in validation_words - } - self.g2p_validation_dictionary = { - k: v for k, v in word_dict.items() if k in validation_words - } - if self.debug: - with open( - os.path.join(self.working_directory, "validation_set.txt"), - "w", - encoding="utf8", - ) as f: - for word in self.g2p_validation_dictionary: - f.write(word + "\n") + validation_items = int(num_words * self.validation_proportion) + validation_words = ( + sqlalchemy.select(Word.id) + .order_by(sqlalchemy.func.random()) + .limit(validation_items) + .scalar_subquery() + ) + query = ( + sqlalchemy.update(Word2Job) + .execution_options(synchronize_session="fetch") + .values(training=False) + .where(Word2Job.word_id.in_(validation_words)) + ) + session.execute(query) + session.flush() + query = ( + session.query(Word.word, Pronunciation.pronunciation) + .join(Pronunciation.word) + .join(Word.job) + .filter(Word2Job.training == False) # noqa + ) + for word, pronunciation in query: + self.g2p_validation_graphemes.update(word) + self.g2p_validation_phones.update(pronunciation.split()) + self.g2p_num_validation_pronunciations += 1 + self.g2p_num_validation_words = ( + session.query(Word2Job.word_id) + .filter(Word2Job.training == False) # noqa + .count() + ) + grapheme_count = 0 phone_count = 0 self.character_sets = set() - for word, pronunciations in self.g2p_training_dictionary.items(): - # if re.match(r"\W", word) is not None: - # continue - word = list(word) - grapheme_count += len(word) - self.g2p_training_graphemes.update(word) - for p in pronunciations: - self.g2p_training_phones.update(p.split()) - phone_count += len(p.split()) + query = ( + session.query(Pronunciation.pronunciation, Word.word) + .join(Pronunciation.word) + .join(Word.job) + .filter(Word2Job.training == True) # noqa + ) + with open( + os.path.join(self.working_directory, "words.txt"), "w", encoding="utf8" + ) as word_f, open( + os.path.join(self.working_directory, "pronunciations.txt"), "w", encoding="utf8" + ) as phone_f: + for pronunciation, word in query: + word = list(word) + grapheme_count += len(word) + self.g2p_training_graphemes.update(word) + self.g2p_num_training_pronunciations += 1 + self.g2p_training_phones.update(pronunciation.split()) + phone_count += len(pronunciation.split()) + word_f.write(" ".join(word) + "\n") + phone_f.write(pronunciation + "\n") + self.g2p_num_training_words = ( + session.query(Word2Job.word_id).filter(Word2Job.training == True).count() # noqa + ) self.log_debug(f"Graphemes in training data: {sorted(self.g2p_training_graphemes)}") self.log_debug(f"Phones in training data: {sorted(self.g2p_training_phones)}") self.log_debug(f"Averages phones per grapheme: {phone_count / grapheme_count}") + if self.sequence_separator in self.g2p_training_phones | self.g2p_training_graphemes: + raise PhonetisaurusSymbolError(self.sequence_separator, "sequence_separator") + if self.skip in 
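
# --- Illustrative sketch: the separator-collision validation around this point ---
# (ValueError stands in for the patch's PhonetisaurusSymbolError).  If a
# training grapheme or phone equals one of the special characters, alignment
# symbols could no longer be parsed unambiguously, so training refuses to start.
def validate_special_symbols(training_symbols: set, special: dict) -> None:
    for name, char in special.items():
        if char in training_symbols:
            raise ValueError(f"{char!r} is reserved as the {name}")

validate_special_symbols(
    {"a", "b", "ah", "sh"},
    {"sequence_separator": "|", "skip": "_", "alignment_separator": ";"},
)  # passes; a training symbol equal to "|", "_" or ";" would raise
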
self.g2p_training_phones | self.g2p_training_graphemes: + raise PhonetisaurusSymbolError(self.skip, "skip") + if self.alignment_separator in self.g2p_training_phones | self.g2p_training_graphemes: + raise PhonetisaurusSymbolError(self.alignment_separator, "alignment_separator") + if self.evaluation_mode: - for word, pronunciations in self.g2p_validation_dictionary.items(): - self.g2p_validation_graphemes.update(word) - for p in pronunciations: - self.g2p_validation_phones.update(p.split()) self.log_debug( f"Graphemes in validation data: {sorted(self.g2p_validation_graphemes)}" ) @@ -1195,3 +1711,4 @@ def initialize_training(self) -> None: self.log_warning( f"The following phones appear only in the validation set: {', '.join(phone_diff)}" ) + self.compute_initial_ngrams() diff --git a/tests/data/dictionaries/mixed_format_dictionary.txt b/tests/data/dictionaries/mixed_format_dictionary.txt index f54ca62f..f27ab555 100644 --- a/tests/data/dictionaries/mixed_format_dictionary.txt +++ b/tests/data/dictionaries/mixed_format_dictionary.txt @@ -1,9 +1,9 @@ 'm 1.0 m ’m m -i’m 0.01 ay m ih -this 1.0 0.43 1.23 0.85 dh ih s -is 1.0 0.5 1.0 1.0 ih z -the 1.0 0.5 1.0 1.0 dh ah +i’m 0.01 ay m ih +this 1.0 0.43 1.23 0.85 dh ih s +is 1.0 0.5 1.0 1.0 ih z +the 1.0 0.5 1.0 1.0 dh ah acoustic ah k uw s t ih k corpus k ao r p us i'm ay m
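
# --- Illustrative sketch: parsing the mixed-format dictionary entries above ---
# (assumes the word is tab-separated from the rest, that any leading numeric
# fields are pronunciation/silence probabilities, and that the remaining
# fields are phones; splitting on the first tab only is what preserves
# whitespace inside words like "'m").
def parse_dictionary_line(line: str):
    word, remainder = line.rstrip("\n").split("\t", 1)
    fields = remainder.split()
    probabilities = []
    while fields:
        try:
            probabilities.append(float(fields[0]))
        except ValueError:
            break  # first non-numeric field starts the pronunciation
        fields.pop(0)
    return word, probabilities, fields

print(parse_dictionary_line("this\t1.0 0.43 1.23 0.85 dh ih s"))
# ('this', [1.0, 0.43, 1.23, 0.85], ['dh', 'ih', 's'])
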