diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a80e6bcc..649b9476 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,26 +34,3 @@ jobs: with: file: ./coverage.xml fail_ci_if_error: true - - name: Build a binary wheel and a source tarball - run: >- - python -m - build - --sdist - --wheel - --outdir dist/ - . - - name: "List result" - run: "ls -l dist" - - name: "Check long_description" - run: "python -m twine check dist/*" - - name: Publish distribution 📦 to PyPI - if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@master - with: - password: ${{ secrets.PYPI_API_TOKEN }} - - name: Publish distribution 📦 to Test PyPI - continue-on-error: true - uses: pypa/gh-action-pypi-publish@master - with: - password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository_url: https://test.pypi.org/legacy/ diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 00000000..0d3e9b95 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,70 @@ +name: Publish Python distributions to PyPI and TestPyPI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build-n-publish: + name: Build and publish to PyPI and TestPyPI + runs-on: ubuntu-latest + defaults: + run: + # Adding -l {0} helps ensure conda can be found properly. 
+ shell: bash -l {0} + env: + ENV_NAME: mfa_publish + PYTHON: 3.8 + steps: + - uses: actions/checkout@main + with: + fetch-depth: 0 + + - name: Setup Miniconda + uses: conda-incubator/setup-miniconda@v2.0.0 + with: + auto-update-conda: true + miniconda-version: "latest" + python-version: ${{ env.PYTHON }} + environment-file: ci/${{ env.ENV_NAME }}.yml + activate-environment: ${{ env.ENV_NAME }} + + - name: Conda Info + run: | + conda info -a + conda list + PYVER=`python -c "import sys; print('{:d}.{:d}'.format(sys.version_info.major, sys.version_info.minor))"` + if [[ $PYVER != ${{ env.PYTHON }} ]]; then + exit 1; + fi + - name: Create environment variable + if: startsWith(github.ref, 'refs/tags/') != true + run: | + wget https://gist.github.com/plaplant/0902f09e59166bac742bbd554f3cd2f9/raw/make_dev_version.sh + version=$(bash make_dev_version.sh) + echo "SETUPTOOLS_SCM_PRETEND_VERSION=$version" >> $GITHUB_ENV + - name: Check environment variable + run: echo $SETUPTOOLS_SCM_PRETEND_VERSION + + - name: Build a binary wheel and a source tarball + run: >- + python -m + build + --sdist + --wheel + --outdir dist/ + . 
+ - name: Publish to Test PyPI + if: startsWith(github.event.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@master + with: + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository_url: https://test.pypi.org/legacy/ + + - name: Publish to PyPI + if: startsWith(github.event.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@master + with: + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index 616237db..3b0108c2 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ pretrained_models/ # Distribution / packaging montreal_forced_aligner/version.py +montreal_forced_aligner/_version.py .Python env/ build/ @@ -81,3 +82,6 @@ target/ docs/source/api/ montreal_forced_aligner/_version.py +/docs/source/reference/generated/ + +docs/source/reference/multiprocessing/generated/ diff --git a/ci/mfa_publish.yml b/ci/mfa_publish.yml new file mode 100644 index 00000000..3de124c9 --- /dev/null +++ b/ci/mfa_publish.yml @@ -0,0 +1,24 @@ +name: mfa_publish +channels: + - conda-forge + - defaults +dependencies: + - numpy + - librosa + - tqdm + - requests + - colorama + - pyyaml + - kaldi + - sox + - openfst + - baumwelch + - ngram + - pynini + - setuptools + - setuptools_scm[toml] + - pip + - pip: + - build + - twine + - praatio >= 5.0 diff --git a/docs/source/_static/css/style.css b/docs/source/_static/css/style.css new file mode 100644 index 00000000..4caaf308 --- /dev/null +++ b/docs/source/_static/css/style.css @@ -0,0 +1,82 @@ +.wy-nav-content { + max-width: 1200px !important; +} +.wy-table-responsive table td { + white-space: normal !important; +} +.wy-table-responsive { + overflow: visible !important; +} +.wy-table-responsive table td, +.wy-table-responsive table th { + white-space: normal; +} + +:root { + --base-blue: 0, 53, 102; + --dark-blue: 0, 29, 61; + --light-blue: 14, 99, 179; + --base-yellow: 255, 195, 0; + --light-yellow: 255, 214, 10; + --pst-color-primary: var(--base-blue); + --pst-color-warning: var(--light-yellow); + 
--pst-color-info: var(--light-blue); + + --pst-color-link: var(--light-blue); + --pst-color-link-hover: var(--dark-blue); + + --pst-color-active-navigation: var(--dark-blue); + --pst-color-hover-navigation: var(--base-yellow); + + --pst-color-navbar-link: var(--base-blue); + --pst-color-navbar-link-hover: var(--pst-color-hover-navigation); + --pst-color-navbar-link-active: var(--pst-color-active-navigation); + + --pst-color-sidebar-link: var(--base-blue); + --pst-color-sidebar-caption: var(--base-blue); + --pst-color-sidebar-link-hover: var(--pst-color-hover-navigation); + --pst-color-sidebar-link-active: var(--pst-color-active-navigation); + + --pst-color-toc-link: var(--base-blue); + --pst-color-toc-link-hover: var(--pst-color-hover-navigation); + --pst-color-toc-link-active: var(--pst-color-active-navigation); +} +.btn-navigation{ + background-color: #0E63B3; + border-color: #0E63B3; +} +.btn-navigation:hover { + background-color: #FFC300; + border-color: #FFC300; + color: #000814; +} +.i-navigation{ + color: #003566; + padding: 20px; +} +.i-navigation:hover { + color: #FFC300; +} + +.rst-table-cell{ +width: 100%; +height: 100%; +display: inline-block; +text-align: center; + +} + +.supported { +background-color: #E9F6EC; +} + +.not-supported { +background-color: #FBEAEC; +} +#navbar-icon-links i.fa-github-square::before, i.fa-github-square::before { + color: inherit; +} + +dt:target { +background-color: #FFD60A; +} diff --git a/docs/source/_static/favicon.ico b/docs/source/_static/favicon.ico new file mode 100644 index 00000000..c91bac24 Binary files /dev/null and b/docs/source/_static/favicon.ico differ diff --git a/docs/source/_static/interrogate_badge.svg b/docs/source/_static/interrogate_badge.svg index 8c034161..2e1a4ff3 100644 --- a/docs/source/_static/interrogate_badge.svg +++ b/docs/source/_static/interrogate_badge.svg @@ -1,5 +1,5 @@ - interrogate: 99.3% + interrogate: 96.8% @@ -12,8 +12,8 @@ interrogate interrogate - 99.3% - 99.3% + 96.8% + 96.8% diff 
--git a/docs/source/_static/logo.svg b/docs/source/_static/logo.svg new file mode 100644 index 00000000..278fa2a5 --- /dev/null +++ b/docs/source/_static/logo.svg @@ -0,0 +1,89 @@ + + + + + + + + + + + + + + + diff --git a/docs/source/_static/logo_long.svg b/docs/source/_static/logo_long.svg new file mode 100644 index 00000000..5d76824d --- /dev/null +++ b/docs/source/_static/logo_long.svg @@ -0,0 +1,161 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/logo_stacked.svg b/docs/source/_static/logo_stacked.svg new file mode 100644 index 00000000..e3aa9ed4 --- /dev/null +++ b/docs/source/_static/logo_stacked.svg @@ -0,0 +1,181 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/style.css b/docs/source/_static/style.css deleted file mode 100644 index 8aa6c288..00000000 --- a/docs/source/_static/style.css +++ /dev/null @@ -1,3 +0,0 @@ -.wy-nav-content { - max-width: 1200px !important; -} diff --git a/docs/source/_templates/autosummary/attribute.rst b/docs/source/_templates/autosummary/attribute.rst new file mode 100644 index 00000000..2433c226 --- /dev/null +++ b/docs/source/_templates/autosummary/attribute.rst @@ -0,0 +1,9 @@ +:orphan: + +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autoattribute:: {{ objname }} + :type: diff --git a/docs/source/_templates/autosummary/base.rst b/docs/source/_templates/autosummary/base.rst new file mode 100644 index 00000000..e03319b8 --- /dev/null +++ b/docs/source/_templates/autosummary/base.rst @@ -0,0 +1,5 @@ +{{ objname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. 
auto{{ objtype }}:: {{ objname }} diff --git a/docs/source/_templates/autosummary/class.rst b/docs/source/_templates/autosummary/class.rst new file mode 100644 index 00000000..1389abf5 --- /dev/null +++ b/docs/source/_templates/autosummary/class.rst @@ -0,0 +1,10 @@ +{{ objname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :members: + :show-inheritance: + :no-inherited-members: + :no-special-members: diff --git a/docs/source/_templates/autosummary/function.rst b/docs/source/_templates/autosummary/function.rst new file mode 100644 index 00000000..f5676ee8 --- /dev/null +++ b/docs/source/_templates/autosummary/function.rst @@ -0,0 +1,6 @@ +{{ objname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autofunction:: {{ objname }} diff --git a/docs/source/_templates/autosummary/method.rst b/docs/source/_templates/autosummary/method.rst new file mode 100644 index 00000000..d940d3f8 --- /dev/null +++ b/docs/source/_templates/autosummary/method.rst @@ -0,0 +1,8 @@ +:orphan: + +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. automethod:: {{ objname }} diff --git a/docs/source/_templates/autosummary/property.rst b/docs/source/_templates/autosummary/property.rst new file mode 100644 index 00000000..184aa227 --- /dev/null +++ b/docs/source/_templates/autosummary/property.rst @@ -0,0 +1,8 @@ +:orphan: + +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. 
autoproperty:: {{ objname }} diff --git a/docs/source/_templates/class.rst b/docs/source/_templates/class_b.rst similarity index 100% rename from docs/source/_templates/class.rst rename to docs/source/_templates/class_b.rst diff --git a/docs/source/_templates/function.rst b/docs/source/_templates/function_b.rst similarity index 100% rename from docs/source/_templates/function.rst rename to docs/source/_templates/function_b.rst diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html deleted file mode 100644 index 3e44f4a3..00000000 --- a/docs/source/_templates/layout.html +++ /dev/null @@ -1,4 +0,0 @@ -{% extends "!layout.html" %} -{% block extrahead %} - -{% endblock %} diff --git a/docs/source/_templates/sidebar-nav-bs.html b/docs/source/_templates/sidebar-nav-bs.html new file mode 100644 index 00000000..9ec8c015 --- /dev/null +++ b/docs/source/_templates/sidebar-nav-bs.html @@ -0,0 +1,10 @@ +
+ {{ generate_nav_html("sidebar", + show_nav_level=theme_show_nav_level|int, + maxdepth=theme_navigation_depth|int, + collapse=theme_collapse_navigation|tobool, + includehidden=True, + titles_only=True) }} +
+ diff --git a/docs/source/_templates/version.html b/docs/source/_templates/version.html new file mode 100644 index 00000000..3d621022 --- /dev/null +++ b/docs/source/_templates/version.html @@ -0,0 +1,24 @@ +{# This will display the version of the docs as a badge + +Colors from: + +Wong, B. Points of view: Color blindness. +Nat Methods 8, 441 (2011). https://doi.org/10.1038/nmeth.1618 + +#} + + + {% if "dev" in version %} + {# orange for dev #E69F00 #} + + {% elif versionwarning %} + {# red for old #980F0F #} + + {% else %} + {# green for stable #009E73 #} + + {% endif %} + diff --git a/docs/source/aligning.rst b/docs/source/aligning.rst deleted file mode 100644 index 382578ce..00000000 --- a/docs/source/aligning.rst +++ /dev/null @@ -1,274 +0,0 @@ -.. _aligning: - -.. _`LibriSpeech corpus`: http://www.openslr.org/12/ - -******************* -Running the aligner -******************* - -.. _pretrained_alignment: - -Align using pretrained models ------------------------------ - -The Montreal Forced Aligner comes with :ref:`pretrained_acoustic` for several languages. - -Steps to align: - -1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that - MFA was installed in. - -2. Run the following command, substituting the arguments with your own paths: - - .. code-block:: bash - - mfa align corpus_directory dictionary_path acoustic_model_path output_directory - - -.. warning:: - - Aligned TextGrids will overwrite any existing TextGrids (with the same name as the wav files) in the output directory. - The aligner will throw an error if the corpus directory is specified as the output directory (to prevent overwriting - any input TextGrids). - -.. note:: - - ``acoustic_model_path`` can also be a language that has been pretrained by MFA developers. For instance, to use - the pretrained English model, first download it via :code:`mfa download acoustic english`. 
A list of available - acoustic models will be provided if you run :code:`mfa download acoustic`. See :ref:`pretrained_models` for more details. - -.. note:: - On Mac/Unix, to save time typing out the path, you - can drag a folder from Finder into Terminal and it will put the full - path to that folder into your command. - - On Windows, you can hold Shift and right-click on a folder/file. Select - "Copy as path..." and paste it into the command window. - -Once the aligner finishes, the resulting TextGrids will be in the -specified output directory. - -Options available: - -.. option:: -h - --help - - Display help message for the command - -.. option:: --config_path PATH - - Path to a YAML config file that will specify the alignment configuration. See - :ref:`align_config` for more details. - -.. option:: -s NUMBER - --speaker_characters NUMBER - - Number of characters to use to identify speakers; if not specified, - the aligner assumes that the directory name is the identifier for the - speaker. Additionally, it accepts the value ``prosodylab`` to use the second field of a ``_`` delimited file name, - following the convention of labelling production data in the ProsodyLab at McGill. - -.. option:: -a PATH - --audio_directory PATH - - Path to directory containing audio files if they are in a different root directory than the transcription files. - Useful when you have multiple steps of a transcription/alignment pipeline - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for aligning, default is ``~/Documents/MFA`` - -.. option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to align faster - -.. option:: -v - --verbose - - The aligner will print out more information if present - -.. option:: -d - --debug - - The aligner will run in debug mode - -.. 
option:: -c - --clean - - Forces removal of temporary files in ``~/Documents/MFA`` - prior to aligning. This is good to use when aligning a new dataset, - but it shares a name with a previously aligned dataset. Cleaning automatically happens if the previous alignment - run had an error. - -.. _trained_alignment: - -Align using only the data set ------------------------------ - -Steps to align: - -1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that - MFA was installed in. - -2. Run the following command, substituting the arguments with your own paths: - - .. code-block:: bash - - mfa train corpus_directory dictionary_path output_directory - -.. warning:: - - Aligned TextGrids will overwrite any existing TextGrids (with the same name as the wav files) in the output directory. - The aligner will throw an error if the corpus directory is specified as the output directory (to prevent overwriting - any input TextGrids). - - -Once the aligner finishes, the resulting TextGrids will be in the -specified output directory. Training can take several hours for large datasets. - -Options available: - -.. option:: -h - --help - - Display help message for the command - -.. option:: --config_path PATH - - Path to a YAML config file that will specify either the alignment options or the training configuration. see - :ref:`training_config` for more details. - -.. option:: -o PATH - --output_model_path PATH - - Path to a zip file to save the results' acoustic models - from training to use in future aligning - -.. option:: -a PATH - --audio_directory PATH - - Path to directory containing audio files if they are in a different root directory than the transcription files. - Useful when you have multiple steps of a transcription/alignment pipeline - -.. 
option:: -s NUMBER - --speaker_characters NUMBER - - Number of characters to use to identify speakers; if not specified, - the aligner assumes that the directory name is the identifier for the - speaker. Additionally, it accepts the value ``prosodylab`` to use the second field of a ``_`` delimited file name, - following the convention of labelling production data in the ProsodyLab at McGill. - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for aligning, default is ``~/Documents/MFA`` - -.. option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to align faster - -.. option:: -v - --verbose - - The aligner will print out more information if present - -.. option:: -d - --debug - - The aligner will run in debug mode - -.. option:: -c - --clean - - Forces removal of temporary files in ``~/Documents/MFA`` - prior to aligning. This is good to use when aligning a new dataset, - but it shares a name with a previously aligned dataset. Cleaning automatically happens if the previous alignment - run had an error. - -.. _adapting_model: - -Adapt pretrained models to new dataset --------------------------------------- - -A recent 2.0 functionality for MFA is to adapt pretrained models to a new dataset. MFA will first align the dataset using the pretrained model, and then perform a couple of rounds of speaker-adaptation training. - -Steps to align: - -1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that - MFA was installed in. - -2. Run the following command, substituting the arguments with your own paths: - - .. code-block:: bash - - mfa adapt corpus_directory dictionary_path acoustic_model_path output_model_path - -3. Using the new model, run the alignment - - .. code-block:: bash - - mfa align corpus_directory dictionary_path output_model_path output_path - - -.. 
note:: - - ``acoustic_model_path`` can also be a language that has been pretrained by MFA developers. For instance, to use - the pretrained English model, first download it via :code:`mfa download acoustic english`. A list of available - acoustic models will be provided if you run :code:`mfa download acoustic`. See :ref:`pretrained_models` for more details. - -Once the aligner finishes, the resulting TextGrids will be in the -specified output directory. - -Options available: - -.. option:: -h - --help - - Display help message for the command - -.. option:: --config_path PATH - - Path to a YAML config file that will specify the alignment configuration. See - :ref:`align_config` for more details. - -.. option:: -s NUMBER - --speaker_characters NUMBER - - Number of characters to use to identify speakers; if not specified, - the aligner assumes that the directory name is the identifier for the - speaker. Additionally, it accepts the value ``prosodylab`` to use the second field of a ``_`` delimited file name, - following the convention of labelling production data in the ProsodyLab at McGill. - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for aligning, default is ``~/Documents/MFA`` - -.. option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to align faster - -.. option:: -v - --verbose - - The aligner will print out more information if present - -.. option:: -d - --debug - - The aligner will run in debug mode - -.. option:: -c - --clean - - Forces removal of temporary files in ``~/Documents/MFA`` - prior to aligning. This is good to use when aligning a new dataset, - but it shares a name with a previously aligned dataset. Cleaning automatically happens if the previous alignment - run had an error. 
diff --git a/docs/source/api_reference/api_aligner.rst b/docs/source/api_reference/api_aligner.rst deleted file mode 100644 index 750104ec..00000000 --- a/docs/source/api_reference/api_aligner.rst +++ /dev/null @@ -1,13 +0,0 @@ -.. _api_aligner_reference: - -*********** -Aligner API -*********** - -.. automodapi:: montreal_forced_aligner.aligner.base - -.. automodapi:: montreal_forced_aligner.aligner.pretrained - -.. automodapi:: montreal_forced_aligner.aligner.trainable - -.. automodapi:: montreal_forced_aligner.aligner.adapting diff --git a/docs/source/api_reference/api_command_line.rst b/docs/source/api_reference/api_command_line.rst deleted file mode 100644 index 7362b950..00000000 --- a/docs/source/api_reference/api_command_line.rst +++ /dev/null @@ -1,37 +0,0 @@ -.. _api_command_line_reference: - -**************** -Command line API -**************** - -.. automodapi:: montreal_forced_aligner.command_line.adapt - -.. automodapi:: montreal_forced_aligner.command_line.align - -.. automodapi:: montreal_forced_aligner.command_line.anchor - -.. automodapi:: montreal_forced_aligner.command_line.classify_speakers - -.. automodapi:: montreal_forced_aligner.command_line.classify_speakers - -.. automodapi:: montreal_forced_aligner.command_line.create_segments - -.. automodapi:: montreal_forced_aligner.command_line.g2p - -.. automodapi:: montreal_forced_aligner.command_line.mfa - -.. automodapi:: montreal_forced_aligner.command_line.model - -.. automodapi:: montreal_forced_aligner.command_line.train_acoustic_model - -.. automodapi:: montreal_forced_aligner.command_line.train_dictionary - -.. automodapi:: montreal_forced_aligner.command_line.train_g2p - -.. automodapi:: montreal_forced_aligner.command_line.train_lm - -.. automodapi:: montreal_forced_aligner.command_line.transcribe - -.. automodapi:: montreal_forced_aligner.command_line.utils - -.. 
automodapi:: montreal_forced_aligner.command_line.validate diff --git a/docs/source/api_reference/api_config.rst b/docs/source/api_reference/api_config.rst deleted file mode 100644 index a81dc4b5..00000000 --- a/docs/source/api_reference/api_config.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. _api_config_reference: - -********** -Config API -********** - -.. automodapi:: montreal_forced_aligner.config.align_config - -.. automodapi:: montreal_forced_aligner.config.base_config - -.. automodapi:: montreal_forced_aligner.config.command_config - -.. automodapi:: montreal_forced_aligner.config.g2p_config - -.. automodapi:: montreal_forced_aligner.config.segmentation_config - -.. automodapi:: montreal_forced_aligner.config.speaker_classification_config - -.. automodapi:: montreal_forced_aligner.config.train_config - -.. automodapi:: montreal_forced_aligner.config.train_g2p_config - -.. automodapi:: montreal_forced_aligner.config.train_lm_config - -.. automodapi:: montreal_forced_aligner.config.transcribe_config diff --git a/docs/source/api_reference/api_corpus.rst b/docs/source/api_reference/api_corpus.rst deleted file mode 100644 index 0015e63b..00000000 --- a/docs/source/api_reference/api_corpus.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. _api_corpus_reference: - -********** -Corpus API -********** - -.. automodapi:: montreal_forced_aligner.corpus.base - -.. automodapi:: montreal_forced_aligner.corpus.classes - -.. automodapi:: montreal_forced_aligner.corpus.helper diff --git a/docs/source/api_reference/api_g2p.rst b/docs/source/api_reference/api_g2p.rst deleted file mode 100644 index 7301a749..00000000 --- a/docs/source/api_reference/api_g2p.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. _api_g2p_reference: - -******* -G2P API -******* - -.. automodapi:: montreal_forced_aligner.g2p.trainer - -.. 
automodapi:: montreal_forced_aligner.g2p.generator diff --git a/docs/source/api_reference/api_index.rst b/docs/source/api_reference/api_index.rst deleted file mode 100644 index 01bafcc3..00000000 --- a/docs/source/api_reference/api_index.rst +++ /dev/null @@ -1,81 +0,0 @@ -.. _api_reference: - -************* -API Reference -************* - -.. toctree:: - :maxdepth: 1 - - api_aligner.rst - api_command_line.rst - api_config.rst - api_corpus.rst - api_g2p.rst - api_lm.rst - api_multiprocessing.rst - api_trainers.rst - - -.. _speaker_classifier_api: -Speaker Classifier API -====================== - -.. automodapi:: montreal_forced_aligner.speaker_classifier - -.. _transcriber_api: -Transcriber API -=============== - -.. automodapi:: montreal_forced_aligner.transcriber - -.. _segmenter_api: -Segmenter API -============= - -.. automodapi:: montreal_forced_aligner.segmenter - -.. _validator_api: -Validator API -============= - -.. automodapi:: montreal_forced_aligner.validator - - -.. _dictionary_api: -Dictionary API -============== - -.. automodapi:: montreal_forced_aligner.dictionary - - -.. _models_api: -Models API -========== - -.. automodapi:: montreal_forced_aligner.models - - -.. _textgrid_api: -TextGrid API -============ - -.. automodapi:: montreal_forced_aligner.textgrid - -.. _utils_api: - -Utils API -========= -.. automodapi:: montreal_forced_aligner.utils - -.. _helper_api: -Helper API -========== - -.. automodapi:: montreal_forced_aligner.helper - -.. _exceptions_api: -Exceptions API -============== - -.. automodapi:: montreal_forced_aligner.exceptions diff --git a/docs/source/api_reference/api_lm.rst b/docs/source/api_reference/api_lm.rst deleted file mode 100644 index 56bed521..00000000 --- a/docs/source/api_reference/api_lm.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _api_lm_reference: - -****************** -Language Model API -****************** - -.. 
automodapi:: montreal_forced_aligner.lm.trainer diff --git a/docs/source/api_reference/api_multiprocessing.rst b/docs/source/api_reference/api_multiprocessing.rst deleted file mode 100644 index a3ad0b4a..00000000 --- a/docs/source/api_reference/api_multiprocessing.rst +++ /dev/null @@ -1,21 +0,0 @@ -.. _api_multiprocessing_reference: - -******************* -Multiprocessing API -******************* - -.. automodapi:: montreal_forced_aligner.multiprocessing.alignment - -.. automodapi:: montreal_forced_aligner.multiprocessing.classes - -.. automodapi:: montreal_forced_aligner.multiprocessing.corpus - -.. automodapi:: montreal_forced_aligner.multiprocessing.features - -.. automodapi:: montreal_forced_aligner.multiprocessing.helper - -.. automodapi:: montreal_forced_aligner.multiprocessing.ivector - -.. automodapi:: montreal_forced_aligner.multiprocessing.pronunciations - -.. automodapi:: montreal_forced_aligner.multiprocessing.transcription diff --git a/docs/source/api_reference/api_trainers.rst b/docs/source/api_reference/api_trainers.rst deleted file mode 100644 index da4c54bb..00000000 --- a/docs/source/api_reference/api_trainers.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. _api_trainers_reference: - -************ -Trainers API -************ - -.. automodapi:: montreal_forced_aligner.trainers.base - -.. automodapi:: montreal_forced_aligner.trainers.monophone - -.. automodapi:: montreal_forced_aligner.trainers.triphone - -.. automodapi:: montreal_forced_aligner.trainers.lda - -.. automodapi:: montreal_forced_aligner.trainers.sat - -.. automodapi:: montreal_forced_aligner.trainers.ivector_extractor diff --git a/docs/source/changelog/changelog_2.0.rst b/docs/source/changelog/changelog_2.0.rst index 80efc45d..f843cbba 100644 --- a/docs/source/changelog/changelog_2.0.rst +++ b/docs/source/changelog/changelog_2.0.rst @@ -1,7 +1,3 @@ -.. _`PR #194`: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/pull/194 -.. 
_`PR #235`: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/pull/235 -.. _`PR #288`: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/pull/288 -.. _`PR #337`: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/pull/337 .. _changelog_2.0: @@ -14,18 +10,24 @@ Beta releases ============= +2.0.0b5 +------- + +- Documentation refresh! Docs now use the :xref:`pydata_sphinx_theme` and should have a better landing page and flow, as well as up to date API reference +- Some refactoring to use type hinting and abstract class interfaces (still a work in progress) + + 2.0.0b4 ------- - Massive refactor to a proper class-based API for interacting with MFA corpora - Sorry, I really do hope this is the last big refactor of 2.0 - - :class:`~montreal_forced_aligner.corpus.classes.Speakers`, :class:`~montreal_forced_aligner.corpus.classes.Files`, and :class:`~montreal_forced_aligner.corpus.classes.Utterances` have dedicated classes rather than having their information split across dictionaries mimicking Kaldi files, so they should be more useful for interacting with outside of MFA - - Added :class:`~montreal_forced_aligner.multiprocessing.classes.Job` class as well to make it easier to generate and keep track of information about different processes + - :class:`~montreal_forced_aligner.corpus.Speaker`, :class:`~montreal_forced_aligner.corpus.File`, and :class:`~montreal_forced_aligner.corpus.Utterance` have dedicated classes rather than having their information split across dictionaries mimicking Kaldi files, so they should be more useful for interacting with outside of MFA + - Added :class:`~montreal_forced_aligner.multiprocessing.Job` class as well to make it easier to generate and keep track of information about different processes - Updated installation style to be more dependent on conda-forge packages - - Kaldi is now on conda-forge! 
|:tada:| - - MFA should follow suit soon, making installation a lot simpler + - Kaldi and MFA are now on conda-forge! |:tada:| - Added a :code:`mfa model` command for inspecting, listing, downloading, and saving pretrained models, see :ref:`pretrained_models` for more information. - Fixed a bug where saving command history with errors would throw an error of its own @@ -44,7 +46,7 @@ Beta releases 2.0.0b1 ------- -- Fixed bug in training (`PR #337`_) +- Fixed bug in training (:mfa_pr:`337`) - Fixed bug when using Ctrl-C in loading 2.0.0b0 @@ -140,7 +142,7 @@ Beta release! 2.0.0a16 -------- -- Changed how punctuation is stripped from beginning/end of words (`PR #288`_) +- Changed how punctuation is stripped from beginning/end of words (:mfa_pr:`288`) - Added more logging for alignment (validating acoustic models and generating overall log-likelihood of the alignment) - Changed subsetting features prior to initializing monophone trainer to prevent erroneous error detection - Fixed parsing of boolean arguments on command line to be passed to aligners @@ -198,8 +200,8 @@ Beta release! - Upgraded dependency of Pynini version to 2.1.4, please update package versions via :code:`conda upgrade -c conda-forge openfst pynini ngram baumwelch` if you had previously installed MFA. - Allowed for splitting clitics on multiple apostrophes -- Fixed bug in checking for brackets in G2P (`PR #194`_) -- Updated Annotator utility (:ref:`annotator` for more details) to be generally more usable for TextGrid use cases and +- Fixed bug in checking for brackets in G2P (:mfa_pr:`235`) +- Updated Annotator utility (:ref:`anchor` for more details) to be generally more usable for TextGrid use cases and adjusting segments and their transcriptions - Improved handling of stereo files with TextGrids so that MFA doesn't need to generate temporary files for each channel @@ -238,4 +240,4 @@ Currently under development with major changes, see :ref:`whats_new_2_0`. 
messages for files that do not meet Kaldi's input requirements - Removed multiprocessing from speaker adaptation, as the executables use multiple threads leading to a bottleneck in performance. This change should result in faster speaker adaptation. -- Optimized corpus parsing algorithm to be O(n log n) instead of O(n^2) (`PR #194`_) +- Optimized corpus parsing algorithm to be O(n log n) instead of O(n^2) (:mfa_pr:`194`) diff --git a/docs/source/changelog/changelog_index.rst b/docs/source/changelog/changelog_index.rst deleted file mode 100644 index 76a427a9..00000000 --- a/docs/source/changelog/changelog_index.rst +++ /dev/null @@ -1,12 +0,0 @@ - -.. _changelog: - -********* -Changelog -********* - -.. toctree:: - :maxdepth: 3 - - changelog_2.0.rst - changelog_1.0.rst diff --git a/docs/source/news.rst b/docs/source/changelog/index.rst similarity index 80% rename from docs/source/news.rst rename to docs/source/changelog/index.rst index 33dac524..9ad58a96 100644 --- a/docs/source/news.rst +++ b/docs/source/changelog/index.rst @@ -11,7 +11,7 @@ What's new in 2.0 ================= Version 2.0 of the Montreal Forced Aligner represents several overhauls to installation and management -of commands. +of commands. See :ref:`changelog_2.0` for more specific changes. .. _2_0_installation_update: @@ -25,13 +25,10 @@ customize the runtime for different environments and versions. 
Moving forward, MFA will: -- Use standard Python packaging, i.e., :code:`pip install montreal-forced-aligner` or - :code:`python setup.py install` from the cloned repo -- Allow for downloading third party executables for the particular system, but also allow for picking up relevant executables - that were built on the system, increasing flexibility of use +- Use standard Python packaging and be available for import in Python +- Rely on :xref:`conda_forge` for handling dependencies - Switch to using Pynini instead of Phonetisaurus for G2P purposes, which should ease distribution and installation -- Have a :ref:`2_0_unified_cli` with subcommands for each command line function that will be available upon installation, - as well as exposing the full MFA api for use in other Python scripts +- Have a :ref:`2_0_unified_cli` with subcommands for each command line function that will be available upon installation, as well as exposing the full MFA api for use in other Python scripts - Allow for faster bug fixes that do not require repackaging and releasing frozen binaries across all platforms .. _2_0_unified_cli: @@ -44,13 +41,13 @@ more functionality has been added with G2P models, validation, managing pretrain different types of models, it has become unwieldy to have separate commands for each. As such, going forward: -- There will be a single :code:`mfa` command line utility that will be available once it is installed via pip. -- Running :code:`mfa -h` will list the subcommands that can be run, along with their descriptions. +- There will be a single :code:`mfa` command line utility that will be available once it is installed via pip/conda. +- Running :code:`mfa -h` will list the subcommands that can be run, along with their descriptions, see :ref:`commands` for details. -.. _2_0_annotator_gui: +.. 
_2_0_anchor_gui: -Annotator GUI -------------- +Anchor annotator GUI +-------------------- Added a basic annotation GUI with features for: @@ -60,7 +57,7 @@ Added a basic annotation GUI with features for: - Updating/adding dictionary entries - Updating transcriptions -See also :ref:`annotator` for more information on using the annotation GUI. +See also :ref:`anchor` for more information on using the annotation GUI. .. _2.0_transcription: @@ -72,30 +69,25 @@ MFA now supports: - Transcribing a corpus of sound files using an acoustic model, dictionary, and language model, see :ref:`transcribing` for more information. - Training language models from corpora that have text transcriptions, see :ref:`training_lm` for more information -- Training pronunciation probability dictionaries from alignments, for use in alignment or transcription, see - :ref:`training_dictionary` for more information +- Training pronunciation probability dictionaries from alignments, for use in alignment or transcription, see :ref:`training_dictionary` for more information .. _whats_new_1_1: What's new in 1.1 ================= -Version 1.1 of the Montreal Forced Aligner represents several overhauls to the workflow and ability to customize model -training and alignment. +Version 1.1 of the Montreal Forced Aligner represents several overhauls to the workflow and ability to customize model training and alignment. .. attention:: - Please note that development of 1.1 has been bundled into 2.0 as part of larger infrastructure changes - in developing MFA (@mmcauliffe no longer being affiliated with an academic institution, lack of access to Mac - OS for building third party executables, etc) + With the development of 2.0, the below sections are out of date. .. _1_1_training_configurations: Training configurations ----------------------- -A major new feature is the ability to specify and customize configuration for training and alignment. 
Prior to 1.1, -the training procedure for new models was: +A major new feature is the ability to specify and customize configuration for training and alignment. Prior to 1.1, the training procedure for new models was: - Monophone training - Triphone training @@ -110,7 +102,6 @@ In 1.1, the following training procedures are available: - LDA+MLLT training - Speaker-adapted triphone training - Ivector extractor training -- Nnet2 training Each of these blocks (as well as their inclusion) can be customized through a YAML config file. In addition to training parameters, global alignment and feature configuration parameters are available. See :ref:`configuration` for more details. @@ -150,3 +141,11 @@ The functionality of :code:`mfa_generate_dictionary` has been expanded. files with an associated sound file - When a text file is specified as the input path, all words in the text file will be run through G2P, allowing for a simpler pipeline for generating transcriptions from out of vocabulary items + + +.. toctree:: + :maxdepth: 1 + :hidden: + + changelog_2.0.rst + changelog_1.0.rst diff --git a/docs/source/classify_speakers.rst b/docs/source/classify_speakers.rst deleted file mode 100644 index 68b9b19d..00000000 --- a/docs/source/classify_speakers.rst +++ /dev/null @@ -1,69 +0,0 @@ -.. _classify_speakers: - -********************** -Speaker classification -********************** - -The Montreal Forced Aligner can use trained ivector models (see :ref:`train_ivector` for more information about training -these models) to classify or cluster utterances according to speakers. - -Steps to classify speakers: - - -1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that - MFA was installed in. -2. Run the following command, substituting the arguments with your own paths: - - .. 
code-block:: bash - - mfa classify_speakers corpus_directory ivector_extractor_path output_directory - -If the input uses TextGrids, the output TextGrids will have utterances sorted into tiers by each identified speaker. At -the moment, there is no way to retrain the classifier based on new data. - -If the input corpus directory does not have TextGrids associated with them, then the speaker classifier will output -speaker directories with a text file that contains all the utterances that were classified. - -Options available: - -.. option:: -h - --help - - Display help message for the command - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for aligning, default is ``~/Documents/MFA`` - -.. option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to process faster - -.. option:: -s NUMBER - --num_speakers NUMBER - - Number of speakers to return. If ``--cluster`` is present, this specifies the number of clusters. Otherwise, - MFA will sort speakers according to the first pass classification and then takes the top X speakers, and reclassify - the utterances to only use those speakers. - -.. option:: --cluster - - MFA will perform clustering of utterance ivectors into the number of speakers specified by ``--num_speakers`` - -.. option:: -v - --verbose - - The aligner will print out more information if present - -.. option:: -d - --debug - - The aligner will run in debug mode - -.. option:: -c - --clean - - Forces removal of temporary files in ``~/Documents/MFA`` diff --git a/docs/source/conf.py b/docs/source/conf.py index 8032b071..956650da 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,6 +19,7 @@ # import os import sys +from datetime import date sys.path.insert(0, os.path.abspath("../../")) import montreal_forced_aligner # noqa @@ -32,22 +33,121 @@ # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + extensions = [ "sphinx.ext.autodoc", - "sphinx.ext.mathjax", - "sphinx.ext.ifconfig", - "sphinx.ext.viewcode", "sphinx.ext.autosummary", - "sphinx_automodapi.automodapi", - "sphinx_automodapi.smart_resolver", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", "sphinx.ext.intersphinx", + "sphinx.ext.extlinks", + "external_links", + # "numpydoc", + "sphinx.ext.napoleon", + "sphinx_panels", + "sphinx.ext.viewcode", + "sphinxcontrib.autoprogram", "sphinxemoji.sphinxemoji", + # "sphinx_autodoc_typehints", ] - +panels_add_bootstrap_css = False intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} + +extlinks = { + "mfa_pr": ("https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/pull/%s", "PR #%s"), +} + +xref_links = { + "mfa_mailing_list": ("MFA mailing list", "https://groups.google.com/g/mfa-users"), + "mfa_github": ("MFA GitHub Repo", "https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner"), + "mfa_github_issues": ( + "MFA GitHub Issues", + "https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/issues", + ), + "memcauliffe.com": ("Michael McAuliffe's blog", "https://memcauliffe.com"), + "@wavable": ("@wavable", "https://twitter.com/wavable"), + "sonderegger": ("Morgan Sonderegger", "http://people.linguistics.mcgill.ca/~morgan/"), + "wagner": ("Michael Wagner", "https://prosodylab.org/"), + "coles": ("Arlie Coles", "https://a-coles.github.io/"), + "stengel-eskin": ("Elias Stengel-Eskin", "https://esteng.github.io/"), + "socolof": ("Michaela Socolof", "https://mcqll.org/people/socolof.michaela/"), + "mihuc": ("Sarah Mihuc", "https://www.cs.mcgill.ca/~smihuc/"), + "wsl": ("Windows Subsystem for Linux", "https://docs.microsoft.com/en-us/windows/wsl/install"), + "kaldi": ("Kaldi", "http://kaldi-asr.org/"), + "kaldi_github": ("Kaldi GitHub", "https://github.com/kaldi-asr/kaldi"), + "htk": ("HTK", 
"http://htk.eng.cam.ac.uk/"), + "phonetisaurus": ("Phonetisaurus", "https://github.com/AdolfVonKleist/Phonetisaurus"), + "pynini": ("Pynini", "https://www.openfst.org/twiki/bin/view/GRM/Pynini"), + "prosodylab_aligner": ("Prosodylab-aligner", "http://prosodylab.org/tools/aligner/"), + "p2fa": ( + "Penn Phonetics Forced Aligner", + "https://www.ling.upenn.edu/phonetics/old_website_2015/p2fa/", + ), + "fave": ("FAVE-align", "https://github.com/JoFrhwld/FAVE/wiki/FAVE-align"), + "maus": ("MAUS", "http://www.bas.uni-muenchen.de/Bas/BasMAUS.html"), + "praat": ("Praat", "http://www.fon.hum.uva.nl/praat/"), + "easy_align": ("EasyAlign", "http://latlcui.unige.ch/phonetique/easyalign.php"), + "gentle": ("Gentle", "https://lowerquality.com/gentle/"), + "chodroff_kaldi": ("Kaldi tutorial", "https://eleanorchodroff.com/tutorial/kaldi/index.html"), + "chodroff_phonetics": ( + "Corpus Phonetics Tutorial", + "https://eleanorchodroff.com/tutorial/intro.html", + ), + "coqui": ("Coqui", "https://coqui.ai/"), + "conda_installation": ( + "Conda installation", + "https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html", + ), + "conda_forge": ("Conda Forge", "https://conda-forge.org/"), + "pydata_sphinx_theme": ( + "Pydata Sphinx Theme", + "https://pydata-sphinx-theme.readthedocs.io/en/latest/", + ), + "mfa_reorg_scripts": ( + "MFA-reorganization-scripts repository", + "https://github.com/MontrealCorpusTools/MFA-reorganization-scripts", + ), +} + +# ----------------------------------------------------------------------------- +# Autosummary +# ----------------------------------------------------------------------------- + + +autosummary_generate = True +autodoc_typehints = "none" +# autodoc_typehints_description_target = 'documented' +# autoclass_content = 'both' +autodoc_docstring_signature = True +autodoc_type_aliases = { + "MultispeakerDictionary": "montreal_forced_aligner.dictionary.MultispeakerDictionary", + "Trainer": "montreal_forced_aligner.abc.Trainer", + 
"Aligner": "montreal_forced_aligner.abc.Aligner", + "DictionaryData": "montreal_forced_aligner.dictionary.DictionaryData", + "Utterance": "montreal_forced_aligner.corpus.Utterance", + "File": "montreal_forced_aligner.corpus.File", + "FeatureConfig": "montreal_forced_aligner.config.FeatureConfig", + "multiprocessing.context.Process": "multiprocessing.Process", + "mp.Process": "multiprocessing.Process", + "Speaker": "montreal_forced_aligner.corpus.Speaker", +} + +napoleon_preprocess_types = False +napoleon_attr_annotations = False +napoleon_use_param = True +napoleon_type_aliases = { + "Labels": "List[str]", +} +typehints_fully_qualified = False +# numpydoc_xref_param_type = True +# numpydoc_show_inherited_class_members = False numpydoc_show_class_members = False -numpydoc_class_members_toctree = False -autosummary_imported_members = False +# ----------------------------------------------------------------------------- +# Autodoc +# ----------------------------------------------------------------------------- + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -66,7 +166,7 @@ # General information about the project. project = "Montreal Forced Aligner" -copyright = "2018, Montreal Corpus Tools" +copyright = f"2018-{date.today().year}, Montreal Corpus Tools" author = "Montreal Corpus Tools" # The version info for the project you're documenting, acts as replacement for @@ -102,11 +202,11 @@ # The reST default role (used for this markup: `text`) to use for all # documents. # -# default_role = None +default_role = "autolink" # If true, '()' will be appended to :func: etc. cross-reference text. # -# add_function_parentheses = True +add_function_parentheses = False # If true, the current module name will be prepended to all description # unit titles (such as .. function::). 
@@ -118,6 +218,27 @@ # # show_authors = False +# nitpicky = True +nitpick_ignore = [ + ("py:class", "optional"), + ("py:class", "callable"), + ("py:class", "CtmType"), + ("py:class", "ReversedMappingType"), + ("py:class", "WordsType"), + ("py:class", "MappingType"), + ("py:class", "TextIO"), + ("py:class", "SegmentationType"), + ("py:class", "CtmErrorDict"), + ("py:class", "Labels"), + ("py:class", "ScpType"), + ("py:class", "multiprocessing.Value"), + ("py:class", "praatio.utilities.constants.Interval"), + ("py:class", "CorpusMappingType"), + ("py:class", "DictionaryEntryType"), + ("py:class", "montreal_forced_aligner.abc.MetaDict"), + ("py:class", "multiprocessing.context.Process"), +] + # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" @@ -136,15 +257,46 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "sphinx_rtd_theme" +html_theme = "pydata_sphinx_theme" + +html_logo = "_static/logo_long.svg" +html_favicon = "_static/favicon.ico" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. 
# -# html_theme_options = { -# 'page_width': 'auto', -# } +html_theme_options = { + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner", + "icon": "fab fa-github", + }, + { + "name": "PyPI", + "url": "https://pypi.org/project/Montreal-Forced-Aligner/", + "icon": "fas fa-box", + }, + { + "name": "Conda Forge", + "url": "https://anaconda.org/conda-forge/montreal-forced-aligner", + "icon": "fas fa-toolbox", + }, + ], + "google_analytics_id": "UA-73068199-4", + "show_nav_level": 1, + "navigation_depth": 4, + "show_toc_level": 2, + "collapse_navigation": False, +} +html_context = { + # "github_url": "https://github.com", # or your GitHub Enterprise instance + "github_user": "MontrealCorpusTools", + "github_repo": "Montreal-Forced-Aligner", + "github_version": "main", + "doc_path": "docs/source", +} # Add any paths that contain custom themes here, relative to this directory. # html_theme_path = [] @@ -173,6 +325,9 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] +html_css_files = [ + "css/style.css", +] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied @@ -194,7 +349,7 @@ # Custom sidebar templates, maps document names to template names. # # html_sidebars = { '**': ['globaltoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html'], } - +html_sidebars = {"**": ["search-field.html", "sidebar-nav-bs.html", "sidebar-ethical-ads.html"]} # Additional templates that should be rendered to pages, maps page names to # template names. # diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst deleted file mode 100644 index 953369aa..00000000 --- a/docs/source/configuration.rst +++ /dev/null @@ -1,101 +0,0 @@ - -.. 
_configuration: - -************* -Configuration -************* - -Global configuration for MFA can be updated via the ``mfa configure`` subcommand. Once the command is called with a flag, it will set a default value for any future runs (though, you can overwrite most settings when you call other commands). - -Options available: - -.. option:: -t - --temp_directory - - Set the default temporary directory - -.. option:: -j - --num_jobs - - Set the number of processes to use by default - -.. option:: --always_clean - - Always remove files from previous runs by default - -.. option:: --never_clean - - Don't remove files from previous runs by default - -.. option:: --always_verbose - - Default to verbose output (outputs debug messages) - -.. option:: --never_verbose - - Default to non-verbose output - - Default to verbose output (outputs debug messages) - -.. option:: --always_debug - - Default to running debugging steps - -.. option:: --never_debug - - Default to not running debugging steps - -.. option:: --always_overwrite - - Always overwrite output files - -.. option:: --never_overwrite - - Never overwrite output files (if file already exists, the output will be saved in the temp directory) - -.. option:: --disable_mp - - Disable all multiprocessing (not recommended as it will usually increase processing times) - -.. option:: --enable_mp - - Enable multiprocessing (recommended and enabled by default) - -.. option:: --disable_textgrid_cleanup - - Disable postprocessing of TextGrids that cleans up silences and recombines compound words and clitics - -.. option:: --enable_textgrid_cleanup - - Enable postprocessing of TextGrids that cleans up silences and recombines compound words and clitics - -.. option:: --disable_terminal_colors - - Turn off colored text in output - -.. option:: --enable_terminal_colors - - Turn on colored text in output - -.. option:: --terminal_width - - Set width of terminal output, defaults to 120 characters - -.. 
option:: --blas_num_threads - - Number of threads to use for BLAS libraries, 1 is recommended and the default, due to how much MFA relies on multiprocessing. - -.. option:: -h - --help - - Display help message for the command - -.. toctree:: - :maxdepth: 2 - - configuration_align.rst - configuration_transcription.rst - configuration_lm.rst - configuration_segment.rst - configuration_ivector.rst - configuration_g2p.rst diff --git a/docs/source/create_segments.rst b/docs/source/create_segments.rst deleted file mode 100644 index 88f2ed02..00000000 --- a/docs/source/create_segments.rst +++ /dev/null @@ -1,65 +0,0 @@ -.. _create_segments: - -*************** -Create segments -*************** - -The Montreal Forced Aligner can use Voice Activity Detection (VAD) capabilities from Kaldi to generate segments from -a longer sound file. - -Steps to create segments: - - -1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that - MFA was installed in. -2. Run the following command, substituting the arguments with your own paths: - - .. code-block:: bash - - mfa create_segments corpus_directory output_directory - - -.. note:: - - The default configuration for VAD uses configuration values based on quiet speech. The algorithm is based on energy, - so if your recordings are more noisy, you may need to adjust the configuration. See :ref:`configuration_segments` - for more information on changing these parameters. - - -Options available: - -.. option:: -h - --help - - Display help message for the command - -.. option:: --config_path PATH - - Path to a YAML config file that will specify the alignment configuration. See - :ref:`align_config` for more details. - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for aligning, default is ``~/Documents/MFA`` - -.. 
option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to process faster - -.. option:: -v - --verbose - - The aligner will print out more information if present - -.. option:: -d - --debug - - The aligner will run in debug mode - -.. option:: -c - --clean - - Forces removal of temporary files in ``~/Documents/MFA`` diff --git a/docs/source/data_format.rst b/docs/source/data_format.rst deleted file mode 100644 index 4f51886b..00000000 --- a/docs/source/data_format.rst +++ /dev/null @@ -1,151 +0,0 @@ -.. _data_format: - -************ -Data formats -************ - -.. _prosodylab_format: - -Prosodylab-aligner format -========================= - -Every audio file you are aligning must have a corresponding .lab -file containing the text transcription of that audio file. The audio and -transcription files must have the same name. For example, if you have ``givrep_1027_2_1.wav``, -its transcription should be in ``givrep_1027_2_1.lab`` (which is just a -text file with the .lab extension). - -.. note:: If you have transcriptions in a - tab-separated text file (or an Excel file, which can be saved as one), - you can generate .lab files from it using the relabel function of relabel_clean.py. - The relabel_clean.py script is currently in the prosodylab.alignertools repository on GitHub. - -If no ``.lab`` file is found, then the aligner will look for any matching ``.txt`` files and use those. - -In terms of directory structure, the default configuration assumes that -files are separated into subdirectories based on their speaker (with one -speaker per file). - -An alternative way to specify which speaker says which -segment is to use the ``-s`` flag with some number of characters of the file name as the speaker identifier. - -The output from aligning this format of data will be TextGrids that have a tier -for words and a tier for phones. - -.. 
_textgrid_format: - -TextGrid format -=============== - -The other main format that is supported is long sound files accompanied -by TextGrids that specify orthographic transcriptions for short intervals -of speech. - - - .. figure:: _static/librispeech_textgrid.png - :align: center - :alt: Image cannot be displayed in your browser - -If the ``-s`` flag is specified, the tier names will not be used as speaker names, and instead the first X characters -specified by the flag will be used as the speaker name. - -By default, each tier corresponds to a speaker (speaker "237" in the above example), so it is possible to -align speech for multiple speakers per sound file using this format. - - - .. figure:: _static/multiple_speakers_textgrid.png - :align: center - :alt: Image cannot be displayed in your browser - -Stereo files are supported as well, where it assumes that if there are -multiple talkers, the first half of speaker tiers are associated with the first -channel, and the second half of speaker tiers are associated with the second channel. - -The output from aligning will be a TextGrid with word and phone tiers for -each speaker. - - .. figure:: _static/multiple_speakers_output_textgrid.png - :align: center - :alt: Image cannot be displayed in your browser - -.. note:: - - Intervals in the TextGrid less than 100 milliseconds will not be aligned. - -Transcription normalization and dictionary lookup -================================================= - -If a word is not found in the dictionary, and has no orthographic -markers for morpheme boundaries (apostrophes or hyphens), then it will -be replaced in the output with '<unk>' for unknown word. - -.. note:: - - The list of all unknown words (out of vocabulary words; OOV words) will - be output to a file named ``oovs_found.txt`` - in the output directory, if you would like to add them to the dictionary - you are using. 
To help find any typos in transcriptions, a file named - ``utterance_oovs.txt`` will be put in the output directory and will list - the unknown words per utterance. - -As part of parsing orthographic transcriptions, punctuation is stripped -from the ends of words. In addition, all words are converted to lowercase -so that dictionary lookup is not case-sensitive. - -.. note:: - - The definition of punctuation, clitic markers, and compound markers can be set in a config file, see :ref:`configuration_alignment` - for more details - -Dictionary lookup will attempt to generate the most maximal coverage of -novel forms if they use some overt morpheme boundary in the orthography. - -For instance, in French, clitics are marked using apostrophes between the -bound clitic and the stem. Thus given a dictionary like: - -.. highlight:: none - -:: - - c'est S E - c S E - c' S - etait E T E - un A N - -And two example orthographic transcriptions: - -:: - - c'est un c - c'etait un c - -The normalization would result in the following: - -:: - - c'est un c - c' était un c - -With a pronunciation of: - -:: - - S E A N S E - S E T E A N S E - -The key point to note is that the pronunciation of the clitic ``c'`` is ``S`` -and the pronunciation of the letter ``c`` in French is ``S A``. - -The algorithm will try to associate the clitic marker with either the element -before (as for French clitics) or the element after (as for English clitics -like the possessive marker). The default clitic markers are ``'`` and ``’`` (but they are collapsed into a single -clitic marker, ``'`` by default). - -The default compound marker is a hyphen (``-``). -Compound markers are treated similarly to clitic markers, but they are not associated with one -particular element in the word over another. Instead, they are used to simply split the compound word. -For example, ``merry-go-round`` will -become ``merry go round`` if the hyphenated form is not in the dictionary. 
-If no words are found on splitting the word based on hyphens or apostrophes, -then the word will be treated as a single unit (single unknown word). diff --git a/docs/source/data_prep.rst b/docs/source/data_prep.rst deleted file mode 100644 index bd8bd70b..00000000 --- a/docs/source/data_prep.rst +++ /dev/null @@ -1,51 +0,0 @@ - -.. _`MFA-reorganization-scripts repository`: https://github.com/MontrealCorpusTools/MFA-reorganization-scripts -.. _data_prep: - -**************** -Data preparation -**************** - -Prior to running the aligner, make sure the following are set up: - -1. A pronunciation dictionary for your language should specify the pronunciations of orthographic transcriptions. - -2. The sound files to align. - -3. Orthographic annotations in .lab files for individual sound files (Prosodylab-aligner format) - or in TextGrid intervals for longer sound files (TextGrid format). - -The sound files and the orthographic annotations should be contained in one directory structured as follows:: - - +-- textgrid_corpus_directory - | --- recording1.wav - | --- recording1.TextGrid - | --- recording2.wav - | --- recording2.TextGrid - | --- ... - - +-- prosodylab_corpus_directory - | +-- speaker1 - | --- recording1.wav - | --- recording1.lab - | --- recording2.wav - | --- recording2.lab - | +-- speaker2 - | --- recording3.wav - | --- recording3.lab - | --- ... - - -.. note:: - - A collection of preprocessing scripts to get various corpora of other formats is available in the - `MFA-reorganization-scripts repository`_. - -For details on how to organize each of these three components, see below. - -.. toctree:: - :maxdepth: 3 - - dictionary.rst - sound_files.rst - data_format.rst diff --git a/docs/source/data_validation.rst b/docs/source/data_validation.rst deleted file mode 100644 index 4fd308b5..00000000 --- a/docs/source/data_validation.rst +++ /dev/null @@ -1,80 +0,0 @@ - -.. 
_validating_data: - -*************** -Validating data -*************** - -The validation utility will perform the basic set up that alignment would perform, but analyzes and reports any issues -that the user may want to fix. - -First, the utility parses the corpus and dictionary, prints out summary information about the corpus, -and logs any of the following issues: - -- If there are any words in transcriptions that are not in the dictionary, these are logged as out-of-vocabulary items (OOVs). - A list of these OOVs and which utterances they appear in are saved to text files. -- Any issues reading sound files -- Any issues generating features, skipped if ``--ignore_acoustics`` is flagged -- Any transcription files missing .wav files -- Any .wav files missing transcription files -- Any issues reading transcription files -- Any unsupported sampling rates of .wav files -- Any unaligned files from a basic monophone acoustic model trained on the dataset (or using a supplied acoustic model), - skipped if ``--ignore_acoustics`` is flagged -- Any files that have deviations from their original transcription to decoded transcriptions using a simple language model - - -.. _running_the_validator: - -Running the validation utility -============================== - -Steps to run the validation utility: - -1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that - MFA was installed in. - -2. Run the following command, substituting the arguments with your own paths: - - .. code-block:: bash - - mfa validate corpus_directory dictionary_path [optional_acoustic_model_path] - -The ``corpus_directory`` argument should be a full path to the corpus to validate, following the proper :ref:`data_format`. -The ``dictionary_path`` should be a full path to the pronunciation dictionary you want to use with -the corpus, following the proper :ref:`dictionary`. 
The optional ``acoustic_model_path`` can be used -to test alignment as well as flag potential transcription issues if ``--test_transcriptions`` is present. -The ``acoustic_model_path`` should be either a full path to an acoustic model you've trained, or you can use one of the -:ref:`pretrained_acoustic`. - -Extra options to the validation utility: - -.. option:: -s NUMBER - --speaker_characters NUMBER - - Number of characters to use to identify speakers; if not specified, - the aligner assumes that the directory name is the identifier for the - speaker. Additionally, it accepts the value ``prosodylab`` to use the second field of a ``_`` delimited file name, - following the convention of labelling production data in the ProsodyLab at McGill. - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for aligning, default is ``~/Documents/MFA`` - -.. option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to align faster - -.. option:: --ignore_acoustics - - Prevent validation of feature generation and initial alignment. Using this flag will make validation much faster. - -.. option:: --test_transcriptions - - If flagged, the validation utility will construct simple unigram language model and attempt to decode each segment to - be aligned. Segments are flagged if the decoded transcriptions contain deviations from the original transcriptions. - This is largely experimental feature that may be useful, but may not be always reliable. Cannot be flagged at the - same time as ``--ignore_acoustics`` diff --git a/docs/source/external_links.py b/docs/source/external_links.py new file mode 100644 index 00000000..4f3f621f --- /dev/null +++ b/docs/source/external_links.py @@ -0,0 +1,423 @@ +""" + sphinx.ext.extlinks + ~~~~~~~~~~~~~~~~~~~ + Extension to save typing and prevent hard-coding of base URLs in the reST + files. 
+ This adds a new config value called ``extlinks`` that is created like this:: + extlinks = {'exmpl': ('https://example.invalid/%s.html', caption), ...} + Now you can use e.g. :exmpl:`foo` in your documents. This will create a + link to ``https://example.invalid/foo.html``. The link caption depends on + the *caption* value given: + - If it is ``None``, the caption will be the full URL. + - If it is a string, it must contain ``%s`` exactly once. In this case the + caption will be *caption* with the role content substituted for ``%s``. + You can also give an explicit caption, e.g. :exmpl:`Foo <foo>`. + Both, the url string and the caption string must escape ``%`` as ``%%``. + :copyright: Copyright 2007-2021 by the Sphinx team, see AUTHORS. + :license: BSD, see LICENSE for details. +""" + +from typing import Any, Dict, List, Tuple + +import sphinx +from docutils import nodes, utils +from docutils.nodes import Node, system_message +from docutils.parsers.rst.states import Inliner +from sphinx.application import Sphinx +from sphinx.util import caption_ref_re + +MODEL_TYPE_MAPPING = { + "acoustic": "acoustic model", + "g2p": "g2p model", + "lm": "language model", + "dictionary": "dictionary", + "ivector": "ivector extractor", +} + + +def model_role( + typ: str, + rawtext: str, + text: str, + lineno: int, + inliner: Inliner, + options: Dict = None, + content: List[str] = None, +) -> Tuple[List[Node], List[system_message]]: + text = utils.unescape(text) + model_type, model_name = text.split("/") + full_url = f"https://github.com/MontrealCorpusTools/mfa-models/raw/main/{model_type}/{model_name.lower()}.zip" + title = f"{model_name.title()} {MODEL_TYPE_MAPPING[model_type]}" + pnode = nodes.reference(title, title, internal=False, refuri=full_url) + return [pnode], [] + + +def kaldi_steps_role( + typ: str, + rawtext: str, + text: str, + lineno: int, + inliner: Inliner, + options: Dict = None, + content: List[str] = None, +) -> Tuple[List[Node], List[system_message]]: + text =
utils.unescape(text) + full_url = f"https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5/steps/{text}.sh" + title = f"{text}.sh" + pnode = nodes.reference(title, title, internal=False, refuri=full_url) + return [pnode], [] + + +def kaldi_utils_role( + typ: str, + rawtext: str, + text: str, + lineno: int, + inliner: Inliner, + options: Dict = None, + content: List[str] = None, +) -> Tuple[List[Node], List[system_message]]: + filename = utils.unescape(text) + full_url = f"https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5/utils/{filename}" + title = f"{text}" + pnode = nodes.reference(title, title, internal=False, refuri=full_url) + return [pnode], [] + + +def kaldi_steps_sid_role( + typ: str, + rawtext: str, + text: str, + lineno: int, + inliner: Inliner, + options: Dict = None, + content: List[str] = None, +) -> Tuple[List[Node], List[system_message]]: + text = utils.unescape(text) + full_url = f"https://github.com/kaldi-asr/kaldi/tree/cbed4ff688a172a7f765493d24771c1bd57dcd20/egs/sre08/v1/sid/{text}.sh" + title = f"sid/{text}.sh" + pnode = nodes.reference(title, title, internal=False, refuri=full_url) + return [pnode], [] + + +def kaldi_docs_role( + typ: str, + rawtext: str, + text: str, + lineno: int, + inliner: Inliner, + options: Dict = None, + content: List[str] = None, +) -> Tuple[List[Node], List[system_message]]: + text = utils.unescape(text) + t = text.split("#") + text = t[0] + ref = "" + title = f"Kaldi {' '.join(text.split('_'))} page" + if len(t) > 1: + ref = f"#{t[1]}" + sec = t[1].split("sec_")[1] + title = f"Kaldi {' '.join(sec.split('_'))} section" + full_url = f"http://kaldi-asr.org/doc/{text}.html{ref}" + pnode = nodes.reference(title, title, internal=False, refuri=full_url) + return [pnode], [] + + +def openfst_src_role( + typ: str, + rawtext: str, + text: str, + lineno: int, + inliner: Inliner, + options: Dict = None, + content: List[str] = None, +) -> Tuple[List[Node], List[system_message]]: + text = utils.unescape(text) + full_url 
= f"https://www.openfst.org/doxygen/fst/html/{text}-main_8cc_source.html" + title = f"OpenFst {text} source" + pnode = nodes.reference(title, title, internal=False, refuri=full_url) + return [pnode], [] + + +def kaldi_src_role( + typ: str, + rawtext: str, + text: str, + lineno: int, + inliner: Inliner, + options: Dict = None, + content: List[str] = None, +) -> Tuple[List[Node], List[system_message]]: + text = utils.unescape(text) + mapping = { + "bin": set( + """align-equal align-equal-compiled acc-tree-stats + show-alignments compile-questions cluster-phones + compute-wer compute-wer-bootci make-h-transducer + add-self-loops convert-ali + compile-train-graphs compile-train-graphs-fsts + make-pdf-to-tid-transducer make-ilabel-transducer show-transitions + ali-to-phones ali-to-post weight-silence-post acc-lda est-lda + ali-to-pdf est-mllt build-tree build-tree-two-level decode-faster + decode-faster-mapped vector-scale copy-transition-model + phones-to-prons prons-to-wordali copy-gselect copy-tree scale-post + post-to-weights sum-tree-stats weight-post post-to-tacc copy-matrix + copy-vector copy-int-vector sum-post sum-matrices draw-tree + align-mapped align-compiled-mapped latgen-faster-mapped latgen-faster-mapped-parallel + hmm-info analyze-counts post-to-phone-post + post-to-pdf-post logprob-to-post prob-to-post copy-post + matrix-sum build-pfile-from-ali get-post-on-ali tree-info am-info + vector-sum matrix-sum-rows est-pca sum-lda-accs sum-mllt-accs + transform-vec align-text matrix-dim post-to-smat compile-graph + compare-int-vector latgen-incremental-mapped + compute-gop compile-train-graphs-without-lexicon""".split() + ), + "chainbin": set( + """chain-est-phone-lm chain-get-supervision chain-make-den-fst + nnet3-chain-get-egs nnet3-chain-copy-egs nnet3-chain-merge-egs + nnet3-chain-shuffle-egs nnet3-chain-subset-egs + nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob + nnet3-chain-combine nnet3-chain-normalize-egs + nnet3-chain-e2e-get-egs 
nnet3-chain-compute-post + chain-make-num-fst-e2e + nnet3-chain-train2 nnet3-chain-combine2""".split() + ), + "featbin": set( + """add-deltas add-deltas-sdc append-post-to-feats + append-vector-to-feats apply-cmvn apply-cmvn-sliding compare-feats + compose-transforms compute-and-process-kaldi-pitch-feats + compute-cmvn-stats compute-cmvn-stats-two-channel + compute-fbank-feats compute-kaldi-pitch-feats compute-mfcc-feats + compute-plp-feats compute-spectrogram-feats concat-feats copy-feats + copy-feats-to-htk copy-feats-to-sphinx extend-transform-dim + extract-feature-segments extract-segments feat-to-dim + feat-to-len fmpe-acc-stats fmpe-apply-transform fmpe-est + fmpe-init fmpe-sum-accs get-full-lda-mat interpolate-pitch + modify-cmvn-stats paste-feats post-to-feats + process-kaldi-pitch-feats process-pitch-feats + select-feats shift-feats splice-feats subsample-feats + subset-feats transform-feats wav-copy wav-reverberate + wav-to-duration multiply-vectors paste-vectors""".split() + ), + "fgmmbin": set( + """fgmm-global-acc-stats fgmm-global-sum-accs fgmm-global-est + fgmm-global-merge fgmm-global-to-gmm fgmm-gselect fgmm-global-get-frame-likes + fgmm-global-copy fgmm-global-gselect-to-post fgmm-global-info + fgmm-global-acc-stats-post fgmm-global-init-from-accs""".split() + ), + "fstbin": set( + """fstdeterminizestar + fstrmsymbols fstisstochastic fstminimizeencoded fstmakecontextfst + fstmakecontextsyms fstaddsubsequentialloop fstaddselfloops + fstrmepslocal fstcomposecontext fsttablecompose fstrand + fstdeterminizelog fstphicompose fstcopy + fstpushspecial fsts-to-transcripts fsts-project fsts-union + fsts-concat make-grammar-fst""".split() + ), + "gmmbin": set( + """gmm-init-mono gmm-est gmm-acc-stats-ali gmm-align + gmm-decode-faster gmm-decode-simple gmm-align-compiled + gmm-sum-accs gmm-est-regtree-fmllr gmm-acc-stats-twofeats + gmm-acc-stats gmm-init-lvtln gmm-est-lvtln-trans gmm-train-lvtln-special + gmm-acc-mllt gmm-mixup gmm-init-model 
gmm-transform-means + gmm-make-regtree gmm-decode-faster-regtree-fmllr gmm-post-to-gpost + gmm-est-fmllr-gpost gmm-est-fmllr gmm-est-regtree-fmllr-ali + gmm-est-regtree-mllr gmm-compute-likes + gmm-decode-faster-regtree-mllr gmm-latgen-simple + gmm-rescore-lattice gmm-decode-biglm-faster + gmm-est-gaussians-ebw gmm-est-weights-ebw gmm-latgen-faster gmm-copy + gmm-global-acc-stats gmm-global-est gmm-global-sum-accs gmm-gselect + gmm-latgen-biglm-faster gmm-ismooth-stats gmm-global-get-frame-likes + gmm-global-est-fmllr gmm-global-to-fgmm gmm-global-acc-stats-twofeats + gmm-global-copy gmm-fmpe-acc-stats gmm-acc-stats2 gmm-init-model-flat gmm-info + gmm-get-stats-deriv gmm-est-rescale gmm-boost-silence + gmm-basis-fmllr-accs gmm-basis-fmllr-training gmm-est-basis-fmllr + gmm-est-map gmm-adapt-map gmm-latgen-map gmm-basis-fmllr-accs-gpost + gmm-est-basis-fmllr-gpost gmm-latgen-faster-parallel + gmm-est-fmllr-raw gmm-est-fmllr-raw-gpost gmm-global-init-from-feats + gmm-global-info gmm-latgen-faster-regtree-fmllr gmm-est-fmllr-global + gmm-acc-mllt-global gmm-transform-means-global gmm-global-get-post + gmm-global-gselect-to-post gmm-global-est-lvtln-trans gmm-init-biphone""".split() + ), + "ivectorbin": set( + """ivector-extractor-init ivector-extractor-copy ivector-extractor-acc-stats + ivector-extractor-sum-accs ivector-extractor-est + ivector-extract compute-vad select-voiced-frames + compute-vad-from-frame-likes merge-vads + ivector-normalize-length + ivector-transform ivector-compute-dot-products ivector-mean + ivector-compute-lda ivector-compute-plda + ivector-copy-plda compute-eer + ivector-subtract-global-mean ivector-plda-scoring + logistic-regression-train logistic-regression-eval + logistic-regression-copy ivector-extract-online + ivector-adapt-plda ivector-plda-scoring-dense + agglomerative-cluster""".split() + ), + "kwsbin": set( + """lattice-to-kws-index kws-index-union transcripts-to-fsts + kws-search generate-proxy-keywords compute-atwv 
print-proxy-keywords""".split() + ), + "latbin": set( + """lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest + lattice-lmrescore lattice-scale lattice-union lattice-to-post + lattice-determinize lattice-oracle lattice-rmali + lattice-compose lattice-boost-ali lattice-copy lattice-to-fst + lattice-to-phone-lattice lattice-interp lattice-project + lattice-add-trans-probs lattice-difference + nbest-to-linear nbest-to-lattice lattice-1best linear-to-nbest + lattice-mbr-decode lattice-align-words lattice-to-mpe-post + lattice-copy-backoff nbest-to-ctm lattice-determinize-pruned + lattice-to-ctm-conf lattice-combine + lattice-rescore-mapped lattice-depth lattice-align-phones + lattice-to-smbr-post lattice-determinize-pruned-parallel + lattice-add-penalty lattice-align-words-lexicon lattice-push + lattice-minimize lattice-limit-depth lattice-depth-per-frame + lattice-confidence lattice-determinize-phone-pruned + lattice-determinize-phone-pruned-parallel lattice-expand-ngram + lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons + lattice-arc-post lattice-determinize-non-compact lattice-lmrescore-kaldi-rnnlm + lattice-lmrescore-pruned lattice-lmrescore-kaldi-rnnlm-pruned lattice-reverse + lattice-expand lattice-path-cover lattice-add-nnlmscore""".split() + ), + "nnet2bin": set( + """nnet-am-info nnet-init + nnet-train-simple nnet-train-ensemble nnet-train-transitions nnet-latgen-faster nnet-am-copy + nnet-am-init nnet-insert nnet-align-compiled + nnet-compute-prob nnet-copy-egs nnet-combine + nnet-am-average nnet-am-compute nnet-am-mixup + nnet-get-egs nnet-train-parallel nnet-combine-fast + nnet-subset-egs nnet-shuffle-egs nnet-am-fix + nnet-latgen-faster-parallel nnet-to-raw-nnet nnet-compute + raw-nnet-concat raw-nnet-info + nnet-get-feature-transform nnet-compute-from-egs + nnet-am-widen nnet-show-progress + nnet-get-feature-transform-multi nnet-copy-egs-discriminative + nnet-get-egs-discriminative nnet-shuffle-egs-discriminative + 
nnet-compare-hash-discriminative nnet-combine-egs-discriminative + nnet-train-discriminative-simple nnet-train-discriminative-parallel + nnet-modify-learning-rates nnet-normalize-stddev + nnet-get-weighted-egs nnet-adjust-priors + nnet-replace-last-layers nnet-am-switch-preconditioning + nnet1-to-raw-nnet raw-nnet-copy nnet-relabel-egs nnet-am-reinitialize""".split() + ), + "nnet3bin": set( + """nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs + nnet3-shuffle-egs nnet3-acc-lda-stats nnet3-merge-egs + nnet3-compute-from-egs nnet3-train nnet3-am-init nnet3-am-train-transitions + nnet3-am-adjust-priors nnet3-am-copy nnet3-compute-prob + nnet3-average nnet3-am-info nnet3-combine nnet3-latgen-faster + nnet3-latgen-faster-parallel nnet3-show-progress nnet3-align-compiled + nnet3-copy nnet3-get-egs-dense-targets nnet3-compute + nnet3-discriminative-get-egs nnet3-discriminative-copy-egs + nnet3-discriminative-merge-egs nnet3-discriminative-shuffle-egs + nnet3-discriminative-compute-objf nnet3-discriminative-train + nnet3-discriminative-subset-egs nnet3-get-egs-simple + nnet3-discriminative-compute-from-egs nnet3-latgen-faster-looped + nnet3-egs-augment-image nnet3-xvector-get-egs nnet3-xvector-compute + nnet3-xvector-compute-batched + nnet3-latgen-grammar nnet3-compute-batch nnet3-latgen-faster-batch + nnet3-latgen-faster-lookahead cuda-gpu-available cuda-compiled""".split() + ), + "nnetbin": set( + """nnet-train-frmshuff + nnet-train-perutt + nnet-train-mmi-sequential + nnet-train-mpe-sequential + nnet-train-multistream nnet-train-multistream-perutt + rbm-train-cd1-frmshuff rbm-convert-to-nnet + nnet-forward nnet-copy nnet-info nnet-concat + transf-to-nnet cmvn-to-nnet nnet-initialize + feat-to-post paste-post train-transitions + nnet-set-learnrate""".split() + ), + "online2bin": set( + """online2-wav-gmm-latgen-faster apply-cmvn-online + extend-wav-with-silence compress-uncompress-speex + online2-wav-nnet2-latgen-faster ivector-extract-online2 + 
online2-wav-dump-features ivector-randomize + online2-wav-nnet2-am-compute online2-wav-nnet2-latgen-threaded + online2-wav-nnet3-latgen-faster online2-wav-nnet3-latgen-grammar + online2-tcp-nnet3-decode-faster online2-wav-nnet3-latgen-incremental + online2-wav-nnet3-wake-word-decoder-faster""".split() + ), + "onlinebin": set( + """online-net-client online-server-gmm-decode-faster online-gmm-decode-faster + online-wav-gmm-decode-faster online-audio-server-decode-faster + online-audio-client""".split() + ), + "rnnlmbin": set( + """rnnlm-get-egs rnnlm-train rnnlm-get-sampling-lm + rnnlm-get-word-embedding rnnlm-compute-prob rnnlm-sentence-probs""".split() + ), + "sgmm2bin": set( + """sgmm2-init sgmm2-gselect sgmm2-acc-stats sgmm2-est sgmm2-sum-accs + sgmm2-align-compiled sgmm2-est-spkvecs sgmm2-post-to-gpost + sgmm2-acc-stats-gpost sgmm2-latgen-faster sgmm2-est-spkvecs-gpost + sgmm2-rescore-lattice sgmm2-copy sgmm2-info sgmm2-est-ebw + sgmm2-acc-stats2 sgmm2-comp-prexform sgmm2-est-fmllr sgmm2-project + sgmm2-latgen-faster-parallel init-ubm""".split() + ), + "tfrnnlmbin": set( + """lattice-lmrescore-tf-rnnlm lattice-lmrescore-tf-rnnlm-pruned""".split() + ), + } + for k, v in mapping.items(): + if text in v: + text = f"{k}/{text}" + break + full_url = f"https://github.com/kaldi-asr/kaldi/tree/master/src/{text}.cc" + title = f"{text}.cc" + pnode = nodes.reference(title, title, internal=False, refuri=full_url) + return [pnode], [] + + +def xref( + typ: str, + rawtext: str, + text: str, + lineno: int, + inliner: Inliner, + options: Dict = None, + content: List[str] = None, +) -> Tuple[List[Node], List[system_message]]: + + title = target = text + # check whether an explicit title and target are given with the `title <target>` syntax + brace = text.find("<") + if brace != -1: + m = caption_ref_re.match(text) + if m: + target = m.group(2) + title = m.group(1) + else: + # fallback: everything after '<' is the target + target = text[brace + 1 :] + title = text[:brace] + + link = xref.links[target]
+ + if brace != -1: + pnode = nodes.reference(target, title, refuri=link[1]) + else: + pnode = nodes.reference(target, link[0], refuri=link[1]) + + return [pnode], [] + + +def get_refs(app): + xref.links = app.config.xref_links + + +def setup(app: Sphinx) -> Dict[str, Any]: + app.add_config_value("xref_links", {}, "env") + app.add_role("mfa_model", model_role) + app.add_role("kaldi_steps", kaldi_steps_role) + app.add_role("kaldi_utils", kaldi_utils_role) + app.add_role("kaldi_steps_sid", kaldi_steps_sid_role) + app.add_role("kaldi_src", kaldi_src_role) + app.add_role("openfst_src", openfst_src_role) + app.add_role("kaldi_docs", kaldi_docs_role) + app.add_role("xref", xref) + app.connect("builder-inited", get_refs) + return {"version": sphinx.__display_version__, "parallel_read_safe": True} diff --git a/docs/source/example.rst b/docs/source/first_steps/example.rst similarity index 93% rename from docs/source/example.rst rename to docs/source/first_steps/example.rst index 664ac494..a23cea60 100644 --- a/docs/source/example.rst +++ b/docs/source/first_steps/example.rst @@ -33,8 +33,8 @@ Example 1: Aligning LibriSpeech (English) Set up ------ -1. Ensure you have installed MFA via :ref:`installation`. -2. Ensure you have downloaded the pretrained model via :code:`mfa download acoustic english` +1. Ensure you have installed MFA via :ref:`installation_ref`. +2. Ensure you have downloaded the pretrained model via :code:`mfa model download acoustic english` 3. Download the prepared LibriSpeech dataset (`LibriSpeech data set`_) and extract it somewhere on your computer 4. Download the LibriSpeech lexicon (`LibriSpeech lexicon`_) and save it somewhere on your computer @@ -69,8 +69,8 @@ Example 2: Generate Mandarin dictionary Set up ------ -1. Ensure you have installed MFA via :ref:`installation`. -2. Ensure you have downloaded the pretrained model via :code:`mfa download g2p mandarin_pinyin_g2p` +1. Ensure you have installed MFA via :ref:`installation_ref`. +2. 
Ensure you have downloaded the pretrained model via :code:`mfa model download g2p mandarin_pinyin_g2p` 3. Download the prepared Mandarin dataset from (`example Mandarin corpus`_) and extract it somewhere on your computer .. note:: @@ -102,7 +102,7 @@ Example 3: Train Mandarin G2P model Set up ------ -1. Ensure you have installed MFA via :ref:`installation`. +1. Ensure you have installed MFA via :ref:`installation_ref`. 2. Download the prepared Mandarin dictionary from (`example Mandarin dictionary`_) In the same environment that you've installed MFA, enter the following command into the terminal: diff --git a/docs/source/first_steps/index.rst b/docs/source/first_steps/index.rst new file mode 100644 index 00000000..e9953051 --- /dev/null +++ b/docs/source/first_steps/index.rst @@ -0,0 +1,190 @@ + + +.. _first_steps: + +*********** +First steps +*********** + +The ``mfa`` command line utility has grown over the years to encompass a number of utility functions. This section aims to provide a path for first-time users to figure out the workflow that works best for them. + +Also check out :ref:`tutorials` for external tutorials or blog posts on specific topics. + +Use cases +========= + +There are several broad use cases that you might want to use MFA for. Take a look below and if any are close matches, you should be able to apply the linked instructions to your data. + +#. **Use case 1:** You have a speech corpus, the language involved is in the list of :ref:`pretrained_acoustic_models` and the list of :ref:`pretrained_dictionaries`. + + #. Follow :ref:`first_steps_align_pretrained` to generate aligned TextGrids + +#. **Use case 2:** You have a speech corpus, the language involved is in the list of :ref:`pretrained_acoustic_models` and the list of :ref:`pretrained_g2p`, but not on the list of :ref:`pretrained_dictionaries`. + + #. Follow :ref:`first_steps_g2p_pretrained` to generate a dictionary + #. 
Use the generated dictionary in :ref:`first_steps_align_pretrained` to generate aligned TextGrids + +#. **Use case 3:** You have a speech corpus, a pronunciation dictionary, but there is no pretrained acoustic model for the language (or none that have the same phones as the pronunciation dictionary) + + #. Follow :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids + +#. **Use case 4:** You have a speech corpus, a pronunciation dictionary, but it does not have great coverage of the words in the corpus. + + #. Follow :ref:`first_steps_train_g2p` to train a G2P model + #. Use the trained G2P model in :ref:`first_steps_g2p_pretrained` to generate a pronunciation dictionary + #. Use the generated pronunciation dictionary in :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids + +.. _first_steps_align_pretrained: + +Aligning a speech corpus with existing pronunciation dictionary and acoustic model +---------------------------------------------------------------------------------- + +For the purposes of this example, we'll use the "english" model, but the instructions will be applicable to any pretrained acoustic model/pronunciation dictionary pairing. We'll also assume that you have done nothing else with MFA other than follow the :ref:`installation_ref` instructions and you have the :code:`mfa` command working. Finally, we'll assume that your data is stored in the folder :code:`~/mfa_data/my_corpus`, so when working with your data, this will be the main thing to update. + +First we'll need the pretrained models and dictionary. These are installed via the :code:`mfa model download` command: + +.. code-block:: + + mfa model download acoustic english + mfa model download dictionary english + +You should be able to run :code:`mfa model inspect acoustic english` and it will output information about the :code:`english` acoustic model. 
+ +Next, we want to make sure that the dataset is in the proper format for MFA, which is what the :code:`mfa validate` command does: + +.. code-block:: + + mfa validate ~/mfa_data/my_corpus english english + +This command will look through the corpus and make sure that MFA is parsing everything correctly. There are a couple of different types of :ref:`corpus_structure` that MFA supports, but in general the core requirement is that you should have pairs of sound files and transcription files with the same name (except for the extension). Take a look over the validator output and make sure that the number of speakers and number of files and utterances match your expectations, and that the number of Out of Vocabulary (OOV) items is not too high. If you want to generate transcriptions for these words so that they can be aligned, see :ref:`first_steps_g2p_pretrained` to make a new dictionary. The validator will also attempt to run feature generation and train a simple monophone model to make sure that everything works within Kaldi. + +Once we've validated the data, we can align it via the :code:`mfa align` command: + +.. code-block:: + + mfa align ~/mfa_data/my_corpus english english ~/mfa_data/my_corpus_aligned + +If alignment is successful, you'll see TextGrid files containing the aligned words and phones in the output directory (here :code:`~/mfa_data/my_corpus_aligned`). If there were issues in exporting the TextGrids, you'll see them listed in the output directory. If your corpus is large, you'll likely want to increase the number of jobs that MFA uses. For that and more advanced configuration, see :ref:`pretrained_alignment`. + +.. note:: + + Please see :ref:`alignment_example` for an example using toy data. + + +..
_first_steps_g2p_pretrained: + +Generating a pronunciation dictionary with a pretrained G2P model +----------------------------------------------------------------- + +For the purposes of this example, we'll use the "english" model, but the instructions will be applicable to any pretrained G2P model. We'll also assume that you have done nothing else with MFA other than follow the :ref:`installation_ref` instructions and you have the :code:`mfa` command working. Finally, we'll assume that your corpus is stored in the folder :code:`~/mfa_data/my_corpus`, so when working with your data, this will be the main thing to update. + +First we'll need the pretrained G2P model. These are installed via the :code:`mfa model download` command: + +.. code-block:: + + mfa model download g2p english_g2p + +You should be able to run :code:`mfa model inspect g2p english_g2p` and it will output information about the :code:`english_g2p` G2P model. + +Depending on your use case, you might have a list of words to run G2P over, or just a corpus of sound and transcription files. The :code:`mfa g2p` command can process either: + +.. code-block:: + + mfa g2p english_g2p ~/mfa_data/my_corpus ~/mfa_data/new_dictionary.txt # If using a corpus + mfa g2p english_g2p ~/mfa_data/my_word_list.txt ~/mfa_data/new_dictionary.txt # If using a word list + +Running one of the above will output a text file pronunciation dictionary in the format that MFA uses (:ref:`dictionary_format`). I recommend looking over the pronunciations generated and making sure that they look sensible. For languages where the orthography is not transparent, it may be helpful to include :code:`--num_pronunciations 3` so that more pronunciations are generated than just the most likely one. For more details on running G2P, see :ref:`g2p_dictionary_generating`. + +From here you can use this dictionary file as input to any MFA command that uses dictionaries, e.g. + +..
code-block:: + + mfa align ~/mfa_data/my_corpus ~/mfa_data/new_dictionary.txt english ~/mfa_data/my_corpus_aligned + + +.. note:: + + Please see :ref:`dict_generating_example` for an example using toy data. + +.. _first_steps_align_train_acoustic_model: + +Training a new acoustic model on a corpus +----------------------------------------- + +For the purposes of this example, we'll also assume that you have done nothing else with MFA other than follow the :ref:`installation_ref` instructions and you have the :code:`mfa` command working. We'll assume that your corpus data is stored in the folder :code:`~/mfa_data/my_corpus` and that you have a pronunciation dictionary at :code:`~/mfa_data/my_dictionary.txt`, so when working with your data, these paths will be the main thing to update. + +The first thing we want to do is to make sure that the dataset is in the proper format for MFA, which is what the :code:`mfa validate` command does: + +.. code-block:: + + mfa validate ~/mfa_data/my_corpus ~/mfa_data/my_dictionary.txt + +This command will look through the corpus and make sure that MFA is parsing everything correctly. There are a couple of different types of :ref:`corpus_structure` that MFA supports, but in general the core requirement is that you should have pairs of sound files and transcription files with the same name (except for the extension). Take a look over the validator output and make sure that the number of speakers and number of files and utterances match your expectations, and that the number of Out of Vocabulary (OOV) items is not too high. If you want to generate transcriptions for these words so that they can be aligned, see :ref:`first_steps_train_g2p` and :ref:`first_steps_g2p_pretrained` to make a new dictionary. The validator will also attempt to run feature generation and train a simple monophone model to make sure that everything works within Kaldi.
+ +Once we've validated the data, we can train an acoustic model (and output the aligned TextGrids if we want) via the :code:`mfa train` command: + +.. code-block:: + + mfa train ~/mfa_data/my_corpus ~/mfa_data/my_dictionary.txt ~/mfa_data/new_acoustic_model.zip # Export just the trained acoustic model + mfa train ~/mfa_data/my_corpus ~/mfa_data/my_dictionary.txt ~/mfa_data/my_corpus_aligned # Export just the training alignments + mfa train ~/mfa_data/my_corpus ~/mfa_data/my_dictionary.txt ~/mfa_data/new_acoustic_model.zip ~/mfa_data/my_corpus_aligned # Export both trained model and alignments + +As for other commands, if your data is large, you'll likely want to increase the number of jobs that MFA uses. For that and more advanced configuration of the training command, see :ref:`train_acoustic_model`. + +If training was successful, you'll now see the TextGrids in the output directory, assuming you wanted to export them. The TextGrid export is identical to what you would get by running :code:`mfa align` with the trained acoustic model. + +If you choose to export the acoustic model, you can now use this model for other utilities and use cases, such as refining your pronunciation dictionary through :ref:`training_dictionary` or :ref:`transcribing` for new data. If you would like to store the exported acoustic model for easy reference like the downloaded pretrained models, you can save it via :code:`mfa model save`: + +.. code-block:: + + mfa model save acoustic ~/mfa_data/new_acoustic_model.zip + +You can then run :code:`mfa model inspect` on it: + +.. code-block:: + + mfa model inspect acoustic new_acoustic_model + +Or use it as a reference in other MFA commands. + + +..
_first_steps_train_g2p: + +Training a G2P model from a pronunciation dictionary +---------------------------------------------------- + +For the purposes of this example, we'll also assume that you have done nothing else with MFA other than follow the :ref:`installation_ref` instructions and you have the :code:`mfa` command working. Finally, we'll assume that your pronunciation dictionary is stored as :code:`~/mfa_data/my_dictionary.txt` and that it fits the :ref:`dictionary_format`. + + +To train the G2P model, we use the :code:`mfa train_g2p` command: + +.. code-block:: + + mfa train_g2p ~/mfa_data/my_dictionary.txt ~/mfa_data/my_g2p_model.zip + +As for other commands, if your dictionary is large, you'll likely want to increase the number of jobs that MFA uses. For that and more advanced configuration of the training command, see :ref:`g2p_model_training`. + +Once the G2P model is trained, you should see the exported archive in the folder. From here, we can save it for future use, or use the full path directly for generating pronunciations of new words. + +.. code-block:: + + mfa model save g2p ~/mfa_data/my_g2p_model.zip + + mfa g2p my_g2p_model ~/mfa_data/my_new_word_list.txt ~/mfa_data/my_new_dictionary.txt + + # Or + + mfa g2p ~/mfa_data/my_g2p_model.zip ~/mfa_data/my_new_word_list.txt ~/mfa_data/my_new_dictionary.txt + +Take a look at :ref:`first_steps_g2p_pretrained` with this new model for a more detailed walk-through of generating a dictionary. + +.. note:: + + Please see :ref:`g2p_model_training_example` for an example using toy data. + +..
toctree:: + :maxdepth: 1 + :hidden: + + example + tutorials diff --git a/docs/source/tutorials.rst b/docs/source/first_steps/tutorials.rst similarity index 85% rename from docs/source/tutorials.rst rename to docs/source/first_steps/tutorials.rst index b07c4237..ebcda3ff 100644 --- a/docs/source/tutorials.rst +++ b/docs/source/first_steps/tutorials.rst @@ -16,7 +16,7 @@ External tutorials I will try to keep this updated with a list of in-depth tutorials for using MFA. If you write up anything that could be included here, please let me know by `filing an issue`_ and I will add it. -* (Version 1.0) `MFA section in Eleanor Chodroff's excellent corpus phonetics tutorial series`_ +* `MFA section in Eleanor Chodroff's excellent corpus phonetics tutorial series`_ (Written using version 1.0, the API and command line utilities have changed dramatically for Version 2.0) * `Bootstrapping an IPA dictionary for English using Montreal Forced Aligner 2.0`_ * `Update on Montreal Forced Aligner performance`_ * `Speaker dictionaries and multilingual IPA`_ diff --git a/docs/source/g2p_dictionary_generating.rst b/docs/source/g2p_dictionary_generating.rst deleted file mode 100644 index 608c0ee9..00000000 --- a/docs/source/g2p_dictionary_generating.rst +++ /dev/null @@ -1,76 +0,0 @@ - - -.. _g2p_dictionary_generating: - -*********************** -Generating a dictionary -*********************** - -We have trained several G2P models that are available for download (:ref:`pretrained_g2p`). - -.. warning:: - - Please note that G2P models trained prior to 2.0 cannot be used with MFA 2.0. If you would like to use - these models, please use the the 1.0.1 or 1.1 g2p utilities or retrain a new G2P model following - :ref:`g2p_model_training`. - -To construct a pronunciation dictionary from your .lab or .TextGrid files, simply input: - -.. 
code-block:: bash - - mfa g2p g2p_model_path input_path output_path - -The argument ``g2p_model_path`` can either be a fully specified path to a G2P model you've trained previously -or one that you've downloaded via the :code:`mfa download g2p` command (see :ref:`pretrained_g2p`). The -``input_path`` argument can either be a text file of words to generate transcriptions for or a corpus directory that -will be inspected for text transcripts and a word list will be compiled and pronunciations generated. The -``output_path`` argument is the full path to where the resulting pronunciation dictionary should be saved. - -.. note:: - - Generating pronunciations to supplement your existing pronunciation - dictionary can be done by running the validation utility (see :ref:`running_the_validator`), and then use the path - to the ``oovs_found.txt`` file that it generates. - - -Pronunciation dictionaries can also be generated from the orthographies of the words themselves, rather than relying on -a trained G2P model. This functionality should be reserved for languages with transparent orthographies, close to 1-to-1 -grapheme-to-phoneme mapping. - -.. code-block:: bash - - mfa g2p input_path output_path - -Extra options (see :ref:`configuration_g2p` for full configuration details): - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for generating dictionary, default is ``~/Documents/MFA`` - -.. option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to generate pronunciations faster - -.. option:: -c - --clean - - Forces removal of temporary files under ``~/Documents/MFA`` or the specified temporary directory - prior to generating the dictionary. - -.. option:: --config_path - - Path to a configuration yaml for G2P generation (see :ref:`default_g2p_config` for an example yaml file) - -.. 
option:: -n NUMBER - --num_pronunciations NUMBER - - Number of pronunciation variants to generate per word, the default is 1 - -.. option:: --include_bracketed - - Flag for whether to generate pronunciations for words that are enclosed in brackets (i.e., [...], (...), <...>) - -See :ref:`dict_generating_example` for an example of how to use G2P functionality with a premade example. diff --git a/docs/source/g2p_model_training.rst b/docs/source/g2p_model_training.rst deleted file mode 100644 index ea37f3fd..00000000 --- a/docs/source/g2p_model_training.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. _`Pynini`: https://github.com/kylebgormon/Pynini -.. _`Sigmorphon 2020 G2P task baseline`: https://github.com/sigmorphon/2020/tree/master/task1/baselines/fst - -.. _g2p_model_training: - -************************ -Training a new G2P model -************************ - -Another tool included with MFA allows you to train a G2P (Grapheme to Phoneme) model automatically from a given -pronunciation dictionary. -This type of model can be used for :ref:`g2p_dictionary_generating`. -It requires a pronunciation dictionary with each line consisting of the orthographic transcription followed by the -phonetic transcription. The model is generated using the `Pynini`_ package, which generates FST (finite state transducer) -files. The implementation is based on that in the `Sigmorphon 2020 G2P task baseline`_. -The G2P model output will be a .zip file like the acoustic model generated from alignment. - -To train a model from a pronunciation dictionary, the following command is used: - -.. code-block:: bash - - mfa train_g2p dictionary_path output_model_path - -The ``dictionary_path`` should be a full path to a pronunciation dictionary to train the model from. The -``output_model_path`` is the path to save the resulting G2P model. - -Extra options (see :ref:`configuration_g2p` for full configuration details): - -.. 
option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for training, default is ``~/Documents/MFA`` - -.. option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to train the G2P model faster - -.. option:: --config_path - - Path to a configuration yaml for G2P model training (see :ref:`default_train_g2p_config` for an example yaml file) - -.. cmdoption:: --validate - - Run a validation on the dictionary with 90% of the data as training and 10% as test. It will output the percentage - accuracy of pronunciations generated. - -.. option:: -v - --verbose - - Print more messages to the command line output (see also, the log files in the MFA temporary directory for the training) - -.. option:: -c - --clean - - Forces removal of temporary files under ``~/Documents/MFA`` or the specified temporary directory - prior to training the model. - -.. note:: - - See :ref:`g2p_model_training_example` for an example of how to train a G2P model with a premade toy example. diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst new file mode 100644 index 00000000..55b8e329 --- /dev/null +++ b/docs/source/getting_started.rst @@ -0,0 +1,73 @@ + +.. _`Conda forge`: https://conda-forge.org/ + +.. _getting_started_ref: + +*************** +Getting started +*************** + + +Installation +------------ + +.. panels:: + :card: + install-card + :column: col-lg-6 col-md-6 col-sm-12 col-xs-12 p-3 + + Installing with conda + ^^^^^^^^^^^^^^^^^^^^^ + + MFA is now on `Conda forge`_ + and can be installed with Anaconda or Miniconda: + + +++ + + .. code-block:: bash + + conda config --add channels conda-forge + conda install montreal-forced-aligner + + +++ + + .. 
link-button:: https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html + :type: url + :text: Install Conda + :classes: btn-block btn-primary btn-navigation stretched-link + + + --- + + In-depth instructions + ^^^^^^^^^^^^^^^^^^^^^ + + Want to learn more about installing? Want to use G2P commands on Windows? + + +++ + + .. link-button:: installation_ref + :type: ref + :text: To the installation guide + :classes: btn-block btn-primary btn-navigation stretched-link + + --- + :column: col-12 p-3 + + First steps + ^^^^^^^^^^^ + + First time using MFA? Want a walkthrough of a specific use case? + + + .. link-button:: first_steps + :type: ref + :text: First steps + :classes: btn-block btn-primary btn-navigation + + +.. toctree:: + :maxdepth: 1 + :hidden: + + installation + first_steps/index diff --git a/docs/source/index.rst b/docs/source/index.rst index 00d830fb..62ab4016 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,37 +1,85 @@ -.. Montreal Forced Aligner documentation master file, created by - sphinx-quickstart on Wed Jun 15 13:27:38 2016. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. -Welcome to Montreal Forced Aligner's documentation! -=================================================== +Montreal Forced Aligner documentation +===================================== -Contents: +.. panels:: + :card: + intro-card text-center + :column: col-lg-6 col-md-6 col-sm-6 col-xs-12 d-flex + + --- + .. raw:: html + + + + Getting started + ^^^^^^^^^^^^^^^ + + Install the Montreal Forced Aligner and get started with examples and tutorials. + + +++ + + .. link-button:: getting_started_ref + :type: ref + :text: Install MFA + :classes: btn-block btn-primary btn-navigation stretched-link + + --- + .. raw:: html + + + + First steps + ^^^^^^^^^^^ + + Have a particular use case for MFA? + + Check out the first steps tutorials. + + +++ + + .. 
link-button:: first_steps + :type: ref + :text: First steps + :classes: btn-block btn-primary btn-navigation stretched-link + + --- + .. raw:: html + + + + User guide + ^^^^^^^^^^ + + The User Guide gives more details on input formats, available commands, and details on the various workflows available. + + +++ + + .. link-button:: user_guide + :type: ref + :text: User guide + :classes: btn-block btn-primary btn-navigation stretched-link + + --- + .. raw:: html + + + + API reference + ^^^^^^^^^^^^^ + + The API guide lists all the inner workings of MFA, the modules and classes that you can import and use in your own scripts and projects, along with details about the Kaldi functionality used. + + +++ + + .. link-button:: mfa_api + :type: ref + :text: Reference guide + :classes: btn-block btn-primary btn-navigation stretched-link .. toctree:: - :maxdepth: 3 - - introduction.rst - news.rst - installation.rst - tutorials.rst - commands.rst - data_prep.rst - data_validation.rst - aligning.rst - example.rst - corpus_creation.rst - g2p.rst - configuration.rst - annotator.rst - pretrained_models.rst - changelog/changelog_index.rst - api_reference/api_index.rst - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` + :hidden: + + Getting started + User guide + API reference + Release notes diff --git a/docs/source/installation.rst b/docs/source/installation.rst index b5a7005c..bfe2ea86 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -1,71 +1,105 @@ -.. _`Montreal Forced Aligner releases`: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases - -.. _`Kaldi GitHub repository`: https://github.com/kaldi-asr/kaldi - -.. _installation: +.. _installation_ref: ************ Installation ************ +.. important:: + + Kaldi and MFA are now built on :xref:`conda_forge` |:tada:|, so installation of third party binaries is wholly through conda from 2.0.0b4 onwards. 
Installing MFA via conda will pick up Kaldi as well. + + +All platforms ============= +1. Install Anaconda/Miniconda (https://docs.conda.io/en/latest/miniconda.html) +2. Create new environment and install MFA: :code:`conda create -n aligner -c conda-forge montreal-forced-aligner` + + a. You can enable the :code:`conda-forge` channel by default by running :code:`conda config --add channels conda-forge` in order to omit the :code:`-c conda-forge` from these commands + +3. Ensure you're in the new environment created (:code:`conda activate aligner`) + +Upgrading from non-conda version +================================ + +In general, it's recommended to create a new environment. If you want to update an existing environment instead: + +1. Activate your conda environment (i.e., :code:`conda activate aligner`) +2. Upgrade all packages via :code:`conda update --all` +3. Run :code:`pip uninstall montreal-forced-aligner` (to clean up previous pip installation) +4. Run :code:`conda install -c conda-forge montreal-forced-aligner` + .. warning:: - Windows native install is not fully supported in 2.0. G2P functionality will be unavailable due to Pynini supporting - only Linux and MacOS. To use G2P functionality on Windows, please set up the "Windows Subsystem - For Linux" and use the Bash console to continue the instructions. + Windows native install is not fully supported in 2.0. G2P functionality will be unavailable due to Pynini supporting only Linux and MacOS. To use G2P functionality on Windows, please set up the :xref:`wsl` and use the Bash console to continue the instructions. -1. Install Anaconda/Miniconda (https://docs.conda.io/en/latest/miniconda.html) -2. Create new environment: +Supported functionality +======================= - b. On Linux/MacOS: :code:`conda create -n aligner -c conda-forge kaldi sox python=3.8 openfst pynini ngram baumwelch` +Currently in the 2.0 beta, supported functionality is fragmented across platforms. Native support for features +is as follows. 
Note that Windows can use Windows Subsystem for Linux to use the Linux version as necessary. - b. On Windows (no G2P functionality), use the command :code:`conda create -n aligner -c conda-forge kaldi sox python=3.8` +.. list-table:: + :header-rows: 1 + :stub-columns: 1 -3. Ensure you're in the new environment created (:code:`conda activate aligner`) -4. Install via pip :code:`pip install montreal-forced-aligner` + * - Feature + - Linux support + - Windows support + - MacOS support -To upgrade to the latest version of MFA: + * - Alignment + - .. raw:: html -1. Activate your conda environment (i.e., :code:`conda activate aligner`) -2. Run :code:`pip install montreal-forced-aligner -U` + Yes + - .. raw:: html -.. note:: + Yes + - .. raw:: html - MFA 2.0.0a5 and earlier used Pynini version 2.1.0. As of 2.0.0a6, versions have been upgraded to the latest version - of Pynini, but there were some breaking changes, so please be sure to upgrade via :code:`conda upgrade -c conda-forge kaldi sox openfst pynini ngram baumwelch` - if you installed a previous 2.0 alpha version to ensure correct performance. + Yes -.. note:: + * - G2P training + - .. raw:: html - Kaldi is now built on conda-forge, so installation of third party binaries is wholly through conda from 2.0.0b4 onwards. - I plan to put MFA on conda-forge eventually as well, so installation will be handled just through that. + Yes + - .. raw:: html + No + - .. raw:: html -Files created when using the Montreal Forced Aligner -==================================================== + Yes -The aligner will save data and logs for the models it trains in a new folder, -``Documents/MFA`` (which it creates in your user's home directory). If a model for a corpus already -exists in MFA, it will use any existing models if you try to align it again. -(If this is not desired, delete or move the old model folder or use the ``--clean`` flag.) 
-You can specify your own temporary directory by using the ``-t`` -flag when calling the executable or by changing the default, see :ref:`configuration` for more details. + * - G2P generation + - .. raw:: html -Supported functionality -======================= + Yes + - .. raw:: html -Currently in the 2.0 alpha, supported functionality is somewhat fragmented across platforms. Native support for features -is as follows. Note that Windows can use Windows Subsystem for Linux to use the Linux version as necessary. + No + - .. raw:: html + + Yes + + * - Transcription + - .. raw:: html + + Yes + - .. raw:: html + + Yes + - .. raw:: html + + Yes + + * - Training language model + - .. raw:: html + + Yes + - .. raw:: html -.. csv-table:: - :header: "Feature", "Linux support", "Windows support", "MacOS support" + No + - .. raw:: html - "Alignment", "Yes", "Yes", "Yes" - "G2P", "Yes", "No", "Yes" - "Transcribe", "Yes", "Yes", "Yes" - "Train LM", "Yes", "No", "Yes" - "Train dictionary", "Yes", "Yes", "Yes" + Yes diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst deleted file mode 100644 index 21ac6337..00000000 --- a/docs/source/pretrained_models.rst +++ /dev/null @@ -1,300 +0,0 @@ -.. _`Arabic acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/arabic.zip - -.. _`Bulgarian acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/bulgarian.zip - -.. _`Croatian acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/croatian.zip - -.. _`Czech acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/czech.zip - -.. _`English acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/english.zip - -.. _`French (FR) acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/french.zip - -.. 
_`French (Prosodylab) acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/french_prosodylab.zip - -.. _`French (QC) acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/french_qc.zip - -.. _`German acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/german.zip - -.. _`German (Prosodylab) acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/german_prosodylab.zip - -.. _`Hausa acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/hausa.zip - -.. _`Japanese acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/japanese.zip - -.. _`Korean acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/korean.zip - -.. _`Mandarin acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/mandarin.zip - -.. _`Polish acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/polish.zip - -.. _`Portuguese acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/portuguese.zip - -.. _`Russian acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/russian.zip - -.. _`Spanish acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/spanish.zip - -.. _`Swahili acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/swahili.zip - -.. _`Swedish acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/swedish.zip - -.. _`Tamil acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/tamil.zip - -.. _`Thai acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/thai.zip - -.. _`Turkish acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/turkish.zip - -.. 
_`Ukrainian acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/ukrainian.zip - -.. _`Vietnamese acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/vietnamese.zip - -.. _`Vietnamese (vPhon) acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/vietnamese_vphon.zip - -.. _`Wu acoustic model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/wu.zip - - -.. _`Pynini`: https://github.com/kylebgormon/Pynini -.. _`Sigmorphon 2020 G2P task baseline`: https://github.com/sigmorphon/2020/tree/master/task1/baselines/fst - - -.. _`Arabic G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/arabic_g2p.zip - -.. _`Bulgarian G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/bulgarian_g2p.zip - -.. _`Croatian G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/croatian_g2p.zip - -.. _`Czech G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/czech_g2p.zip - -.. _`English G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/english_g2p.zip - -.. _`French G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/french_g2p.zip - -.. _`French (Lexique) G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/french_lexique_g2p.zip - -.. _`French (ProsodyLab) G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/french_prosodylab_g2p.zip - -.. _`German G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/german_g2p.zip - -.. _`German (ProsodyLab) G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/german_prosodylab_g2p.zip - -.. _`Hausa G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/hausa_g2p.zip - -.. _`Japanese G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/japanese_character_g2p.zip - -.. 
_`Korean Hangul G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/korean_hangul_g2p.zip - -.. _`Korean Jamo G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/korean_jamo_g2p.zip - -.. _`Mandarin Pinyin G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/mandarin_pinyin_g2p.zip - -.. _`Mandarin Character G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/mandarin_character_g2p.zip - -.. _`Polish G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/polish_g2p.zip - -.. _`Portuguese G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/portuguese_g2p.zip - -.. _`Russian G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/russian_g2p.zip - -.. _`Spanish G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/spanish_g2p.zip - -.. _`Swahili G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/swahili_g2p.zip - -.. _`Swedish G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/swedish_g2p.zip - -.. _`Thai G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/thai_g2p.zip - -.. _`Turkish G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/turkish_g2p.zip - -.. _`Ukrainian G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/ukrainian_g2p.zip - -.. _`Vietnamese G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/vietnamese_g2p.zip - -.. _`Vietnamese (vPhon) G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/vietnamese_vphon_g2p.zip - -.. _`Wu G2P model`: https://github.com/MontrealCorpusTools/mfa-models/raw/main/g2p/wu_g2p.zip - -.. _`ProsodyLab dictionary repository`: https://github.com/prosodylab/prosodylab.dictionaries - -.. _`Lexique`: http://www.lexique.org/ - -.. 
_`ProsodyLab French dictionary`: https://github.com/prosodylab/prosodylab.dictionaries/raw/master/fr.dict - -.. _`English pronunciation dictionary`: https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/english.dict -.. _`French Prosodylab dictionary`: https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/fr.dict -.. _`German Prosodylab dictionary`: https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/de.dict -.. _`TalnUPF Spanish IPA dictionary`: https://raw.githubusercontent.com/TalnUPF/phonetic_lexica/master/es/es_lexicon-IPA.txt -.. _`TalnUPF Spanish gpA dictionary`: https://raw.githubusercontent.com/TalnUPF/phonetic_lexica/master/es/es_lexicon-gpA.txt -.. _`TalnUPF Catalan IPA dictionary`: https://raw.githubusercontent.com/TalnUPF/phonetic_lexica/master/ca/ca_lexicon-IPA.txt - -.. _`GlobalPhone language models`: https://www.csl.uni-bremen.de/GlobalPhone/ - -.. _`LibriSpeech language models`: https://www.openslr.org/11/ - -.. _`FalaBrasil language models`: https://gitlab.com/fb-asr/fb-asr-resources/kaldi-resources/-/tree/main/lm -.. _`FalaBrasil dictionary`: https://gitlab.com/fb-nlp/nlp-resources/-/tree/main/res - -.. _pretrained_models: - -***************** -Pretrained models -***************** - -The command for interacting with MFA models is :code:`mfa model`. The subcommands allow for inspecting currently saved pretrained models, downloading ones from MFA's model repo, and saving models you have trained to be used with a simple name rather than the full path each time. - -Following installation of MFA, :code:`mfa model list acoustic` will not list any models. If you want to download the default English model trained on Librispeech, you can run :code:`mfa model download acoustic english`. At which point, the previous ``list`` command will output "english" as an option. 
When referring to an acoustic model in another MFA command, rather than the full path to the acoustic model, you can now supply just ``english`` and MFA will resolve it to the saved path. - -Similarly, if you train a new model, you can run :code:`mfa model save acoustic /path/where/the/model/was/saved.zip`, then this model will be available via ``saved`` in the future. The name defaults to whatever the archive is called without the directory or extension. You can modify this name with the ``--name NEWNAME`` option - -There are a number of pretrained models for aligning and generating pronunciation dictionaries. The command -for downloading these is :code:`mfa download ` where ``model_type`` is one of ``acoustic``, ``g2p``, or -``dictionary``. - -.. _pretrained_acoustic: - -Pretrained acoustic models -========================== - -As part of using the Montreal Forced Aligner in our own research, we have trained acoustic models for a number of languages. -If you would like to use them, please download them below. Please note the dictionary that they were trained with to -see more information about the phone set. When using these with a pronunciation dictionary, the phone sets must be -compatible. If the orthography of the language is transparent, it is likely that we have a G2P model that can be used -to generate the necessary pronunciation dictionary. - -Any of the following acoustic models can be downloaded with the command :code:`mfa download acoustic `. You -can get a full list of the currently available acoustic models via :code:`mfa download acoustic`. New models contributed -by users will be periodically added. If you would like to contribute your trained models, please contact Michael McAuliffe -at michael.e.mcauliffe@gmail.com. - -.. 
csv-table:: - :header: "Language", "Link", "Corpus", "Number of speakers", "Audio (hours)", "Phone set" - - "Arabic", `Arabic acoustic model`_, "GlobalPhone", 80, 19.0, "GlobalPhone" - "Bulgarian", `Bulgarian acoustic model`_, "GlobalPhone", 79, 21.4, "GlobalPhone" - "Croatian", `Croatian acoustic model`_, "GlobalPhone", 94, 15.9, "GlobalPhone" - "Czech", `Czech acoustic model`_, "GlobalPhone", 102, 31.7, "GlobalPhone" - "English", `English acoustic model`_, "LibriSpeech", 2484, 982.3, "Arpabet (stressed)" - "French (FR)", `French (FR) acoustic model`_, "GlobalPhone", 100, 26.9, "GlobalPhone" - "French (FR)", `French (Prosodylab) acoustic model`_, "GlobalPhone", 100, 26.9, "Prosodylab [1]_" - "French (QC)", `French (QC) acoustic model`_, "Lab speech", "N/A", "N/A", "Prosodylab [1]_" - "German", `German acoustic model`_, "GlobalPhone", 77, 18, "GlobalPhone" - "German", `German (Prosodylab) acoustic model`_, "GlobalPhone", 77, 18, "Prosodylab [3]_" - "Hausa", `Hausa acoustic model`_, "GlobalPhone", 103, 8.7, "GlobalPhone" - "Japanese", "Not available yet", "GlobalPhone", 144, 34, "GlobalPhone" - "Korean", `Korean acoustic model`_, "GlobalPhone", 101, 20.8, "GlobalPhone" - "Mandarin", `Mandarin acoustic model`_, "GlobalPhone", 132, 31.2, "Pinyin phones [6]_" - "Polish", `Polish acoustic model`_, "GlobalPhone", 99, 24.6, "GlobalPhone" - "Portuguese", `Portuguese acoustic model`_, "GlobalPhone", 101, 26.3, "GlobalPhone" - "Russian", `Russian acoustic model`_, "GlobalPhone", 115, 26.5, "GlobalPhone" - "Spanish", `Spanish acoustic model`_, "GlobalPhone", 102, 22.1, "GlobalPhone" - "Swahili", `Swahili acoustic model`_, "GlobalPhone", 70, 11.1, "GlobalPhone" - "Swedish", `Swedish acoustic model`_, "GlobalPhone", 98, 21.7, "GlobalPhone" - "Tamil", "Not available yet", "GlobalPhone", "N/A", "N/A", "GlobalPhone" - "Thai", `Thai acoustic model`_, "GlobalPhone", 98, 28.2, "GlobalPhone" - "Turkish", `Turkish acoustic model`_, "GlobalPhone", 100, 17.1, "GlobalPhone" - "Ukrainian", 
`Ukrainian acoustic model`_, "GlobalPhone", 119, 14.1, "GlobalPhone" - "Vietnamese", `Vietnamese acoustic model`_, "GlobalPhone", 129, 19.7, "GlobalPhone" - "Wu", "Not available yet", "GlobalPhone", 41, 9.3, "GlobalPhone" - -.. _pretrained_g2p: - -Pretrained G2P models -===================== - - -Included with MFA is a separate tool to generate a dictionary from a preexisting model. This should be used if you're -aligning a dataset for which you have no pronunciation dictionary or the orthography is very transparent. We have pretrained -models for several languages below. - -Any of the following G2P models can be downloaded with the command :code:`mfa download g2p `. You -can get a full list of the currently available G2P models via :code:`mfa download g2p`. New models contributed -by users will be periodically added. If you would like to contribute your trained models, please contact Michael McAuliffe -at michael.e.mcauliffe@gmail.com. - -These models were generated using the `Pynini`_ package on the GlobalPhone dataset. The implementation is based on that in the -`Sigmorphon 2020 G2P task baseline`_. -This means that they will only work for transcriptions which use the same -alphabet. Current language options are listed below, with the following accuracies when trained on 90% of the data and -tested on 10%: - -.. 
csv-table:: - :header: "Language", "Link", "WER", "LER", "Orthography system", "Phone set" - - "Arabic", `Arabic G2P model`_, 28.45, 7.42, "Romanized [2]_", "GlobalPhone" - "Bulgarian", `Bulgarian G2P model`_, 3.08, 0.38, "Cyrillic alphabet", "GlobalPhone" - "Croatian", `Croatian G2P model`_, 9.47, 3.4, "Latin alphabet", "GlobalPhone" - "Czech", `Czech G2P model`_, 3.43, 0.71, "Latin alphabet", "GlobalPhone" - "English", `English G2P model`_, 28.45, 7.42, "Latin alphabet", "Arpabet" - "French", `French G2P model`_, 42.54, 6.98, "Latin alphabet", "GlobalPhone" - "French", `French (Lexique) G2P model`_, 5.31, 1.06, "Latin alphabet", "Lexique" - "French", `French (Prosodylab) G2P model`_ [1]_, 5.11, 0.95, "Latin alphabet", "Prosodylab" - "German", `German G2P model`_, 36.16, 7.84, "Latin alphabet", "GlobalPhone" - "German", `German (Prosodylab) G2P model`_ [3]_, 5.43, 0.65, "Latin alphabet", "Prosodylab" - "Hausa", `Hausa G2P model`_, 32.54, 7.19, "Latin alphabet", "GlobalPhone" - "Japanese", `Japanese G2P model`_, 17.45, 7.17, "Kanji and kana", "GlobalPhone" - "Korean", `Korean Hangul G2P model`_, 11.85, 1.38, "Hangul", "GlobalPhone" - "Korean", `Korean Jamo G2P model`_, 8.94, 0.95, "Jamo", "GlobalPhone" - "Mandarin", `Mandarin Pinyin G2P model`_, 0.27, 0.06, "Pinyin", "Pinyin phones" - "Mandarin", `Mandarin Character G2P model`_ [4]_, 23.81, 11.2, "Hanzi", "Pinyin phones [6]_" - "Polish", `Polish G2P model`_, 1.23, 0.33, "Latin alphabet", "GlobalPhone" - "Portuguese", `Portuguese G2P model`_, 10.67, 1.62, "Latin alphabet", "GlobalPhone" - "Russian", `Russian G2P model`_, 4.04, 0.65, "Cyrillic alphabet", "GlobalPhone" - "Spanish", `Spanish G2P model`_, 17.93, 3.02, "Latin alphabet", "GlobalPhone" - "Swahili", `Swahili G2P model`_, 0.09, 0.02, "Latin alphabet", "GlobalPhone" - "Swedish", `Swedish G2P model`_, 18.75, 3.14, "Latin alphabet", "GlobalPhone" - "Thai", `Thai G2P model`_, 27.62, 7.48, "Thai script", "GlobalPhone" - "Turkish", `Turkish G2P model`_, 8.51, 
2.32, "Latin alphabet", "GlobalPhone" - "Ukrainian", `Ukrainian G2P model`_, 2.1, 0.42, "Cyrillic alphabet", "GlobalPhone" - "Vietnamese", `Vietnamese G2P model`_, 14.91, 3.46, "Vietnamese alphabet", "GlobalPhone" - "Wu", `Wu G2P model`_ [5]_ , 31.19, 13.04, "Hanzi", "GlobalPhone" - - -.. [1] The `ProsodyLab French dictionary`_ is based on `Lexique`_ with substitutions for numbers and special characters. - Note that Lexique is known to currently not work with the aligner, see the `Github issue `_ - for more information and status. -.. [2] Please see the GlobalPhone documentation for how the romanization was done for Arabic. -.. [3] The German dictionary used in training is available in the `ProsodyLab dictionary repository`_. - See http://www.let.uu.nl/~Hugo.Quene/personal/phonchar.html for more information on the CELEX phone set for German - and how it maps to other phonesets. -.. [4] The Mandarin character dictionary that served as the training data for this model was built by mapping between - characters in ``.trl`` files and pinyin syllables in ``.rmn`` files in the GlobalPhone corpus. -.. [5] The Wu G2P model was trained a fairly small lexicon, so it likely does not have the coverage to be a robust model - for most purposes. Please check carefully any resulting dictionaries, as they are likely to have missing syllables from - from unknown symbols. -.. [6] The phoneset for Mandarin was created by GlobalPhone by splitting Pinyin into onset, nucleus (any vowel sequence), - and codas, and then associating the tone of the syllable onto the nucleus (i.e. "fang2" -> "f a2 ng" and "xiao4" -> - "x iao4" - -.. _dictionaries: - -Available pronunciation dictionaries -==================================== - -Any of the following pronunciation dictionaries can be downloaded with the command :code:`mfa download dictionary `. You -can get a full list of the currently available dictionaries via :code:`mfa download dictionary`. 
New dictionaries contributed -by users will be periodically added. If you would like to contribute your dictionaries, please contact Michael McAuliffe -at michael.e.mcauliffe@gmail.com. - -.. csv-table:: - :header: "Language", "Link", "Orthography system", "Phone set" - - "English", `English pronunciation dictionary`_ , "Latin", "Arpabet (stressed)" - "French", `French Prosodylab dictionary`_, "Latin", "Prosodylab French" - "German", `German Prosodylab dictionary`_, "Latin", "Prosodylab German" - "Brazilian Portuguese", `FalaBrasil dictionary`_, "Latin", "" - "Spanish", `TalnUPF Spanish IPA dictionary`_, "Latin", "IPA" - "Spanish", `TalnUPF Spanish gpA dictionary`_, "Latin", "gpA" - "Catalan", `TalnUPF Catalan IPA dictionary`_, "Latin", "IPA" - -.. _language_models: - -Available language models -========================= - -There are several places that contain pretrained language models that can be imported to MFA. - -.. csv-table:: - :header: "Source", "Language", "Link" - - "GlobalPhone", "Various languages", `GlobalPhone language models`_ - "LibriSpeech", "English", `LibriSpeech language models`_ - "FalaBrasil", "Brazilian Portuguese", `FalaBrasil language models`_ diff --git a/docs/source/reference/abc.rst b/docs/source/reference/abc.rst new file mode 100644 index 00000000..8a279bd3 --- /dev/null +++ b/docs/source/reference/abc.rst @@ -0,0 +1,13 @@ +.. automodule:: montreal_forced_aligner.abc + + .. 
autosummary:: + :toctree: generated/ + + MfaModel -- Base model type for MFA + MfaWorker -- Base worker class for MFA + AcousticModelWorker -- MFA workers that have acoustic models + Aligner -- Aligner type interface + Dictionary -- Dictionary type interface + IvectorExtractor -- Ivector extractor type interface + Trainer -- Trainer type interface + Transcriber -- Transcriber type interface diff --git a/docs/source/reference/aligner.rst b/docs/source/reference/aligner.rst new file mode 100644 index 00000000..0334b59f --- /dev/null +++ b/docs/source/reference/aligner.rst @@ -0,0 +1,9 @@ +.. automodule:: montreal_forced_aligner.aligner + + .. autosummary:: + :toctree: generated/ + + BaseAligner -- Base aligner + AdaptingAligner -- Adapting aligner + PretrainedAligner -- Pretrained aligner + TrainableAligner -- Trainable aligner diff --git a/docs/source/reference/base_index.rst b/docs/source/reference/base_index.rst new file mode 100644 index 00000000..1df4819d --- /dev/null +++ b/docs/source/reference/base_index.rst @@ -0,0 +1,9 @@ + +Base classes +============ + +.. toctree:: + + corpus + dictionary + models diff --git a/docs/source/reference/command_line.rst b/docs/source/reference/command_line.rst new file mode 100644 index 00000000..07ab83df --- /dev/null +++ b/docs/source/reference/command_line.rst @@ -0,0 +1,29 @@ +Command line functions +====================== + +.. automodule:: montreal_forced_aligner.command_line + + .. 
autosummary:: + :toctree: generated/ + + main + create_parser + validate_model_arg + run_transcribe_corpus + run_validate_corpus + run_train_lm + run_train_g2p + run_align_corpus + run_train_dictionary + run_anchor + run_adapt_model + run_train_acoustic_model + run_train_ivector_extractor + run_g2p + run_create_segments + run_classify_speakers + run_model + list_model + save_model + inspect_model + download_model diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst new file mode 100644 index 00000000..c5f1f9ad --- /dev/null +++ b/docs/source/reference/config.rst @@ -0,0 +1,17 @@ +.. automodule:: montreal_forced_aligner.config + + .. autosummary:: + :toctree: generated/ + + BaseConfig -- Base configuration + AlignConfig -- Alignment configuration + DictionaryConfig -- Dictionary configuration + CommandConfig -- Command configuration + FeatureConfig -- Feature configuration + SegmentationConfig -- Segmentation configuration + SpeakerClassificationConfig -- Speaker classification configuration + TrainingConfig -- Training configuration + TrainLMConfig -- Training language model configuration + TranscribeConfig -- Transcription configuration + TrainG2PConfig -- Train G2P model configuration + G2PConfig -- G2P configuration diff --git a/docs/source/reference/corpus.rst b/docs/source/reference/corpus.rst new file mode 100644 index 00000000..20fee2ff --- /dev/null +++ b/docs/source/reference/corpus.rst @@ -0,0 +1,10 @@ + +.. automodule:: montreal_forced_aligner.corpus + + .. 
autosummary:: + :toctree: generated/ + + Corpus -- Class for defining corpora in MFA + Speaker -- Class for collecting metadata about speakers in corpora + File -- Class for representing sound file/transcription file pairs in corpora + Utterance -- Class for collecting information about utterances diff --git a/docs/source/reference/data.rst b/docs/source/reference/data.rst new file mode 100644 index 00000000..1812000b --- /dev/null +++ b/docs/source/reference/data.rst @@ -0,0 +1,6 @@ +.. automodule:: montreal_forced_aligner.data + + .. autosummary:: + :toctree: generated/ + + CtmInterval -- Data class for representing intervals in Kaldi's CTM files diff --git a/docs/source/reference/dictionary.rst b/docs/source/reference/dictionary.rst new file mode 100644 index 00000000..f6f5e3e7 --- /dev/null +++ b/docs/source/reference/dictionary.rst @@ -0,0 +1,9 @@ + +.. automodule:: montreal_forced_aligner.dictionary + + .. autosummary:: + :toctree: generated/ + + PronunciationDictionary -- Pronunciation dictionary for Kaldi + MultispeakerDictionary -- Collection of pronunciation dictionaries that specify speaker-dictionary mappings + DictionaryData -- Data class generated by PronunciationDictionary to parse to and from Kaldi-internal strings diff --git a/docs/source/reference/exceptions.rst b/docs/source/reference/exceptions.rst new file mode 100644 index 00000000..2ee5192e --- /dev/null +++ b/docs/source/reference/exceptions.rst @@ -0,0 +1,34 @@ +.. automodule:: montreal_forced_aligner.exceptions + + .. 
autosummary:: + :toctree: generated/ + + MFAError + SoxError + G2PError + ConfigError + LMError + LanguageModelNotFoundError + ModelExtensionError + ThirdpartyError + TrainerError + ModelError + CorpusError + ModelLoadError + CorpusReadError + ArgumentError + AlignmentExportError + NoSuccessfulAlignments + KaldiProcessingError + TextParseError + TextGridParseError + DictionaryError + NoDefaultSpeakerDictionaryError + DictionaryPathError + DictionaryFileError + FileArgumentNotFoundError + PretrainedModelNotFoundError + MultipleModelTypesFoundError + ModelTypeNotSupportedError + PronunciationAcousticMismatchError + PronunciationOrthographyMismatchError diff --git a/docs/source/reference/g2p.rst b/docs/source/reference/g2p.rst new file mode 100644 index 00000000..ddaf274a --- /dev/null +++ b/docs/source/reference/g2p.rst @@ -0,0 +1,7 @@ +.. automodule:: montreal_forced_aligner.g2p + + .. autosummary:: + :toctree: generated/ + + PyniniTrainer -- Trainer for Pynini G2P model + PyniniDictionaryGenerator -- Generator for Pynini G2P model diff --git a/docs/source/reference/helper.rst b/docs/source/reference/helper.rst new file mode 100644 index 00000000..dd9269c6 --- /dev/null +++ b/docs/source/reference/helper.rst @@ -0,0 +1,15 @@ +.. automodule:: montreal_forced_aligner.helper + + .. autosummary:: + :toctree: generated/ + + TerminalPrinter + comma_join + make_safe + make_scp_safe + save_scp + load_scp + load_scp_safe + score + edit_distance + output_mapping diff --git a/docs/source/reference/helper_index.rst b/docs/source/reference/helper_index.rst new file mode 100644 index 00000000..52e42f30 --- /dev/null +++ b/docs/source/reference/helper_index.rst @@ -0,0 +1,14 @@ +Helper +====== + +.. 
toctree:: + + command_line + abc + config + data + exceptions + helper + multiprocessing/index + textgrid + utils diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst new file mode 100644 index 00000000..2364d7e7 --- /dev/null +++ b/docs/source/reference/index.rst @@ -0,0 +1,18 @@ +.. _mfa_api: + +MFA API +======= + +.. warning:: + + While the MFA API is fairly stable, I do tend to do refactors on a fairly regular basis. As 2.0 gets more stable, these are likely to get smaller and smaller, and I will try to keep the API docs as up-to-date as possible, so if something breaks in any scripts depending on MFA, please check back here. + +API definition +-------------- + +.. toctree:: + :maxdepth: 2 + + base_index + workers_index + helper_index diff --git a/docs/source/reference/lm.rst b/docs/source/reference/lm.rst new file mode 100644 index 00000000..49861751 --- /dev/null +++ b/docs/source/reference/lm.rst @@ -0,0 +1,6 @@ +.. automodule:: montreal_forced_aligner.lm + + .. autosummary:: + :toctree: generated/ + + LmTrainer -- Trainer for language model diff --git a/docs/source/reference/models.rst b/docs/source/reference/models.rst new file mode 100644 index 00000000..9aa5e22c --- /dev/null +++ b/docs/source/reference/models.rst @@ -0,0 +1,12 @@ + +.. automodule:: montreal_forced_aligner.models + + .. autosummary:: + :toctree: generated/ + + Archive + LanguageModel + AcousticModel + IvectorExtractorModel + DictionaryModel + G2PModel diff --git a/docs/source/reference/multiprocessing/alignment.rst b/docs/source/reference/multiprocessing/alignment.rst new file mode 100644 index 00000000..8be63d82 --- /dev/null +++ b/docs/source/reference/multiprocessing/alignment.rst @@ -0,0 +1,107 @@ +Alignment +========= + +Basic +----- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.alignment + +.. 
autosummary:: + :toctree: generated/ + + acc_stats + acc_stats_func + align + align_func + mono_align_equal + mono_align_equal_func + tree_stats + tree_stats_func + compile_train_graphs + compile_train_graphs_func + convert_alignments + convert_alignments_func + +LDA training +------------ + +.. currentmodule:: montreal_forced_aligner.multiprocessing.alignment + +.. autosummary:: + :toctree: generated/ + + calc_lda_mllt + calc_lda_mllt_func + lda_acc_stats + lda_acc_stats_func + +Speaker adapted models +---------------------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.alignment + +.. autosummary:: + :toctree: generated/ + + calc_fmllr + calc_fmllr_func + create_align_model + acc_stats_two_feats_func + +Acoustic model adaptation +------------------------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.alignment + +.. autosummary:: + :toctree: generated/ + + train_map + map_acc_stats_func + + +TextGrid Export +--------------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.alignment + +.. autosummary:: + :toctree: generated/ + + ctms_to_textgrids_mp + convert_ali_to_textgrids + ali_to_ctm_func + PhoneCtmProcessWorker + CleanupWordCtmProcessWorker + NoCleanupWordCtmProcessWorker + CombineProcessWorker + ExportPreparationProcessWorker + ExportTextGridProcessWorker + +Pronunciation probabilities +--------------------------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.pronunciations + +.. autosummary:: + :toctree: generated/ + + generate_pronunciations + generate_pronunciations_func + +Validation +---------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.alignment + +.. 
autosummary:: + :toctree: generated/ + + compile_information + compile_information_func + compute_alignment_improvement + compute_alignment_improvement_func + compare_alignments + parse_iteration_alignments + compile_utterance_train_graphs_func + test_utterances_func diff --git a/docs/source/reference/multiprocessing/corpus.rst b/docs/source/reference/multiprocessing/corpus.rst new file mode 100644 index 00000000..568567b4 --- /dev/null +++ b/docs/source/reference/multiprocessing/corpus.rst @@ -0,0 +1,23 @@ +Corpora +======= + +.. automodule:: montreal_forced_aligner.multiprocessing.corpus + + .. autosummary:: + :toctree: generated/ + + CorpusProcessWorker + +Features +-------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.features + +.. autosummary:: + :toctree: generated/ + + mfcc + mfcc_func + calc_cmvn + compute_vad + compute_vad_func diff --git a/docs/source/reference/multiprocessing/helper.rst b/docs/source/reference/multiprocessing/helper.rst new file mode 100644 index 00000000..24ba9ce0 --- /dev/null +++ b/docs/source/reference/multiprocessing/helper.rst @@ -0,0 +1,65 @@ +Helper +====== + +Functions +--------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.helper + +.. autosummary:: + :toctree: generated/ + + Counter + Stopped + ProcessWorker + run_mp + run_non_mp + +Classes +------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.classes + +.. 
autosummary:: + :toctree: generated/ + + Job + AlignArguments + VadArguments + SegmentVadArguments + CreateHclgArguments + AccGlobalStatsArguments + AccStatsArguments + AccIvectorStatsArguments + AccStatsTwoFeatsArguments + AliToCtmArguments + MfccArguments + ScoreArguments + DecodeArguments + PhoneCtmArguments + CombineCtmArguments + CleanupWordCtmArguments + NoCleanupWordCtmArguments + LmRescoreArguments + AlignmentImprovementArguments + ConvertAlignmentsArguments + CalcFmllrArguments + CalcLdaMlltArguments + GmmGselectArguments + FinalFmllrArguments + LatGenFmllrArguments + FmllrRescoreArguments + TreeStatsArguments + LdaAccStatsArguments + MapAccStatsArguments + GaussToPostArguments + InitialFmllrArguments + ExtractIvectorsArguments + ExportTextGridArguments + CompileTrainGraphsArguments + CompileInformationArguments + CompileUtteranceTrainGraphsArguments + MonoAlignEqualArguments + TestUtterancesArguments + CarpaLmRescoreArguments + GeneratePronunciationsArguments diff --git a/docs/source/reference/multiprocessing/index.rst b/docs/source/reference/multiprocessing/index.rst new file mode 100644 index 00000000..f24e7fd8 --- /dev/null +++ b/docs/source/reference/multiprocessing/index.rst @@ -0,0 +1,10 @@ +Multiprocessing helper functions +================================ + +.. toctree:: + + corpus + alignment + ivector + transcription + helper diff --git a/docs/source/reference/multiprocessing/ivector.rst b/docs/source/reference/multiprocessing/ivector.rst new file mode 100644 index 00000000..c048750a --- /dev/null +++ b/docs/source/reference/multiprocessing/ivector.rst @@ -0,0 +1,35 @@ +Ivector +======= + +.. automodule:: montreal_forced_aligner.multiprocessing.ivector + + .. 
autosummary:: + :toctree: generated/ + + gmm_gselect + gmm_gselect_func + gauss_to_post + gauss_to_post_func + acc_global_stats + acc_global_stats_func + acc_ivector_stats + acc_ivector_stats_func + extract_ivectors + extract_ivectors_func + segment_vad + segment_vad_func + get_initial_segmentation + merge_segments + +File segmentation +----------------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.ivector + +.. autosummary:: + :toctree: generated/ + + segment_vad + segment_vad_func + get_initial_segmentation + merge_segments diff --git a/docs/source/reference/multiprocessing/transcription.rst b/docs/source/reference/multiprocessing/transcription.rst new file mode 100644 index 00000000..5a9275ba --- /dev/null +++ b/docs/source/reference/multiprocessing/transcription.rst @@ -0,0 +1,47 @@ +Transcription +============= + +Decoding graph +-------------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.transcription + +.. autosummary:: + :toctree: generated/ + + create_hclgs + create_hclg_func + compose_hclg + compose_clg + compose_lg + compose_g + compose_g_carpa + +Speaker-independent transcription +--------------------------------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.transcription + +.. autosummary:: + :toctree: generated/ + + transcribe + decode_func + lm_rescore_func + carpa_lm_rescore_func + score_transcriptions + score_func + +Speaker-adapted transcription +----------------------------- + +.. currentmodule:: montreal_forced_aligner.multiprocessing.transcription + +.. autosummary:: + :toctree: generated/ + + transcribe_fmllr + initial_fmllr_func + lat_gen_fmllr_func + fmllr_rescore_func + final_fmllr_est_func diff --git a/docs/source/reference/segmenter.rst b/docs/source/reference/segmenter.rst new file mode 100644 index 00000000..be2a29a6 --- /dev/null +++ b/docs/source/reference/segmenter.rst @@ -0,0 +1,6 @@ +.. automodule:: montreal_forced_aligner.segmenter + + .. 
autosummary:: + :toctree: generated/ + + Segmenter diff --git a/docs/source/reference/speaker_classifier.rst b/docs/source/reference/speaker_classifier.rst new file mode 100644 index 00000000..34a81d0e --- /dev/null +++ b/docs/source/reference/speaker_classifier.rst @@ -0,0 +1,6 @@ +.. automodule:: montreal_forced_aligner.speaker_classifier + + .. autosummary:: + :toctree: generated/ + + SpeakerClassifier diff --git a/docs/source/reference/textgrid.rst b/docs/source/reference/textgrid.rst new file mode 100644 index 00000000..a924c0e6 --- /dev/null +++ b/docs/source/reference/textgrid.rst @@ -0,0 +1,14 @@ +.. automodule:: montreal_forced_aligner.textgrid + + .. autosummary:: + :toctree: generated/ + + process_ctm_line + parse_from_word + parse_from_word_no_cleanup + parse_from_phone + generate_tiers + export_textgrid + ctm_to_textgrid + output_textgrid_writing_errors + ctms_to_textgrids_non_mp diff --git a/docs/source/reference/trainers.rst b/docs/source/reference/trainers.rst new file mode 100644 index 00000000..bce0e269 --- /dev/null +++ b/docs/source/reference/trainers.rst @@ -0,0 +1,12 @@ + +.. automodule:: montreal_forced_aligner.trainers + + .. autosummary:: + :toctree: generated/ + + BaseTrainer -- Base trainer + MonophoneTrainer -- Monophone trainer + TriphoneTrainer -- Triphone trainer + LdaTrainer -- LDA trainer + SatTrainer -- Speaker adapted trainer + IvectorExtractorTrainer -- Trainer for IvectorExtractor diff --git a/docs/source/reference/transcriber.rst b/docs/source/reference/transcriber.rst new file mode 100644 index 00000000..5b6d32ba --- /dev/null +++ b/docs/source/reference/transcriber.rst @@ -0,0 +1,6 @@ +.. automodule:: montreal_forced_aligner.transcriber + + .. autosummary:: + :toctree: generated/ + + Transcriber diff --git a/docs/source/reference/utils.rst b/docs/source/reference/utils.rst new file mode 100644 index 00000000..6b100776 --- /dev/null +++ b/docs/source/reference/utils.rst @@ -0,0 +1,24 @@ +.. 
automodule:: montreal_forced_aligner.utils + + .. autosummary:: + :toctree: generated/ + + thirdparty_binary + get_available_dictionaries + log_config + log_kaldi_errors + get_available_models + get_available_language_models + get_available_acoustic_models + get_available_g2p_models + get_pretrained_language_model_path + get_pretrained_g2p_path + get_pretrained_ivector_path + get_pretrained_path + get_pretrained_acoustic_path + get_dictionary_path + get_available_ivector_extractors + guess_model_type + parse_logs + setup_logger + CustomFormatter diff --git a/docs/source/reference/validator.rst b/docs/source/reference/validator.rst new file mode 100644 index 00000000..c0e951cd --- /dev/null +++ b/docs/source/reference/validator.rst @@ -0,0 +1,6 @@ +.. automodule:: montreal_forced_aligner.validator + + .. autosummary:: + :toctree: generated/ + + CorpusValidator diff --git a/docs/source/reference/workers_index.rst b/docs/source/reference/workers_index.rst new file mode 100644 index 00000000..f0cf27a5 --- /dev/null +++ b/docs/source/reference/workers_index.rst @@ -0,0 +1,13 @@ +MFA workers +=========== + +.. toctree:: + + aligner + g2p + lm + segmenter + speaker_classifier + trainers + transcriber + validator diff --git a/docs/source/sound_files.rst b/docs/source/sound_files.rst deleted file mode 100644 index 2b6a3f1b..00000000 --- a/docs/source/sound_files.rst +++ /dev/null @@ -1,48 +0,0 @@ - -.. _sound_files: - -*********** -Sound files -*********** - -The default format for sound files in Kaldi is ``.wav``. However, if you have :code:`sox` available on your machine, -MFA will use it to convert ``.flac``, ``.ogg`` and ``.aiff`` files to WAV for Kaldi to process. - -.. note:: - - Sound files will be ignored if there is no ``.lab`` or ``.TextGrid`` with the same name as the sound file. The validation - utility (:ref:`validating_data`) will print a warning message when this happens and log all such files. - -.. 
note:: - - ``.mp3`` files are supported if ``sox`` can parse them. On Ubuntu this is available via ``sudo apt-get install libsox-fmt-mp3``. - You can verify whether sox can read them via the ``soxi yoursoundfile.mp3``. - -Sampling rate -============= - -Feature generation for MFA uses a consistent frequency range (20-7800 Hz). Files that are higher or lower sampling rate -than 16 kHz will be up- or down-sampled by default to 16 kHz during the feature generation procedure, which may produce artifacts for -upsampled files. You can modify this sample rate as part of configuring features (see :ref:`feature_config` for more details). - -.. note:: - - The validation utility (:ref:`validating_data`) will note any ignored files, and the list of such files will be available in - a log file. - -Bit depth -========= - -Kaldi can only process 16-bit WAV files. Higher bit depths (24 and 32 bit) are getting more common for recording, so -MFA will automatically convert higher bit depths if you have :code:`sox` available on your machine. - -Duration -======== - -In general, audio segments (sound files for Prosodylab-aligner format or intervals -for the TextGrid format) should be less than 30 seconds for best performance -(the shorter the faster). We recommend using breaks like breaths -or silent pauses (i.e., not associated with a stop closure) to separate the audio segments. For longer segments, -setting the beam and retry beam higher than their defaults will allow them to be aligned. The default beam/retry beam is very -conservative 10/40, so something like 400/1000 will allow for much longer sequences to be aligned. See :ref:`configuration` -for more details. diff --git a/docs/source/train_ivector.rst b/docs/source/train_ivector.rst deleted file mode 100644 index f68f4485..00000000 --- a/docs/source/train_ivector.rst +++ /dev/null @@ -1,65 +0,0 @@ -.. 
_train_ivector: - -***************************** -Training an ivector extractor -***************************** - -The Montreal Forced Aligner can train ivector extractors using an acoustic model for generating alignments. As part -of this training process, a classifier is built in that can be used as part of :ref:`classify_speakers`. - -Steps to train ivector extractor: - -1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that - MFA was installed in. -2. Run the following command, substituting the arguments with your own paths: - - .. code-block:: bash - - mfa train_ivector corpus_directory dictionary_path acoustic_model_path output_model_path - - -Options available: - -.. option:: -h - --help - - Display help message for the command - -.. option:: --config_path PATH - - Path to a YAML config file that will specify the training configuration. See - :ref:`configuration_ivector` for more details. - -.. option:: -s NUMBER - --speaker_characters NUMBER - - Number of characters to use to identify speakers; if not specified, - the aligner assumes that the directory name is the identifier for the - speaker. Additionally, it accepts the value ``prosodylab`` to use the second field of a ``_`` delimited file name, - following the convention of labelling production data in the ProsodyLab at McGill. - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for aligning, default is ``~/Documents/MFA`` - -.. option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to process faster - -.. option:: -v - --verbose - - The aligner will print out more information if present - -.. option:: -d - --debug - - The aligner will run in debug mode - -.. 
option:: -c - --clean - - Forces removal of temporary files in ``~/Documents/MFA`` diff --git a/docs/source/training_dictionary.rst b/docs/source/training_dictionary.rst deleted file mode 100644 index 930a3d70..00000000 --- a/docs/source/training_dictionary.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. _training_dictionary: - -************************************ -Modeling pronunciation probabilities -************************************ - -MFA includes a utility command for training pronunciation probabilities of a dictionary given a corpus for alignment. - -The resulting dictionary can then be used as a dictionary for alignment or transcription. - -Steps to train: - -1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that - MFA was installed in. - -2. Run the following command, substituting the arguments with your own paths: - - .. code-block:: bash - - mfa train_dictionary corpus_directory dictionary_path acoustic_model_path output_dictionary_path - -.. note:: - - ``acoustic_model_path`` can also be a language that has been pretrained by MFA developers. For instance, to use - the pretrained English model, first download it via :code:`mfa download acoustic english`. A list of available - acoustic models will be provided if you run :code:`mfa download acoustic`. See :ref:`pretrained_models` for more details. - -Options available: - -.. option:: -h - --help - - Display help message for the command - -.. option:: --config_path PATH - - Path to a YAML config file that will specify either the alignment options or the training configuration. See - :ref:`configuration` for more details. - -.. option:: -s NUMBER - --speaker_characters NUMBER - - Number of characters to use to identify speakers; if not specified, - the aligner assumes that the directory name is the identifier for the - speaker. 
Additionally, it accepts the value ``prosodylab`` to use the second field of a ``_`` delimited file name, - following the convention of labelling production data in the ProsodyLab at McGill. - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for aligning, default is ``~/Documents/MFA`` - -.. option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to align faster - -.. option:: -v - --verbose - - The aligner will print out more information if present - -.. option:: -d - --debug - - The aligner will run in debug mode - -.. option:: -c - --clean - - Forces removal of temporary files in ``~/Documents/MFA`` - prior to aligning. This is good to use when aligning a new dataset, - but it shares a name with a previously aligned dataset. Cleaning automatically happens if the previous alignment - run had an error. diff --git a/docs/source/training_lm.rst b/docs/source/training_lm.rst deleted file mode 100644 index 550684a3..00000000 --- a/docs/source/training_lm.rst +++ /dev/null @@ -1,44 +0,0 @@ -.. _training_lm: - -************************ -Training language models -************************ - -MFA has a utility function for training ARPA-format ngram language models, as well as merging with a pre-existing model. - -Steps to train: - -1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that - MFA was installed in. - -2. Run the following command, substituting the arguments with your own paths: - - .. code-block:: bash - - mfa train_lm corpus_directory output_model_path - - -Options available: - -.. option:: -h - --help - - Display help message for the command - -.. option:: --config_path PATH - - Path to a YAML config file for training the language model. see - :ref:`train_lm_config` for more details. - -.. 
option:: --model_path PATH - - Path to an existing language model to merge with the training data. - -.. option:: --model_weight WEIGHT - - Specify the weight of the supplemental model when merging with the model from the training data. - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for aligning, default is ``~/Documents/MFA`` diff --git a/docs/source/transcribing.rst b/docs/source/transcribing.rst deleted file mode 100644 index f042b37f..00000000 --- a/docs/source/transcribing.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. _transcribing: - -*********************** -Running the transcriber -*********************** - -Steps to transcribe: - -1. Provided the steps in :ref:`installation` have been completed and you are in the same Conda/virtual environment that - MFA was installed in. - -2. Run the following command, substituting the arguments with your own paths: - - .. code-block:: bash - - mfa transcribe corpus_directory dictionary_path acoustic_model_path language_model_path output_directory - - -.. note:: - - ``acoustic_model_path`` can also be a language that has been pretrained by MFA developers. For instance, to use - the pretrained English model, first download it via :code:`mfa download acoustic english`. A list of available - acoustic models will be provided if you run :code:`mfa download acoustic`. See :ref:`pretrained_models` for more details. - -.. note:: - - ``language_model_path`` should specify an ARPA-format ngram language model. Note that the text should be all lower case - in the arpa file, as MFA uses all lower case. - -Options available: - -.. option:: -h - --help - - Display help message for the command - -.. option:: --config_path PATH - - Path to a YAML config file that will specify the transcription configuration. See - :ref:`transcribe_config` for more details. - -.. 
option:: -s NUMBER - --speaker_characters NUMBER - - Number of characters to use to identify speakers; if not specified, - the aligner assumes that the directory name is the identifier for the - speaker. Additionally, it accepts the value ``prosodylab`` to use the second field of a ``_`` delimited file name, - following the convention of labelling production data in the ProsodyLab at McGill. - -.. option:: -t DIRECTORY - --temp_directory DIRECTORY - - Temporary directory root to use for aligning, default is ``~/Documents/MFA`` - -.. option:: -j NUMBER - --num_jobs NUMBER - - Number of jobs to use; defaults to 3, set higher if you have more - processors available and would like to align faster - -.. option:: -v - --verbose - - The aligner will print out more information if present - -.. option:: -d - --debug - - The aligner will run in debug mode - -.. option:: -c - --clean - - Forces removal of temporary files in ``~/Documents/MFA`` - prior to aligning. This is good to use when aligning a new dataset, - but it shares a name with a previously aligned dataset. Cleaning automatically happens if the previous alignment - run had an error. diff --git a/docs/source/commands.rst b/docs/source/user_guide/commands.rst similarity index 88% rename from docs/source/commands.rst rename to docs/source/user_guide/commands.rst index 547ab7f0..1ec3c38e 100644 --- a/docs/source/commands.rst +++ b/docs/source/user_guide/commands.rst @@ -2,35 +2,32 @@ .. _commands: -******** -Commands -******** +************ +All commands +************ The ``mfa`` command line utility has several subcommands, which are listed below grouped by general domain. -Forced Alignment -================ +Preparation +=========== .. 
csv-table:: :header: "Command", "Description", "Link" :widths: 10, 110, 40 - "align", "Perform forced alignment with a pretrained model", :ref:`pretrained_alignment` - "train", "Train an acoustic model and export resulting alignment", :ref:`trained_alignment` - "adapt", "Adapt a pretrained acoustic model on a new dataset", :ref:`adapting_model` - "validate", "Validate a corpus to ensure there are no issues with the data format", :ref:`validating_data` - "train_dictionary", "Estimate pronunciation probabilities from aligning a corpus", :ref:`training_dictionary` + "validate", "Validate a corpus", :ref:`validating_data` - -Transcription -============= +Forced Alignment +================ .. csv-table:: :header: "Command", "Description", "Link" :widths: 10, 110, 40 - "transcribe", "Generate transcriptions using an acoustic model, dictionary, and language model", :ref:`transcribing` - "train_lm", "Train a language model from a text corpus or from an existing language model", :ref:`training_lm` + "align", "Perform forced alignment with a pretrained model", :ref:`pretrained_alignment` + "train", "Train an acoustic model and export resulting alignment", :ref:`train_acoustic_model` + "adapt", "Adapt a pretrained acoustic model on a new dataset", :ref:`adapt_acoustic_model` + "train_dictionary", "Estimate pronunciation probabilities from aligning a corpus", :ref:`training_dictionary` Corpus creation =============== @@ -42,8 +39,9 @@ Corpus creation "create_segments", "Use voice activity detection to create segments", :ref:`create_segments` "train_ivector", "Train an ivector extractor for speaker classification", :ref:`train_ivector` "classify_speakers", "Use ivector extractor to classify files or cluster them", :ref:`classify_speakers` - "anchor", "Run the Anchor annotator utility (if installed) for editing and managing corpora", :ref:`annotator` - + "transcribe", "Generate transcriptions using an acoustic model, dictionary, and language model", :ref:`transcribing` + 
"train_lm", "Train a language model from a text corpus or from an existing language model", :ref:`training_lm` + "anchor", "Run the Anchor annotator utility (if installed) for editing and managing corpora", :ref:`anchor` Other utilities =============== diff --git a/docs/source/configuration_align.rst b/docs/source/user_guide/configuration/align.rst similarity index 100% rename from docs/source/configuration_align.rst rename to docs/source/user_guide/configuration/align.rst diff --git a/docs/source/user_guide/configuration/dictionary.rst b/docs/source/user_guide/configuration/dictionary.rst new file mode 100644 index 00000000..25f7d836 --- /dev/null +++ b/docs/source/user_guide/configuration/dictionary.rst @@ -0,0 +1,32 @@ + +.. _configuration_dictionary: + +************************ +Dictionary Configuration +************************ + +Text normalization and parsing of words from text can be configured in yaml configuration files. Punctuation is stripped from all words, so if a character is part of a language's orthography, modifying the :code:`punctuation` parameter to exclude that character would keep that character in the words. See more examples of how these :code:`punctuation`, :code:`clitic_markers`, and :code:`compound_markers` are used in :ref:`text_normalization`. + +The :code:`multilingual_ipa`, :code:`strip_diacritics`, and :code:`digraphs` are all used as part of :ref:`multilingual_ipa`. + +.. 
csv-table:: + :header: "Parameter", "Default value", "Notes" + :escape: ' + + "oov_word", "", "Internal word symbol to use for out of vocabulary items" + "oov_phone", "spn", "Internal phone symbol to use for out of vocabulary items" + "silence_word", "!sil", "Internal word symbol to use initial silence" + "nonoptional_silence_phone", "sil", "Internal phone symbol to use initial silence" + "optional_silence_phone", "sp", "Internal phone symbol to use optional silence in the middle of utterances" + "position_dependent_phones", "True", "Flag for whether phones should mark their position in the word as part of the phone symbol internally" + "num_silence_states", "5", "Number of states to use for silence phones" + "num_non_silence_states", "3", "Number of states to use for non-silence phones" + "shared_silence_phones", "True", "Flag for whether to share silence phone models" + "silence_probability", "0.5", "Probability of inserting silence around and within utterances, setting to 0 removes silence modelling" + "punctuation", "、。।,@<>'"'(),.:;¿?¡!\\&%#*~【】,…‥「」『』〝〟″⟨⟩♪・‹›«»~′$+=", "Characters to treat as punctuation and strip from around words" + "clitic_markers", "'''’", "Characters to treat as clitic markers, will be collapsed to the first character in the string" + "compound_markers", "\-", "Characters to treat as marker in compound words (i.e., doesn't need to be preserved like for clitics)" + "multilingual_ipa", False, "Flag for enabling multilingual IPA mode, see :ref:`multilingual_ipa` for more details" + "strip_diacritics", "/iː/ /iˑ/ /ĭ/ /i̯/ /t͡s/ /t‿s/ /t͜s/ /n̩/", "IPA diacritics to strip in multilingual IPA mode (phone symbols for proper display, when specifying them just have the diacritic)" + "digraphs", "[dt][szʒʃʐʑʂɕç], [aoɔe][ʊɪ]", "Digraphs to split up in multilingual IPA mode" + "brackets", "('[', ']'), ('{', '}'), ('<', '>'), ('(', ')')", "Punctuation to keep as bracketing a whole word, i.e., a restart, disfluency, etc" diff --git 
a/docs/source/configuration_g2p.rst b/docs/source/user_guide/configuration/g2p.rst similarity index 100% rename from docs/source/configuration_g2p.rst rename to docs/source/user_guide/configuration/g2p.rst diff --git a/docs/source/user_guide/configuration/index.rst b/docs/source/user_guide/configuration/index.rst new file mode 100644 index 00000000..445d9bde --- /dev/null +++ b/docs/source/user_guide/configuration/index.rst @@ -0,0 +1,29 @@ + +.. _configuration: + +************* +Configuration +************* + +Global configuration for MFA can be updated via the ``mfa configure`` subcommand. Once the command is called with a flag, it will set a default value for any future runs (though, you can overwrite most settings when you call other commands). + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: configure + +Configuring specific commands +============================= + +.. toctree:: + :maxdepth: 1 + + dictionary.rst + align.rst + transcription.rst + lm.rst + segment.rst + ivector.rst + g2p.rst diff --git a/docs/source/configuration_ivector.rst b/docs/source/user_guide/configuration/ivector.rst similarity index 89% rename from docs/source/configuration_ivector.rst rename to docs/source/user_guide/configuration/ivector.rst index 9865f7f8..4bec1ed7 100644 --- a/docs/source/configuration_ivector.rst +++ b/docs/source/user_guide/configuration/ivector.rst @@ -5,9 +5,7 @@ Ivector Configuration ********************* -For the Kaldi recipe that ivector extractor training is based on, see -https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/online/nnet2/train_diag_ubm.sh and -https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/online/nnet2/train_ivector_extractor.sh +For the Kaldi recipe that ivector extractor training is based on, see :kaldi_steps_sid:`train_diag_ubm` and :kaldi_steps_sid:`train_ivector_extractor`. .. 
csv-table:: :header: "Parameter", "Default value", "Notes" diff --git a/docs/source/configuration_lm.rst b/docs/source/user_guide/configuration/lm.rst similarity index 100% rename from docs/source/configuration_lm.rst rename to docs/source/user_guide/configuration/lm.rst diff --git a/docs/source/configuration_segment.rst b/docs/source/user_guide/configuration/segment.rst similarity index 100% rename from docs/source/configuration_segment.rst rename to docs/source/user_guide/configuration/segment.rst diff --git a/docs/source/configuration_transcription.rst b/docs/source/user_guide/configuration/transcription.rst similarity index 100% rename from docs/source/configuration_transcription.rst rename to docs/source/user_guide/configuration/transcription.rst diff --git a/docs/source/user_guide/data_validation.rst b/docs/source/user_guide/data_validation.rst new file mode 100644 index 00000000..5d8abb3c --- /dev/null +++ b/docs/source/user_guide/data_validation.rst @@ -0,0 +1,38 @@ + +.. _validating_data: + +*************** +Validating data +*************** + +The validation utility will perform the basic set up that alignment would perform, but analyzes and reports any issues +that the user may want to fix. + +First, the utility parses the corpus and dictionary, prints out summary information about the corpus, +and logs any of the following issues: + +- If there are any words in transcriptions that are not in the dictionary, these are logged as out-of-vocabulary items (OOVs). + A list of these OOVs and which utterances they appear in are saved to text files. 
+- Any issues reading sound files +- Any issues generating features, skipped if ``--ignore_acoustics`` is flagged +- Any transcription files missing .wav files +- Any .wav files missing transcription files +- Any issues reading transcription files +- Any unsupported sampling rates of .wav files +- Any unaligned files from a basic monophone acoustic model trained on the dataset (or using a supplied acoustic model), + skipped if ``--ignore_acoustics`` is flagged +- Any files that have deviations from their original transcription to decoded transcriptions using a simple language model + + +.. _running_the_validator: + +Running the validation utility +============================== + + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: validate diff --git a/docs/source/user_guide/formats/corpus_structure.rst b/docs/source/user_guide/formats/corpus_structure.rst new file mode 100644 index 00000000..41ad926e --- /dev/null +++ b/docs/source/user_guide/formats/corpus_structure.rst @@ -0,0 +1,144 @@ + + +.. _corpus_structure: + +**************** +Corpus structure +**************** + +Prior to running the aligner, make sure the following are set up: + +1. A pronunciation dictionary for your language should specify the pronunciations of orthographic transcriptions. + +2. The sound files to align. + +3. Orthographic annotations in .lab files for individual sound files (:ref:`prosodylab_format`) + or in TextGrid intervals for longer sound files (:ref:`textgrid_format`). + +The sound files and the orthographic annotations should be contained in one directory structured as follows:: + + +-- textgrid_corpus_directory + | --- recording1.wav + | --- recording1.TextGrid + | --- recording2.wav + | --- recording2.TextGrid + | --- ... 
+ + +-- prosodylab_corpus_directory + | +-- speaker1 + | --- recording1.wav + | --- recording1.lab + | --- recording2.wav + | --- recording2.lab + | +-- speaker2 + | --- recording3.wav + | --- recording3.lab + | --- ... + + + +.. note:: + + A collection of preprocessing scripts to get various corpora of other formats is available in the :xref:`mfa_reorg_scripts`. + +Transcription file formats +========================== + +In addition to the sections below about file format, see :ref:`text_normalization` for details on how the transcription text is normalized for dictionary look up, and :ref:`configuration_dictionary` for how this normalization can be customized. + +.. _prosodylab_format: + +Prosodylab-aligner format +------------------------- + +Every audio file you are aligning must have a corresponding .lab +file containing the text transcription of that audio file. The audio and +transcription files must have the same name. For example, if you have ``givrep_1027_2_1.wav``, +its transcription should be in ``givrep_1027_2_1.lab`` (which is just a +text file with the .lab extension). + +.. note:: If you have transcriptions in a + tab-separated text file (or an Excel file, which can be saved as one), + you can generate .lab files from it using the relabel function of relabel_clean.py. + The relabel_clean.py script is currently in the prosodylab.alignertools repository on GitHub. + +If no ``.lab`` file is found, then the aligner will look for any matching ``.txt`` files and use those. + +In terms of directory structure, the default configuration assumes that +files are separated into subdirectories based on their speaker (with one +speaker per file). + +An alternative way to specify which speaker says which +segment is to use the ``-s`` flag with some number of characters of the file name as the speaker identifier. + +The output from aligning this format of data will be TextGrids that have a tier +for words and a tier for phones. + +.. 
_textgrid_format: + +TextGrid format +--------------- + +The other main format that is supported is long sound files accompanied +by TextGrids that specify orthographic transcriptions for short intervals +of speech. + + + .. figure:: ../../_static/librispeech_textgrid.png + :align: center + :alt: Input TextGrid in Praat with intervals for each utterance and a single tier for a speaker + +If the ``-s`` flag is specified, the tier names will not be used as speaker names, and instead the first X characters +specified by the flag will be used as the speaker name. + +By default, each tier corresponds to a speaker (speaker "237" in the above example), so it is possible to +align speech for multiple speakers per sound file using this format. + + + .. figure:: ../../_static/multiple_speakers_textgrid.png + :align: center + :alt: Input TextGrid in Praat with intervals for each utterance and tiers for each speaker + +Stereo files are supported as well, where it assumes that if there are +multiple talkers, the first half of speaker tiers are associated with the first +channel, and the second half of speaker tiers are associated with the second channel. + +The output from aligning will be a TextGrid with word and phone tiers for +each speaker. + + .. figure:: ../../_static/multiple_speakers_output_textgrid.png + :align: center + :alt: TextGrid in Praat following alignment with interval tiers for each speaker's words and phones + +.. note:: + + Intervals in the TextGrid less than 100 milliseconds will not be aligned. + +Sound files +----------- + +The default format for sound files in Kaldi is ``.wav``. However, if MFA is installed via conda, you should have :code:`sox` available which will pipe sound files of various formats to Kaldi in wav format. Running :code:`sox` by itself will print a list of formats that it supports.
Of interest to speech researchers, the version on conda-forge supports non-standard :code:`wav` formats, :code:`aiff`, :code:`flac`, :code:`ogg`, and :code:`vorbis`. + +.. note:: + + ``.mp3`` files are supported on Linux and Mac only at the moment. You can verify whether sox can read them via ``soxi yoursoundfile.mp3``. + + :code:`opus` files are not currently supported, but I would like to get them working soon. + +Sampling rate +============= + +Feature generation for MFA uses a consistent frequency range (20-7800 Hz). Files that have a higher or lower sampling rate +than 16 kHz will be up- or down-sampled by default to 16 kHz during the feature generation procedure, which may produce artifacts for +upsampled files. You can modify this default sample rate as part of configuring features (see :ref:`feature_config` for more details). + +Bit depth +========= + +Kaldi can only process 16-bit WAV files. Higher bit depths (24 and 32 bit) are getting more common for recording, so +MFA will automatically convert higher bit depths via the :code:`sox` conda package. + +Duration +======== + +In general, audio segments (sound files for Prosodylab-aligner format or intervals for the TextGrid format) should be less than 30 seconds for best performance (the shorter the faster). We recommend using breaks like breaths or silent pauses (i.e., not associated with a stop closure) to separate the audio segments. For longer segments, setting the beam and retry beam higher than their defaults will allow them to be aligned. The default beam/retry beam is a very conservative 10/40, so something like 400/1000 will allow for much longer sequences to be aligned. Though also note that the higher the beam value, the slower alignment will be as well. See :ref:`configuration_alignment` for more details.
diff --git a/docs/source/dictionary.rst b/docs/source/user_guide/formats/dictionary.rst similarity index 64% rename from docs/source/dictionary.rst rename to docs/source/user_guide/formats/dictionary.rst index 87f6f7c7..b7648724 100644 --- a/docs/source/dictionary.rst +++ b/docs/source/user_guide/formats/dictionary.rst @@ -7,12 +7,92 @@ .. _`Prosodylab-aligner French dictionary`: https://github.com/prosodylab/prosodylab-alignermodels/blob/master/FrenchQuEu/fr-QuEu.dict -.. _dictionary: +.. _dictionary_format: ***************** Dictionary format ***************** +.. _text_normalization: + +Text normalization and dictionary lookup +======================================== + +If a word is not found in the dictionary, and has no orthographic +markers for morpheme boundaries (apostrophes or hyphens), then it will +be replaced in the output with '' for unknown word. + +.. note:: + + The list of all unknown words (out of vocabulary words; OOV words) will + be output to a file named ``oovs_found.txt`` + in the output directory, if you would like to add them to the dictionary + you are using. To help find any typos in transcriptions, a file named + ``utterance_oovs.txt`` will be put in the output directory and will list + the unknown words per utterance. + +As part of parsing orthographic transcriptions, punctuation is stripped +from the ends and beginnings of words, except for the :code:`brackets` specified in :ref:`configuration_dictionary`. In addition, all words are converted to lowercase so that dictionary lookup is not case-sensitive. + +.. note:: + + The definition of punctuation, clitic markers, and compound markers can be set in a config file, see :ref:`configuration_dictionary` for more details. + +Dictionary lookup will attempt to generate the most maximal coverage of +novel forms if they use some overt morpheme boundary in the orthography. + +For instance, in French, clitics are marked using apostrophes between the +bound clitic and the stem. 
Thus given a dictionary like: + +.. highlight:: none + +:: + + c'est S E + c S E + c' S + etait E T E + un A N + +And two example orthographic transcriptions: + +:: + + c'est un c + c'etait un c + +The normalization would result in the following: + +:: + + c'est un c + c' etait un c + +With a pronunciation of: + +:: + + S E A N S E + S E T E A N S E + +The key point to note is that the pronunciation of the clitic ``c'`` is ``S`` +and the pronunciation of the letter ``c`` in French is ``S E``. + +The algorithm will try to associate the clitic marker with either the element +before (as for French clitics) or the element after (as for English clitics +like the possessive marker). The default clitic markers are ``'`` and ``’`` (but they are collapsed into a single +clitic marker, ``'`` by default). + +The default compound marker is a hyphen (``-``). +Compound markers are treated similarly to clitic markers, but they are not associated with one +particular element in the word over another. Instead, they are used to simply split the compound word. +For example, ``merry-go-round`` will +become ``merry go round`` if the hyphenated form is not in the dictionary. +If no words are found on splitting the word based on hyphens or apostrophes, +then the word will be treated as a single unit (single unknown word). + +The default behavior of the aligner is to clean up these internal splits and reconstruct the original word. If this is not desirable, you can disable clean up via the :code:`--disable_textgrid_cleanup` flag (see :ref:`configuration`). + Non-probabilistic format ======================== @@ -33,7 +113,7 @@ This lexicon uses the Arpabet transcription format (like the `CMU Pronouncing Di The Prosodylab-aligner has two preconstructed dictionaries as well, one for English (`Prosodylab-aligner English dictionary`_) -and one for Quebec French (`Prosodylab-aligner French dictionary`_), also see :ref:`dictionaries` for a list of supported dictionaries.
+and one for Quebec French (`Prosodylab-aligner French dictionary`_), also see :ref:`pretrained_dictionaries` for a list of supported dictionaries. .. note:: diff --git a/docs/source/user_guide/formats/index.rst b/docs/source/user_guide/formats/index.rst new file mode 100644 index 00000000..72d2d1c5 --- /dev/null +++ b/docs/source/user_guide/formats/index.rst @@ -0,0 +1,11 @@ +.. _data_format: + +************************* +File and directory format +************************* + +.. toctree:: + :maxdepth: 1 + + corpus_structure.rst + dictionary.rst diff --git a/docs/source/introduction.rst b/docs/source/user_guide/index.rst similarity index 55% rename from docs/source/introduction.rst rename to docs/source/user_guide/index.rst index 0fa6e8c1..238096cf 100644 --- a/docs/source/introduction.rst +++ b/docs/source/user_guide/index.rst @@ -1,41 +1,10 @@ -.. _`Kaldi homepage`: http://kaldi-asr.org/ -.. _`Kaldi feature and model-space transforms page`: http://kaldi-asr.org/doc/transform.html +.. _user_guide: -.. _`Phonetisaurus repository`: https://github.com/AdolfVonKleist/Phonetisaurus - -.. _`HTK homepage`: http://htk.eng.cam.ac.uk/ - -.. _`Prosodylab-aligner homepage`: http://prosodylab.org/tools/aligner/ - -.. _`P2FA homepage`: https://www.ling.upenn.edu/phonetics/old_website_2015/p2fa/ - -.. _`FAVE-align homepage`: https://github.com/JoFrhwld/FAVE/wiki/FAVE-align - -.. _`MAUS homepage`: http://www.bas.uni-muenchen.de/Bas/BasMAUS.html - -.. _`Praat homepage`: http://www.fon.hum.uva.nl/praat/ - -.. _`EasyAlign homepage`: http://latlcui.unige.ch/phonetique/easyalign.php - -.. _`Gentle homepage`: https://lowerquality.com/gentle/ - -.. _`@wavable`: https://twitter.com/wavable - -.. _`Github`: http://mmcauliffe.github.io/ - -.. _`mailing list`: https://groups.google.com/forum/#!forum/mfa-users - -.. _`Kaldi tutorial`: https://eleanorchodroff.com/tutorial/kaldi/index.html - -.. _`Corpus Phonetics Tutorial`: https://eleanorchodroff.com/tutorial/intro.html - -.. 
_introduction: - -************ -Introduction -************ +********** +User Guide +********** What is forced alignment? ========================= @@ -44,12 +13,11 @@ Forced alignment is a technique to take an orthographic transcription of an audio file and generate a time-aligned version using a pronunciation dictionary to look up phones for words. -Many languages have :ref:`pretrained_acoustic` available for download and use. +Many languages have :ref:`pretrained_acoustic_models` available for download and use. .. note:: - For a more detailed background on forced alignment, please see Eleanor Chodroff's excellent `Kaldi tutorial`_ within her - larger `Corpus Phonetics Tutorial`_ + For a more detailed background on forced alignment, please see Eleanor Chodroff's excellent :xref:`chodroff_kaldi` within her larger :xref:`chodroff_phonetics`. Montreal Forced Aligner @@ -65,69 +33,71 @@ models, where context on either side of a phone is taken into account for acoustic models. The third pass performs LDA+MLLT to learn a transform of the features that makes each phone's features maximally different. The final pass enhances the triphone model by taking into account speaker differences, and calculates a transformation of the -mel frequency cepstrum coefficients (MFCC) features for each speaker. See the `Kaldi feature and model-space transforms page`_ +mel frequency cepstrum coefficients (MFCC) features for each speaker. See the :xref:`kaldi` page on feature transformations for more detail on these final passes. -The Montreal Forced Aligner can also train using deep neural networks (DNNs). For more technical information about the structure of the aligner, see -:ref:`api_reference`. +:ref:`mfa_api`. -If you run into any issues, please check the `mailing list`_ for fixes/workarounds or to post a new issue. +If you run into any issues, please check the :xref:`mfa_mailing_list` for fixes/workarounds, or post a new issue in the :xref:`mfa_github_issues`.
Use of speaker information -------------------------- A key feature of the Montreal Forced Aligner is the use of speaker -adaptatation in alignment. The command line interface provides multiple +adaptation in alignment. The command line interface provides multiple ways of grouping audio files by speaker, depending on the input file format (either :ref:`prosodylab_format` or :ref:`textgrid_format`). In addition to speaker-adaptation in the final pass of alignment, speaker information is used for grouping audio files together for multiprocessing -and ceptstral mean and variance normalization (CMVN). If speakers are not +and cepstral mean and variance normalization (CMVN). If speakers are not properly specified, then feature calculation might not succeed due to limits on the numbers of files open. Underlying technology --------------------- -The Montreal Forced Aligner uses the Kaldi ASR toolkit -(`Kaldi homepage`_) to perform forced alignment. +The Montreal Forced Aligner uses the :xref:`kaldi` ASR toolkit to perform forced alignment. Kaldi is under active development and uses modern ASR and includes state-of-the-art algorithms for tasks -in automatic speech recognition beyond forced alignment. For grapheme-to-phoneme capabilities, MFA uses Phonetisaurus -(`Phonetisaurus repository`_). +in automatic speech recognition beyond forced alignment. For grapheme-to-phoneme capabilities, MFA 1.0 used :xref:`phonetisaurus`, but MFA 2.0 has switched to using :xref:`pynini`. 
Other forced alignment tools ============================ Most tools for forced alignment used by linguists rely on the HMM Toolkit -(HTK; `HTK homepage`_), including: +(:xref:`htk`), including: -* Prosodylab-aligner (`Prosodylab-aligner homepage`_) -* Penn Phonetics Forced Aligner (P2FA, `P2FA homepage`_) -* FAVE-align (`FAVE-align homepage`_) -* (Web) MAUS (`MAUS homepage`_) +* :xref:`prosodylab_aligner` +* :xref:`p2fa` +* :xref:`fave` +* :xref:`maus` -EasyAlign (`EasyAlign homepage`_) is a Praat (`Praat homepage`_) plug-in for forced alignment as well. +:xref:`easy_align` is a :xref:`praat` plug-in for forced alignment as well. Montreal Forced Aligner is most similar to the Prosodylab-aligner, and was developed at the same lab. Because the Montreal Forced Aligner uses a different toolkit to do alignment, trained models cannot be used with the Prosodylab-aligner, and vice versa. -Another Kaldi-based forced aligner is Gentle (`Gentle homepage`_) which uses Kaldi's neural networks to +Another Kaldi-based forced aligner is :xref:`gentle` which uses Kaldi's neural networks to align English data. The Montreal Forced Aligner allows for training on any data that you might have, and can be used with languages other than English. Contributors ============ -* Michael McAuliffe (michael.e.mcauliffe@gmail.com, `Github`_, `@wavable`_) -* Michaela Socolof -* Elias Stengel-Eskin -* Sarah Mihuc -* Arlie Coles -* Michael Wagner -* Morgan Sonderegger +* Michael McAuliffe + + - :fa:`envelope` michael.e.mcauliffe@gmail.com + - :fa:`blog` :xref:`memcauliffe.com` + - :fa:`twitter` :xref:`@wavable` + +* :xref:`socolof` +* :xref:`stengel-eskin` +* :xref:`mihuc` +* :xref:`coles` +* :xref:`wagner` +* :xref:`sonderegger` Citation ======== @@ -140,10 +110,20 @@ Or: McAuliffe, Michael, Michaela Socolof, Sarah Mihuc, Michael Wagner, and Morgan Sonderegger (2017). Montreal Forced Aligner: trainable text-speech alignment using Kaldi. 
In -*Proceedings of the 18th Conference of the International Speech Communication Association*. :download:`Paper PDF <_static/MFA_paper_Interspeech2017.pdf>` +*Proceedings of the 18th Conference of the International Speech Communication Association*. :download:`Paper PDF <../_static/MFA_paper_Interspeech2017.pdf>` Funding ======= We acknowledge funding from Social Sciences and Humanities Research Council (SSHRC) #430-2014-00018, Fonds de Recherche du Québec – Société et Culture (FRQSC) #183356 and Canada Foundation for Innovation (CFI) #32451 to Morgan Sonderegger. + +.. toctree:: + :hidden: + + commands + formats/index + data_validation + workflows/index + configuration/index + models/index diff --git a/docs/source/user_guide/models/acoustic.rst b/docs/source/user_guide/models/acoustic.rst new file mode 100644 index 00000000..f2e7d9e6 --- /dev/null +++ b/docs/source/user_guide/models/acoustic.rst @@ -0,0 +1,60 @@ + + +.. _`ProsodyLab dictionary repository`: https://github.com/prosodylab/prosodylab.dictionaries + +.. _`Lexique`: http://www.lexique.org/ + +.. _`ProsodyLab French dictionary`: https://github.com/prosodylab/prosodylab.dictionaries/raw/master/fr.dict + +.. _pretrained_acoustic_models: + +************************** +Pretrained acoustic models +************************** + +As part of using the Montreal Forced Aligner in our own research, we have trained acoustic models for a number of languages. +If you would like to use them, please download them below. Please note the dictionary that they were trained with to +see more information about the phone set. When using these with a pronunciation dictionary, the phone sets must be +compatible. If the orthography of the language is transparent, it is likely that we have a G2P model that can be used +to generate the necessary pronunciation dictionary. + +Any of the following acoustic models can be downloaded with the command :code:`mfa model download acoustic `. 
You +can get a full list of the currently available acoustic models via :code:`mfa model download acoustic`. New models contributed +by users will be periodically added. If you would like to contribute your trained models, please contact Michael McAuliffe +at michael.e.mcauliffe@gmail.com. + +.. csv-table:: + :header: "Language", "Link", "Corpus", "Number of speakers", "Audio (hours)", "Phone set" + + "Arabic", "Use not recommended due to issues in GlobalPhone", "GlobalPhone", 80, 19.0, "GlobalPhone" + "Bulgarian", :mfa_model:`acoustic/bulgarian`, "GlobalPhone", 79, 21.4, "GlobalPhone" + "Croatian", :mfa_model:`acoustic/croatian`, "GlobalPhone", 94, 15.9, "GlobalPhone" + "Czech", :mfa_model:`acoustic/czech`, "GlobalPhone", 102, 31.7, "GlobalPhone" + "English", :mfa_model:`acoustic/english`, "LibriSpeech", 2484, 982.3, "Arpabet (stressed)" + "French (FR)", :mfa_model:`acoustic/french`, "GlobalPhone", 100, 26.9, "GlobalPhone" + "French (FR)", :mfa_model:`acoustic/french_prosodylab`, "GlobalPhone", 100, 26.9, "Prosodylab [1]_" + "French (QC)", :mfa_model:`acoustic/french_qc`, "Lab speech", "N/A", "N/A", "Prosodylab [1]_" + "German", :mfa_model:`acoustic/german`, "GlobalPhone", 77, 18, "GlobalPhone" + "German", :mfa_model:`acoustic/german_prosodylab`, "GlobalPhone", 77, 18, "Prosodylab [2]_" + "Hausa", :mfa_model:`acoustic/hausa`, "GlobalPhone", 103, 8.7, "GlobalPhone" + "Japanese", "Not available yet", "GlobalPhone", 144, 34, "GlobalPhone" + "Korean", :mfa_model:`acoustic/korean`, "GlobalPhone", 101, 20.8, "GlobalPhone" + "Mandarin", :mfa_model:`acoustic/mandarin`, "GlobalPhone", 132, 31.2, "Pinyin phones [3]_" + "Polish", :mfa_model:`acoustic/polish`, "GlobalPhone", 99, 24.6, "GlobalPhone" + "Portuguese", :mfa_model:`acoustic/portuguese`, "GlobalPhone", 101, 26.3, "GlobalPhone" + "Russian", :mfa_model:`acoustic/russian`, "GlobalPhone", 115, 26.5, "GlobalPhone" + "Spanish", :mfa_model:`acoustic/spanish`, "GlobalPhone", 102, 22.1, "GlobalPhone" + "Swahili", 
:mfa_model:`acoustic/swahili`, "GlobalPhone", 70, 11.1, "GlobalPhone" + "Swedish", :mfa_model:`acoustic/swedish`, "GlobalPhone", 98, 21.7, "GlobalPhone" + "Tamil", "Not available yet", "GlobalPhone", "N/A", "N/A", "GlobalPhone" + "Thai", :mfa_model:`acoustic/thai`, "GlobalPhone", 98, 28.2, "GlobalPhone" + "Turkish", :mfa_model:`acoustic/turkish`, "GlobalPhone", 100, 17.1, "GlobalPhone" + "Ukrainian", :mfa_model:`acoustic/ukrainian`, "GlobalPhone", 119, 14.1, "GlobalPhone" + "Vietnamese", :mfa_model:`acoustic/vietnamese`, "GlobalPhone", 129, 19.7, "GlobalPhone" + "Wu", "Not available yet", "GlobalPhone", 41, 9.3, "GlobalPhone" + +.. [1] The `ProsodyLab French dictionary`_ is based on `Lexique`_ with substitutions for numbers and special characters. Note that Lexique is known to currently not work with the aligner, see the `Github issue `_ for more information and status. +.. [2] The German dictionary used in training is available in the `ProsodyLab dictionary repository`_. + See http://www.let.uu.nl/~Hugo.Quene/personal/phonchar.html for more information on the CELEX phone set for German and how it maps to other phonesets. +.. [3] The phoneset for Mandarin was created by GlobalPhone by splitting Pinyin into onset, nucleus (any vowel sequence), + and codas, and then associating the tone of the syllable onto the nucleus (i.e. "fang2" -> "f a2 ng" and "xiao4" -> "x iao4") diff --git a/docs/source/user_guide/models/dictionary.rst b/docs/source/user_guide/models/dictionary.rst new file mode 100644 index 00000000..919b282f --- /dev/null +++ b/docs/source/user_guide/models/dictionary.rst @@ -0,0 +1,30 @@ + +.. _`English pronunciation dictionary`: https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/english.dict +.. _`French Prosodylab dictionary`: https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/fr.dict +..
_`German Prosodylab dictionary`: https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main/dictionary/de.dict +.. _`TalnUPF Spanish IPA dictionary`: https://raw.githubusercontent.com/TalnUPF/phonetic_lexica/master/es/es_lexicon-IPA.txt +.. _`TalnUPF Spanish gpA dictionary`: https://raw.githubusercontent.com/TalnUPF/phonetic_lexica/master/es/es_lexicon-gpA.txt +.. _`TalnUPF Catalan IPA dictionary`: https://raw.githubusercontent.com/TalnUPF/phonetic_lexica/master/ca/ca_lexicon-IPA.txt +.. _`FalaBrasil dictionary`: https://gitlab.com/fb-nlp/nlp-resources/-/tree/main/res + +.. _pretrained_dictionaries: + +************************************ +Available pronunciation dictionaries +************************************ + +Any of the following pronunciation dictionaries can be downloaded with the command :code:`mfa model download dictionary `. You +can get a full list of the currently available dictionaries via :code:`mfa model download dictionary`. New dictionaries contributed +by users will be periodically added. If you would like to contribute your dictionaries, please contact Michael McAuliffe +at michael.e.mcauliffe@gmail.com. + +.. csv-table:: + :header: "Language", "Link", "Orthography system", "Phone set" + + "English", `English pronunciation dictionary`_ , "Latin", "Arpabet (stressed)" + "French", `French Prosodylab dictionary`_, "Latin", "Prosodylab French" + "German", `German Prosodylab dictionary`_, "Latin", "Prosodylab German" + "Brazilian Portuguese", `FalaBrasil dictionary`_, "Latin", "" + "Spanish", `TalnUPF Spanish IPA dictionary`_, "Latin", "IPA" + "Spanish", `TalnUPF Spanish gpA dictionary`_, "Latin", "gpA" + "Catalan", `TalnUPF Catalan IPA dictionary`_, "Latin", "IPA" diff --git a/docs/source/user_guide/models/g2p.rst b/docs/source/user_guide/models/g2p.rst new file mode 100644 index 00000000..56187ebb --- /dev/null +++ b/docs/source/user_guide/models/g2p.rst @@ -0,0 +1,77 @@ + + +.. _`Pynini`: https://github.com/kylebgormon/Pynini +.. 
_`Sigmorphon 2020 G2P task baseline`: https://github.com/sigmorphon/2020/tree/master/task1/baselines/fst + +.. _`ProsodyLab dictionary repository`: https://github.com/prosodylab/prosodylab.dictionaries + +.. _`Lexique`: http://www.lexique.org/ + +.. _`ProsodyLab French dictionary`: https://github.com/prosodylab/prosodylab.dictionaries/raw/master/fr.dict + +.. _pretrained_g2p: + +********************* +Pretrained G2P models +********************* + + +Included with MFA is a separate tool to generate a dictionary from a preexisting model. This should be used if you're +aligning a dataset for which you have no pronunciation dictionary or the orthography is very transparent. We have pretrained +models for several languages below. + +Any of the following G2P models can be downloaded with the command :code:`mfa model download g2p <language_id>`. You can get a full list of the currently available G2P models via :code:`mfa model download g2p`. New models contributed by users will be periodically added. If you would like to contribute your trained models, please contact Michael McAuliffe at michael.e.mcauliffe@gmail.com. + +These models were generated using the `Pynini`_ package on the GlobalPhone dataset. The implementation is based on that in the +`Sigmorphon 2020 G2P task baseline`_. +This means that they will only work for transcriptions which use the same +alphabet. Current language options are listed below, with the following accuracies when trained on 90% of the data and +tested on 10%: + +.. 
csv-table:: + :header: "Language", "Link", "WER", "LER", "Orthography system", "Phone set" + + "Arabic", "Use not recommended due to issues in GlobalPhone", 28.45, 7.42, "Romanized [2]_", "GlobalPhone" + "Bulgarian", :mfa_model:`g2p/bulgarian_g2p`, 3.08, 0.38, "Cyrillic alphabet", "GlobalPhone" + "Croatian", :mfa_model:`g2p/croatian_g2p`, 9.47, 3.4, "Latin alphabet", "GlobalPhone" + "Czech", :mfa_model:`g2p/czech_g2p`, 3.43, 0.71, "Latin alphabet", "GlobalPhone" + "English", :mfa_model:`g2p/english_g2p`, 28.45, 7.42, "Latin alphabet", "Arpabet" + "French", :mfa_model:`g2p/french_g2p`, 42.54, 6.98, "Latin alphabet", "GlobalPhone" + "French", :mfa_model:`g2p/french_lexique_g2p`, 5.31, 1.06, "Latin alphabet", "Lexique" + "French", :mfa_model:`g2p/french_prosodylab_g2p` [1]_, 5.11, 0.95, "Latin alphabet", "Prosodylab" + "German", :mfa_model:`g2p/german_g2p`, 36.16, 7.84, "Latin alphabet", "GlobalPhone" + "German", :mfa_model:`g2p/german_prosodylab_g2p` [3]_, 5.43, 0.65, "Latin alphabet", "Prosodylab" + "Hausa", :mfa_model:`g2p/hausa_g2p`, 32.54, 7.19, "Latin alphabet", "GlobalPhone" + "Japanese", :mfa_model:`g2p/japanese_character_g2p`, 17.45, 7.17, "Kanji and kana", "GlobalPhone" + "Korean", :mfa_model:`g2p/korean_hangul_g2p`, 11.85, 1.38, "Hangul", "GlobalPhone" + "Korean", :mfa_model:`g2p/korean_jamo_g2p`, 8.94, 0.95, "Jamo", "GlobalPhone" + "Mandarin", :mfa_model:`g2p/mandarin_pinyin_g2p`, 0.27, 0.06, "Pinyin", "Pinyin phones" + "Mandarin", :mfa_model:`g2p/mandarin_character_g2p` [4]_, 23.81, 11.2, "Hanzi", "Pinyin phones [6]_" + "Polish", :mfa_model:`g2p/polish_g2p`, 1.23, 0.33, "Latin alphabet", "GlobalPhone" + "Portuguese", :mfa_model:`g2p/portuguese_g2p`, 10.67, 1.62, "Latin alphabet", "GlobalPhone" + "Russian", :mfa_model:`g2p/russian_g2p`, 4.04, 0.65, "Cyrillic alphabet", "GlobalPhone" + "Spanish", :mfa_model:`g2p/spanish_g2p`, 17.93, 3.02, "Latin alphabet", "GlobalPhone" + "Swahili", :mfa_model:`g2p/swahili_g2p`, 0.09, 0.02, "Latin alphabet", "GlobalPhone" + 
"Swedish", :mfa_model:`g2p/swedish_g2p`, 18.75, 3.14, "Latin alphabet", "GlobalPhone" + "Thai", :mfa_model:`g2p/thai_g2p`, 27.62, 7.48, "Thai script", "GlobalPhone" + "Turkish", :mfa_model:`g2p/turkish_g2p`, 8.51, 2.32, "Latin alphabet", "GlobalPhone" + "Ukrainian", :mfa_model:`g2p/ukrainian_g2p`, 2.1, 0.42, "Cyrillic alphabet", "GlobalPhone" + "Vietnamese", :mfa_model:`g2p/vietnamese_g2p`, 14.91, 3.46, "Vietnamese alphabet", "GlobalPhone" + "Wu", :mfa_model:`g2p/wu_g2p` [5]_ , 31.19, 13.04, "Hanzi", "GlobalPhone" + + +.. [1] The `ProsodyLab French dictionary`_ is based on `Lexique`_ with substitutions for numbers and special characters. + Note that Lexique is known to currently not work with the aligner, see the `Github issue `_ + for more information and status. +.. [2] Please see the GlobalPhone documentation for how the romanization was done for Arabic. +.. [3] The German dictionary used in training is available in the `ProsodyLab dictionary repository`_. + See http://www.let.uu.nl/~Hugo.Quene/personal/phonchar.html for more information on the CELEX phone set for German + and how it maps to other phonesets. +.. [4] The Mandarin character dictionary that served as the training data for this model was built by mapping between + characters in ``.trl`` files and pinyin syllables in ``.rmn`` files in the GlobalPhone corpus. +.. [5] The Wu G2P model was trained on a fairly small lexicon, so it likely does not have the coverage to be a robust model + for most purposes. Please check carefully any resulting dictionaries, as they are likely to have missing syllables + from unknown symbols. +.. [6] The phoneset for Mandarin was created by GlobalPhone by splitting Pinyin into onset, nucleus (any vowel sequence), + and codas, and then associating the tone of the syllable onto the nucleus (i.e. 
"fang2" -> "f a2 ng" and "xiao4" -> + "x iao4" diff --git a/docs/source/user_guide/models/index.rst b/docs/source/user_guide/models/index.rst new file mode 100644 index 00000000..6ff48f68 --- /dev/null +++ b/docs/source/user_guide/models/index.rst @@ -0,0 +1,33 @@ +.. _pretrained_models: + +***************** +Pretrained models +***************** + +The command for interacting with MFA models is :code:`mfa model`. The subcommands allow for inspecting currently saved pretrained models, downloading ones from MFA's model repo, and saving models you have trained to be used with a simple name rather than the full path each time. + +Following installation of MFA, :code:`mfa model list acoustic` will not list any models. If you want to download the default English model trained on Librispeech, you can run :code:`mfa model download acoustic english`; at that point, the previous ``list`` command will output "english" as an option. When referring to an acoustic model in another MFA command, rather than the full path to the acoustic model, you can now supply just ``english`` and MFA will resolve it to the saved path. + +Similarly, if you train a new model, you can run :code:`mfa model save acoustic /path/where/the/model/was/saved.zip`, then this model will be available via ``saved`` in the future. The name defaults to whatever the archive is called without the directory or extension. You can modify this name with the ``--name NEWNAME`` option. + +There are a number of pretrained models for aligning and generating pronunciation dictionaries. The command +for downloading these is :code:`mfa model download <model_type>` where ``model_type`` is one of ``acoustic``, ``g2p``, or +``dictionary``. + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: model + +Pretrained models +================= + +.. 
toctree:: + :maxdepth: 1 + + acoustic.rst + g2p.rst + dictionary.rst + lm.rst diff --git a/docs/source/user_guide/models/lm.rst b/docs/source/user_guide/models/lm.rst new file mode 100644 index 00000000..4bfc5a57 --- /dev/null +++ b/docs/source/user_guide/models/lm.rst @@ -0,0 +1,21 @@ + +.. _`GlobalPhone language models`: https://www.csl.uni-bremen.de/GlobalPhone/ + +.. _`LibriSpeech language models`: https://www.openslr.org/11/ + +.. _`FalaBrasil language models`: https://gitlab.com/fb-asr/fb-asr-resources/kaldi-resources/-/tree/main/lm + +.. _pretrained_language_models: + +************************** +Pretrained language models +************************** + +There are several places that contain pretrained language models that can be imported to MFA. + +.. csv-table:: + :header: "Source", "Language", "Link" + + "GlobalPhone", "Various languages", `GlobalPhone language models`_ + "LibriSpeech", "English", `LibriSpeech language models`_ + "FalaBrasil", "Brazilian Portuguese", `FalaBrasil language models`_ diff --git a/docs/source/user_guide/workflows/aligning/adapt_acoustic_model.rst b/docs/source/user_guide/workflows/aligning/adapt_acoustic_model.rst new file mode 100644 index 00000000..d52e28e5 --- /dev/null +++ b/docs/source/user_guide/workflows/aligning/adapt_acoustic_model.rst @@ -0,0 +1,15 @@ +.. _adapt_acoustic_model: + +*********************************** +Adapting acoustic model to new data +*********************************** + +A recent 2.0 functionality for MFA is to adapt pretrained models to a new dataset. MFA will first align the dataset using the pretrained model, and then perform a couple of rounds of speaker-adaptation training. + + +Command reference +----------------- + +.. 
autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: adapt diff --git a/docs/source/user_guide/workflows/aligning/index.rst b/docs/source/user_guide/workflows/aligning/index.rst new file mode 100644 index 00000000..ecc53fbe --- /dev/null +++ b/docs/source/user_guide/workflows/aligning/index.rst @@ -0,0 +1,13 @@ + +.. _aligning: + +********************* +Generating alignments +********************* + + +.. toctree:: + :maxdepth: 3 + + adapt_acoustic_model.rst + pretrained.rst diff --git a/docs/source/user_guide/workflows/aligning/pretrained.rst b/docs/source/user_guide/workflows/aligning/pretrained.rst new file mode 100644 index 00000000..36ba8e67 --- /dev/null +++ b/docs/source/user_guide/workflows/aligning/pretrained.rst @@ -0,0 +1,14 @@ + +.. _pretrained_alignment: + +************************************ +Align with pretrained acoustic model +************************************ + + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: align diff --git a/docs/source/annotator.rst b/docs/source/user_guide/workflows/anchor.rst similarity index 98% rename from docs/source/annotator.rst rename to docs/source/user_guide/workflows/anchor.rst index 73fcf3ea..d4bc2e0a 100644 --- a/docs/source/annotator.rst +++ b/docs/source/user_guide/workflows/anchor.rst @@ -1,7 +1,7 @@ .. _`Anchor Annotator documentation`: https://anchor-annotator.readthedocs.io/en/latest/ -.. _annotator: +.. _anchor: **************** Anchor annotator diff --git a/docs/source/user_guide/workflows/classify_speakers.rst b/docs/source/user_guide/workflows/classify_speakers.rst new file mode 100644 index 00000000..7a309687 --- /dev/null +++ b/docs/source/user_guide/workflows/classify_speakers.rst @@ -0,0 +1,15 @@ +.. 
_classify_speakers: + +********************** +Speaker classification +********************** + +The Montreal Forced Aligner can use trained ivector models (see :ref:`train_ivector` for more information about training +these models) to classify or cluster utterances according to speakers. + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: classify_speakers diff --git a/docs/source/corpus_creation.rst b/docs/source/user_guide/workflows/corpus_creation.rst similarity index 91% rename from docs/source/corpus_creation.rst rename to docs/source/user_guide/workflows/corpus_creation.rst index f485dc33..3b6ca42f 100644 --- a/docs/source/corpus_creation.rst +++ b/docs/source/user_guide/workflows/corpus_creation.rst @@ -10,7 +10,7 @@ follows: 1. If the corpus made up of long sound file that need segmenting, :ref:`create_segments` 2. If the corpus does not contain transcriptions, transcribe utterances using existing acoustic models, language models, and dictionaries (:ref:`transcribing`) -3. Use the annotator tool to fix up any errors (:ref:`annotator`) +3. Use the annotator tool to fix up any errors (:ref:`anchor`) 4. As necessary, bootstrap better transcriptions: 1. Retrain language model with new fixed transcriptions (:ref:`training_lm`) @@ -25,3 +25,4 @@ follows: transcribing.rst training_lm.rst training_dictionary.rst + anchor.rst diff --git a/docs/source/user_guide/workflows/create_segments.rst b/docs/source/user_guide/workflows/create_segments.rst new file mode 100644 index 00000000..26c4dd19 --- /dev/null +++ b/docs/source/user_guide/workflows/create_segments.rst @@ -0,0 +1,22 @@ +.. _create_segments: + +*************** +Create segments +*************** + +The Montreal Forced Aligner can use Voice Activity Detection (VAD) capabilities from Kaldi to generate segments from +a longer sound file. + +.. 
note:: + + The default configuration for VAD uses configuration values based on quiet speech. The algorithm is based on energy, + so if your recordings are more noisy, you may need to adjust the configuration. See :ref:`configuration_segments` + for more information on changing these parameters. + + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: create_segments diff --git a/docs/source/user_guide/workflows/g2p/dictionary_generating.rst b/docs/source/user_guide/workflows/g2p/dictionary_generating.rst new file mode 100644 index 00000000..55629b14 --- /dev/null +++ b/docs/source/user_guide/workflows/g2p/dictionary_generating.rst @@ -0,0 +1,39 @@ + + +.. _g2p_dictionary_generating: + +*********************** +Generating a dictionary +*********************** + +We have trained several G2P models that are available for download (:ref:`pretrained_g2p`). + +.. warning:: + + Please note that G2P models trained prior to 2.0 cannot be used with MFA 2.0. If you would like to use + these models, please use the 1.0.1 or 1.1 g2p utilities or retrain a new G2P model following + :ref:`g2p_model_training`. + +.. note:: + + Generating pronunciations to supplement your existing pronunciation + dictionary can be done by running the validation utility (see :ref:`running_the_validator`), and then use the path + to the ``oovs_found.txt`` file that it generates. + + +Pronunciation dictionaries can also be generated from the orthographies of the words themselves, rather than relying on +a trained G2P model. This functionality should be reserved for languages with transparent orthographies, close to 1-to-1 +grapheme-to-phoneme mapping. + +See :ref:`dict_generating_example` for an example of how to use G2P functionality with a premade example. + +.. 
warning:: + + Please note that this functionality is not available on Windows natively, however, you can install it using :xref:`wsl`, see :ref:`installation_ref` for more details. + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: g2p diff --git a/docs/source/g2p.rst b/docs/source/user_guide/workflows/g2p/index.rst similarity index 93% rename from docs/source/g2p.rst rename to docs/source/user_guide/workflows/g2p/index.rst index d152106d..6a41fcbc 100644 --- a/docs/source/g2p.rst +++ b/docs/source/user_guide/workflows/g2p/index.rst @@ -16,5 +16,5 @@ always check the resulting dictionary carefully before potentially propagating e .. toctree:: :maxdepth: 3 - g2p_dictionary_generating.rst - g2p_model_training.rst + dictionary_generating.rst + model_training.rst diff --git a/docs/source/user_guide/workflows/g2p/model_training.rst b/docs/source/user_guide/workflows/g2p/model_training.rst new file mode 100644 index 00000000..d2852f33 --- /dev/null +++ b/docs/source/user_guide/workflows/g2p/model_training.rst @@ -0,0 +1,30 @@ + +.. _`Sigmorphon 2020 G2P task baseline`: https://github.com/sigmorphon/2020/tree/master/task1/baselines/fst + +.. _g2p_model_training: + +************************ +Training a new G2P model +************************ + +Another tool included with MFA allows you to train a G2P (Grapheme to Phoneme) model automatically from a given +pronunciation dictionary. +This type of model can be used for :ref:`g2p_dictionary_generating`. +It requires a pronunciation dictionary with each line consisting of the orthographic transcription followed by the +phonetic transcription. The model is generated using the :xref:`pynini` package, which generates FST (finite state transducer) +files. The implementation is based on that in the `Sigmorphon 2020 G2P task baseline`_. +The G2P model output will be a .zip file like the acoustic model generated from alignment. 
+ + +See :ref:`g2p_model_training_example` for an example of how to train a G2P model with a premade toy example. + +.. warning:: + + Please note that this functionality is not available on Windows natively, however, you can install it using :xref:`wsl`, see :ref:`installation_ref` for more details. + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: train_g2p diff --git a/docs/source/user_guide/workflows/index.rst b/docs/source/user_guide/workflows/index.rst new file mode 100644 index 00000000..01216529 --- /dev/null +++ b/docs/source/user_guide/workflows/index.rst @@ -0,0 +1,10 @@ +Workflows available +=================== + +.. toctree:: + :maxdepth: 2 + + aligning/index + train_acoustic_model + g2p/index + corpus_creation diff --git a/docs/source/user_guide/workflows/train_acoustic_model.rst b/docs/source/user_guide/workflows/train_acoustic_model.rst new file mode 100644 index 00000000..f04e82da --- /dev/null +++ b/docs/source/user_guide/workflows/train_acoustic_model.rst @@ -0,0 +1,14 @@ +.. _train_acoustic_model: + +***************************** +Training a new acoustic model +***************************** + + +Command reference +----------------- + + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: train diff --git a/docs/source/user_guide/workflows/train_ivector.rst b/docs/source/user_guide/workflows/train_ivector.rst new file mode 100644 index 00000000..e15a017e --- /dev/null +++ b/docs/source/user_guide/workflows/train_ivector.rst @@ -0,0 +1,15 @@ +.. _train_ivector: + +***************************** +Training an ivector extractor +***************************** + +The Montreal Forced Aligner can train ivector extractors using an acoustic model for generating alignments. As part +of this training process, a classifier is built in that can be used as part of :ref:`classify_speakers`. 
+ +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: train_ivector diff --git a/docs/source/user_guide/workflows/training_dictionary.rst b/docs/source/user_guide/workflows/training_dictionary.rst new file mode 100644 index 00000000..fb103cde --- /dev/null +++ b/docs/source/user_guide/workflows/training_dictionary.rst @@ -0,0 +1,17 @@ +.. _training_dictionary: + +************************************ +Modeling pronunciation probabilities +************************************ + +MFA includes a utility command for training pronunciation probabilities of a dictionary given a corpus for alignment. + +The resulting dictionary can then be used as a dictionary for alignment or transcription. + + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: train_dictionary diff --git a/docs/source/user_guide/workflows/training_lm.rst b/docs/source/user_guide/workflows/training_lm.rst new file mode 100644 index 00000000..2d9b42fa --- /dev/null +++ b/docs/source/user_guide/workflows/training_lm.rst @@ -0,0 +1,19 @@ +.. _training_lm: + +************************ +Training language models +************************ + +MFA has a utility function for training ARPA-format ngram language models, as well as merging with a pre-existing model. + + +.. warning:: + + Please note that this functionality is not available on Windows natively, however, you can install it using :xref:`wsl`, see :ref:`installation_ref` for more details. + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: train_lm diff --git a/docs/source/user_guide/workflows/transcribing.rst b/docs/source/user_guide/workflows/transcribing.rst new file mode 100644 index 00000000..9b40322c --- /dev/null +++ b/docs/source/user_guide/workflows/transcribing.rst @@ -0,0 +1,18 @@ +.. 
_`Coqui`: https://coqui.ai/ + +.. _transcribing: + +********************************* +Transcribe audio (Speech-to-text) +********************************* + +.. warning:: + + The technology that MFA uses is several years out of date, and as such if you have other options available such as :xref:`coqui` or other production systems for speech-to-text, we recommend using those. The transcription capabilities are more here for completeness. + +Command reference +----------------- + +.. autoprogram:: montreal_forced_aligner.command_line.mfa:parser + :prog: mfa + :start_command: transcribe diff --git a/environment.yml b/environment.yml index d30c3bb9..38e98321 100644 --- a/environment.yml +++ b/environment.yml @@ -2,7 +2,7 @@ name: mfa channels: - conda-forge dependencies: - - python>=3.8 # or 2.7 if you are feeling nostalgic + - python>=3.8 - numpy - librosa - tqdm diff --git a/environment_win.yml b/environment_win.yml index 5392cc2a..a1dd9a47 100644 --- a/environment_win.yml +++ b/environment_win.yml @@ -2,7 +2,7 @@ name: montreal-forced-aligner channels: - conda-forge dependencies: - - python>=3.8 # or 2.7 if you are feeling nostalgic + - python>=3.8 - numpy - librosa - tqdm diff --git a/montreal_forced_aligner/__init__.py b/montreal_forced_aligner/__init__.py index 96a722b2..2d84b423 100644 --- a/montreal_forced_aligner/__init__.py +++ b/montreal_forced_aligner/__init__.py @@ -14,6 +14,8 @@ import montreal_forced_aligner.utils as utils # noqa __all__ = [ + "abc", + "data", "aligner", "command_line", "config", diff --git a/montreal_forced_aligner/abc.py b/montreal_forced_aligner/abc.py new file mode 100644 index 00000000..88161c84 --- /dev/null +++ b/montreal_forced_aligner/abc.py @@ -0,0 +1,312 @@ +""" +Abstract Base Classes +===================== +""" + +from __future__ import annotations + +from abc import ABC, ABCMeta, abstractmethod +from typing import TYPE_CHECKING, Any, Collection, Dict, List, Optional, Tuple, Union + +if TYPE_CHECKING: + from 
.config.align_config import AlignConfig + from .config.dictionary_config import DictionaryConfig + from .config.transcribe_config import TranscribeConfig + from .corpus.base import Corpus + from .dictionary.multispeaker import MultispeakerDictionary + from .models import AcousticModel, DictionaryModel, LanguageModel + + +__all__ = [ + "MfaModel", + "MfaWorker", + "Dictionary", + "MetaDict", + "AcousticModelWorker", + "IvectorExtractor", + "Trainer", + "Transcriber", + "Aligner", + "DictionaryEntryType", + "ReversedMappingType", + "Labels", +] + +# Configuration types +MetaDict = Dict[str, Any] +Labels = List[Any] +CtmErrorDict = Dict[Tuple[str, int], str] + +# Dictionary types +DictionaryEntryType = List[Dict[str, Union[Tuple[str], float, None, int]]] +ReversedMappingType = Dict[int, str] +WordsType = Dict[str, DictionaryEntryType] +MappingType = Dict[str, int] +MultiSpeakerMappingType = Dict[str, str] +IpaType = Optional[List[str]] +PunctuationType = Optional[str] + +# Corpus types +SegmentsType = Dict[str, Dict[str, Union[str, float, int]]] +OneToOneMappingType = Dict[str, str] +OneToManyMappingType = Dict[str, List[str]] + +CorpusMappingType = Union[OneToOneMappingType, OneToManyMappingType] +ScpType = Union[List[Tuple[str, str]], List[Tuple[str, List[Any]]]] +CorpusGroupedOneToOne = List[List[Tuple[str, str]]] +CorpusGroupedOneToMany = List[List[Tuple[str, List[Any]]]] +CorpusGroupedType = Union[CorpusGroupedOneToMany, CorpusGroupedOneToOne] + + +class MfaWorker(metaclass=ABCMeta): + """Abstract class for MFA workers""" + + def __init__(self, corpus: Corpus): + self.corpus = corpus + + @property + @abstractmethod + def working_directory(self) -> str: + """Current directory""" + ... 
+ + @property + def data_directory(self) -> str: + """Corpus data directory""" + return self._data_directory + + @data_directory.setter + def data_directory(self, val: str) -> None: + self._data_directory = val + + @property + def uses_voiced(self) -> bool: + """Flag for using voiced features""" + return self._uses_voiced + + @uses_voiced.setter + def uses_voiced(self, val: bool) -> None: + self._uses_voiced = val + + @property + def uses_cmvn(self) -> bool: + """Flag for using CMVN""" + return self._uses_cmvn + + @uses_cmvn.setter + def uses_cmvn(self, val: bool) -> None: + self._uses_cmvn = val + + @property + def uses_splices(self) -> bool: + """Flag for using spliced features""" + return self._uses_splices + + @uses_splices.setter + def uses_splices(self, val: bool) -> None: + self._uses_splices = val + + @property + def speaker_independent(self) -> bool: + """Flag for speaker independent features""" + return self._speaker_independent + + @speaker_independent.setter + def speaker_independent(self, val: bool) -> None: + self._speaker_independent = val + + @property + @abstractmethod + def working_log_directory(self) -> str: + """Current log directory""" + ... 
+ + @property + def use_mp(self) -> bool: + """Flag for using multiprocessing""" + return self._use_mp + + @use_mp.setter + def use_mp(self, val: bool) -> None: + self._use_mp = val + + +class AcousticModelWorker(MfaWorker): + """ + Abstract class for MFA classes that use acoustic models + + Parameters + ---------- + dictionary: MultispeakerDictionary + Dictionary for the worker docstring + """ + + def __init__(self, corpus: Corpus, dictionary: MultispeakerDictionary): + super().__init__(corpus) + self.dictionary: MultispeakerDictionary = dictionary + + +class Trainer(AcousticModelWorker): + """ + Abstract class for MFA trainers + + Attributes + ---------- + iteration: int + Current iteration + """ + + def __init__(self, corpus: Corpus, dictionary: MultispeakerDictionary): + super(Trainer, self).__init__(corpus, dictionary) + self.iteration = 0 + + @property + @abstractmethod + def meta(self) -> MetaDict: + """Training configuration parameters""" + ... + + @abstractmethod + def train(self) -> None: + """Perform training""" + ... + + +class Aligner(AcousticModelWorker): + """Abstract class for MFA aligners""" + + def __init__( + self, corpus: Corpus, dictionary: MultispeakerDictionary, align_config: AlignConfig + ): + super().__init__(corpus, dictionary) + self.align_config = align_config + + @abstractmethod + def align(self, subset: Optional[int] = None) -> None: + """Perform alignment""" + ... + + @property + @abstractmethod + def model_path(self) -> str: + """Acoustic model file path""" + ... + + @property + @abstractmethod + def alignment_model_path(self) -> str: + """Acoustic model file path for speaker-independent alignment""" + ... 
+ + +class Transcriber(AcousticModelWorker): + """Abstract class for MFA transcribers""" + + def __init__( + self, + corpus: Corpus, + dictionary: MultispeakerDictionary, + acoustic_model: AcousticModel, + language_model: LanguageModel, + transcribe_config: TranscribeConfig, + ): + super().__init__(corpus, dictionary) + self.acoustic_model = acoustic_model + self.language_model = language_model + self.transcribe_config = transcribe_config + + @abstractmethod + def transcribe(self) -> None: + """Perform transcription""" + ... + + @property + @abstractmethod + def model_path(self) -> str: + """Acoustic model file path""" + ... + + +class IvectorExtractor(AcousticModelWorker): + """Abstract class for MFA ivector extractors""" + + @abstractmethod + def extract_ivectors(self) -> None: + """Extract ivectors""" + ... + + @property + @abstractmethod + def model_path(self) -> str: + """Acoustic model file path""" + ... + + @property + @abstractmethod + def ivector_options(self) -> MetaDict: + """Ivector parameters""" + ... + + @property + @abstractmethod + def dubm_path(self) -> str: + """DUBM model file path""" + ... + + @property + @abstractmethod + def ie_path(self) -> str: + """Ivector extractor model file path""" + ... + + +class Dictionary(ABC): + """Abstract class for pronunciation dictionaries""" + + def __init__(self, dictionary_model: DictionaryModel, config: DictionaryConfig): + self.name = dictionary_model.name + self.dictionary_model = dictionary_model + self.config = config + + +class MfaModel(ABC): + """Abstract class for MFA models""" + + @property + @abstractmethod + def extensions(self) -> Collection: + """File extensions for the model""" + ... + + @extensions.setter + @abstractmethod + def extensions(self, val: Collection) -> None: + ... + + @classmethod + @abstractmethod + def valid_extension(cls, filename: str) -> bool: + """Check whether a file has a valid extensions""" + ... 
+ + @classmethod + @abstractmethod + def generate_path(cls, root: str, name: str, enforce_existence: bool = True) -> Optional[str]: + """Generate a path from a root directory""" + ... + + @abstractmethod + def pretty_print(self): + """Print the model's meta data""" + ... + + @property + @abstractmethod + def meta(self) -> MetaDict: + """Meta data for the model""" + ... + + @abstractmethod + def add_meta_file(self, trainer: Trainer) -> None: + """Add meta data to the model""" + ... diff --git a/montreal_forced_aligner/aligner/__init__.py b/montreal_forced_aligner/aligner/__init__.py index 67cbba9e..7af0241c 100644 --- a/montreal_forced_aligner/aligner/__init__.py +++ b/montreal_forced_aligner/aligner/__init__.py @@ -1,4 +1,20 @@ -"""Aligner module of MFA, contains various types of top-level aligners""" +""" +Aligners +======== + +""" from .adapting import AdaptingAligner # noqa +from .base import BaseAligner # noqa from .pretrained import PretrainedAligner # noqa from .trainable import TrainableAligner # noqa + +__all__ = [ + "AdaptingAligner", + "PretrainedAligner", + "TrainableAligner", + "BaseAligner", + "adapting", + "base", + "pretrained", + "trainable", +] diff --git a/montreal_forced_aligner/aligner/adapting.py b/montreal_forced_aligner/aligner/adapting.py index 2ce4fd77..1205039f 100644 --- a/montreal_forced_aligner/aligner/adapting.py +++ b/montreal_forced_aligner/aligner/adapting.py @@ -5,6 +5,7 @@ import shutil from typing import TYPE_CHECKING, Optional +from ..abc import Trainer from ..exceptions import KaldiProcessingError from ..models import AcousticModel from ..multiprocessing import ( @@ -22,7 +23,7 @@ from ..config import AlignConfig from ..corpus import Corpus - from ..dictionary import Dictionary + from ..dictionary import MultispeakerDictionary from ..models import MetaDict from .pretrained import PretrainedAligner @@ -30,19 +31,19 @@ __all__ = ["AdaptingAligner"] -class AdaptingAligner(BaseAligner): +class AdaptingAligner(BaseAligner, Trainer): 
""" Aligner adapts another acoustic model to the current data Parameters ---------- - corpus : :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus object for the dataset - dictionary : :class:`~montreal_forced_aligner.dictionary.Dictionary` + dictionary : :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` Dictionary object for the pronunciation dictionary - pretrained_aligner: :class:`~montreal_forced_aligner.aligner.pretrained.PretrainedAligner` + pretrained_aligner: :class:`~montreal_forced_aligner.aligner.PretrainedAligner` Pretrained aligner to use as input to training - align_config : :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + align_config : :class:`~montreal_forced_aligner.config.AlignConfig` Configuration for alignment temp_directory : str, optional Specifies the temporary directory root to save files need for Kaldi. @@ -58,7 +59,7 @@ class AdaptingAligner(BaseAligner): def __init__( self, corpus: Corpus, - dictionary: Dictionary, + dictionary: MultispeakerDictionary, previous_aligner: PretrainedAligner, align_config: AlignConfig, temp_directory: Optional[str] = None, @@ -130,7 +131,7 @@ def next_model_path(self): """Next iteration's acoustic model path""" return os.path.join(self.working_directory, "final.mdl") - def adapt(self) -> None: + def train(self) -> None: """Run the adaptation""" done_path = os.path.join(self.adapt_directory, "done") dirty_path = os.path.join(self.adapt_directory, "dirty") @@ -181,16 +182,16 @@ def meta(self) -> MetaDict: from ..utils import get_mfa_version data = { - "phones": sorted(self.dictionary.nonsil_phones), + "phones": sorted(self.dictionary.config.non_silence_phones), "version": get_mfa_version(), "architecture": self.acoustic_model.meta["architecture"], "train_date": str(datetime.now()), "features": self.previous_aligner.align_config.feature_config.params(), - "multilingual_ipa": self.dictionary.multilingual_ipa, 
+ "multilingual_ipa": self.dictionary.config.multilingual_ipa, } - if self.dictionary.multilingual_ipa: - data["strip_diacritics"] = self.dictionary.strip_diacritics - data["digraphs"] = self.dictionary.digraphs + if self.dictionary.config.multilingual_ipa: + data["strip_diacritics"] = self.dictionary.config.strip_diacritics + data["digraphs"] = self.dictionary.config.digraphs return data def save(self, path, root_directory=None) -> None: diff --git a/montreal_forced_aligner/aligner/base.py b/montreal_forced_aligner/aligner/base.py index a01a56de..3985eb8c 100644 --- a/montreal_forced_aligner/aligner/base.py +++ b/montreal_forced_aligner/aligner/base.py @@ -7,6 +7,7 @@ import time from typing import TYPE_CHECKING, Optional +from ..abc import Aligner from ..config import TEMP_DIR from ..exceptions import KaldiProcessingError from ..multiprocessing import ( @@ -21,25 +22,27 @@ if TYPE_CHECKING: from logging import Logger + import montreal_forced_aligner + from ..config import AlignConfig - from ..corpus import Corpus - from ..dictionary import DictionaryType + from ..corpus.base import Corpus + from ..dictionary import MultispeakerDictionary from ..models import AcousticModel __all__ = ["BaseAligner"] -class BaseAligner: +class BaseAligner(Aligner): """ Base aligner class for common aligner functions Parameters ---------- - corpus : :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus object for the dataset - dictionary : :class:`~montreal_forced_aligner.dictionary.Dictionary` + dictionary : :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` Dictionary object for the pronunciation dictionary - align_config : :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + align_config : :class:`~montreal_forced_aligner.config.AlignConfig` Configuration for alignment temp_directory : str, optional Specifies the temporary directory root to save files need for Kaldi. 
@@ -55,7 +58,7 @@ class BaseAligner: def __init__( self, corpus: Corpus, - dictionary: DictionaryType, + dictionary: MultispeakerDictionary, align_config: AlignConfig, temp_directory: Optional[str] = None, debug: bool = False, @@ -63,9 +66,7 @@ def __init__( logger: Optional[Logger] = None, acoustic_model: Optional[AcousticModel] = None, ): - self.align_config = align_config - self.corpus = corpus - self.dictionary = dictionary + super().__init__(corpus, dictionary, align_config) if not temp_directory: temp_directory = TEMP_DIR self.temp_directory = temp_directory @@ -97,7 +98,7 @@ def setup(self) -> None: self.dictionary.set_word_set(self.corpus.word_set) self.dictionary.write() self.corpus.initialize_corpus(self.dictionary, self.align_config.feature_config) - self.align_config.silence_csl = self.dictionary.silence_csl + self.align_config.silence_csl = self.dictionary.config.silence_csl self.data_directory = self.corpus.split_directory self.feature_config = self.align_config.feature_config @@ -107,12 +108,12 @@ def use_mp(self) -> bool: return self.align_config.use_mp @property - def meta(self) -> dict: + def meta(self) -> montreal_forced_aligner.abc.MetaDict: """Metadata for the trained model""" from ..utils import get_mfa_version data = { - "phones": sorted(self.dictionary.nonsil_phones), + "phones": sorted(self.dictionary.config.non_silence_phones), "version": get_mfa_version(), "architecture": "gmm-hmm", "features": "mfcc+deltas", @@ -123,14 +124,14 @@ def meta(self) -> dict: def align_options(self): """Options for alignment""" options = self.align_config.align_options - options["optional_silence_csl"] = self.dictionary.optional_silence_csl + options["optional_silence_csl"] = self.dictionary.config.optional_silence_csl return options @property def fmllr_options(self): """Options for fMLLR""" options = self.align_config.fmllr_options - options["silence_csl"] = self.dictionary.silence_csl + options["silence_csl"] = self.dictionary.config.silence_csl return 
options @property @@ -143,6 +144,11 @@ def working_directory(self) -> str: """Current working directory""" return self.align_directory + @property + def model_path(self) -> str: + """Current acoustic model path""" + return self.current_model_path + @property def current_model_path(self) -> str: """Current acoustic model path""" diff --git a/montreal_forced_aligner/aligner/pretrained.py b/montreal_forced_aligner/aligner/pretrained.py index 988cde9a..be66623f 100644 --- a/montreal_forced_aligner/aligner/pretrained.py +++ b/montreal_forced_aligner/aligner/pretrained.py @@ -13,7 +13,7 @@ from ..config import AlignConfig from ..corpus import Corpus - from ..dictionary import Dictionary + from ..dictionary import MultispeakerDictionary from ..models import AcousticModel __all__ = ["PretrainedAligner"] @@ -25,13 +25,13 @@ class PretrainedAligner(BaseAligner): Parameters ---------- - corpus : :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus object for the dataset - dictionary : :class:`~montreal_forced_aligner.dictionary.Dictionary` + dictionary : :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` Dictionary object for the pronunciation dictionary acoustic_model : :class:`~montreal_forced_aligner.models.AcousticModel` Archive containing the acoustic model and pronunciation dictionary - align_config : :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + align_config : :class:`~montreal_forced_aligner.config.AlignConfig` Configuration for alignment temp_directory : str, optional Specifies the temporary directory root to save files need for Kaldi. 
@@ -47,7 +47,7 @@ class PretrainedAligner(BaseAligner): def __init__( self, corpus: Corpus, - dictionary: Dictionary, + dictionary: MultispeakerDictionary, acoustic_model: AcousticModel, align_config: AlignConfig, temp_directory: Optional[str] = None, @@ -78,7 +78,7 @@ def model_directory(self) -> str: def setup(self) -> None: """Set up aligner""" - self.dictionary.nonsil_phones = self.acoustic_model.meta["phones"] + self.dictionary.config.non_silence_phones = self.acoustic_model.meta["phones"] super(PretrainedAligner, self).setup() self.acoustic_model.export_model(self.align_directory) @@ -107,11 +107,7 @@ def generate_pronunciations( Specifies the minimum count of words to include in derived probabilities, default is 1 """ pron_counts, utt_mapping = generate_pronunciations(self) - if self.dictionary.has_multiple: - dictionary_mapping = self.dictionary.dictionary_mapping() - else: - dictionary_mapping = {self.dictionary.name: self.dictionary} - for dict_name, dictionary in dictionary_mapping.items(): + for dict_name, dictionary in self.dictionary.dictionary_mapping.items(): counts = pron_counts[dict_name] mapping = utt_mapping[dict_name] if calculate_silence_probs: diff --git a/montreal_forced_aligner/aligner/trainable.py b/montreal_forced_aligner/aligner/trainable.py index e80dcff4..faf47659 100644 --- a/montreal_forced_aligner/aligner/trainable.py +++ b/montreal_forced_aligner/aligner/trainable.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Optional +from ..abc import Trainer from .base import BaseAligner if TYPE_CHECKING: @@ -11,24 +12,24 @@ from ..aligner.pretrained import PretrainedAligner from ..config import AlignConfig, TrainingConfig from ..corpus import Corpus - from ..dictionary import Dictionary + from ..dictionary import MultispeakerDictionary __all__ = ["TrainableAligner"] -class TrainableAligner(BaseAligner): +class TrainableAligner(BaseAligner, Trainer): """ Aligner that aligns and trains acoustics models on a large dataset Parameters 
---------- - corpus : :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus object for the dataset - dictionary : :class:`~montreal_forced_aligner.dictionary.Dictionary` + dictionary : :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` Dictionary object for the pronunciation dictionary training_config : :class:`~montreal_forced_aligner.config.TrainingConfig` Configuration to train a model - align_config : :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + align_config : :class:`~montreal_forced_aligner.config.AlignConfig` Configuration for alignment temp_directory : str, optional Specifies the temporary directory root to save files need for Kaldi. @@ -46,7 +47,7 @@ class TrainableAligner(BaseAligner): def __init__( self, corpus: Corpus, - dictionary: Dictionary, + dictionary: MultispeakerDictionary, training_config: TrainingConfig, align_config: AlignConfig, temp_directory: Optional[str] = None, @@ -86,7 +87,7 @@ def save(self, path: str, root_directory: Optional[str] = None) -> None: Path for root directory of temporary files """ self.training_config.values()[-1].save(path, root_directory) - self.logger.info("Saved model to {}".format(path)) + self.logger.info(f"Saved model to {path}") @property def meta(self) -> dict: @@ -94,7 +95,7 @@ def meta(self) -> dict: from ..utils import get_mfa_version data = { - "phones": sorted(self.dictionary.nonsil_phones), + "phones": sorted(self.dictionary.config.non_silence_phones), "version": get_mfa_version(), "architecture": self.training_config.values()[-1].architecture, "phone_type": self.training_config.values()[-1].phone_type, @@ -102,6 +103,10 @@ def meta(self) -> dict: } return data + @property + def model_path(self) -> str: + return self.training_config.values()[-1].model_path + def train(self, generate_final_alignments: bool = True) -> None: """ Run through the training configurations to produce a final acoustic model diff 
--git a/montreal_forced_aligner/command_line/__init__.py b/montreal_forced_aligner/command_line/__init__.py index e34769e5..395b613b 100644 --- a/montreal_forced_aligner/command_line/__init__.py +++ b/montreal_forced_aligner/command_line/__init__.py @@ -1 +1,63 @@ -"""Command line functions for MFA""" +""" +Command line functionality +========================== + + +""" + +from .adapt import run_adapt_model # noqa +from .align import run_align_corpus # noqa +from .anchor import run_anchor # noqa +from .classify_speakers import run_classify_speakers # noqa +from .create_segments import run_create_segments # noqa +from .g2p import run_g2p # noqa +from .mfa import create_parser, main # noqa +from .model import download_model, inspect_model, list_model, run_model, save_model # noqa +from .train_acoustic_model import run_train_acoustic_model # noqa +from .train_dictionary import run_train_dictionary # noqa +from .train_g2p import run_train_g2p # noqa +from .train_ivector_extractor import run_train_ivector_extractor # noqa +from .train_lm import run_train_lm # noqa +from .transcribe import run_transcribe_corpus # noqa +from .utils import validate_model_arg # noqa +from .validate import run_validate_corpus # noqa + +__all__ = [ + "adapt", + "align", + "anchor", + "classify_speakers", + "create_segments", + "g2p", + "mfa", + "model", + "train_acoustic_model", + "train_dictionary", + "train_g2p", + "train_ivector_extractor", + "train_lm", + "transcribe", + "utils", + "validate", + "run_transcribe_corpus", + "run_validate_corpus", + "run_train_lm", + "run_train_g2p", + "run_align_corpus", + "run_train_dictionary", + "run_anchor", + "run_model", + "run_adapt_model", + "run_train_acoustic_model", + "run_train_ivector_extractor", + "run_g2p", + "run_create_segments", + "run_classify_speakers", + "create_parser", + "validate_model_arg", + "main", + "list_model", + "save_model", + "inspect_model", + "download_model", +] diff --git a/montreal_forced_aligner/command_line/adapt.py 
b/montreal_forced_aligner/command_line/adapt.py index 74d04f85..a421931d 100644 --- a/montreal_forced_aligner/command_line/adapt.py +++ b/montreal_forced_aligner/command_line/adapt.py @@ -15,7 +15,7 @@ load_command_configuration, ) from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import Dictionary, MultispeakerDictionary +from montreal_forced_aligner.dictionary import MultispeakerDictionary from montreal_forced_aligner.exceptions import ArgumentError from montreal_forced_aligner.models import AcousticModel from montreal_forced_aligner.utils import get_mfa_version, log_config, setup_logger @@ -49,9 +49,9 @@ def adapt_model(args: Namespace, unknown_args: Optional[Collection[str]] = None) corpus_name = os.path.basename(args.corpus_directory) data_directory = os.path.join(temp_dir, corpus_name) if args.config_path: - align_config = align_yaml_to_config(args.config_path) + align_config, dictionary_config = align_yaml_to_config(args.config_path) else: - align_config = load_basic_align() + align_config, dictionary_config = load_basic_align() align_config.update_from_args(args) if unknown_args: align_config.update_from_unknown_args(unknown_args) @@ -66,6 +66,8 @@ def adapt_model(args: Namespace, unknown_args: Optional[Collection[str]] = None) logger = setup_logger(command, data_directory, console_level=log_level) logger.debug("ALIGN CONFIG:") log_config(logger, align_config) + logger.debug("DICTIONARY CONFIG:") + log_config(logger, dictionary_config) conf = load_command_configuration( conf_path, { @@ -119,6 +121,7 @@ def adapt_model(args: Namespace, unknown_args: Optional[Collection[str]] = None) model_directory = os.path.join(data_directory, "acoustic_models") os.makedirs(model_directory, exist_ok=True) acoustic_model = AcousticModel(args.acoustic_model_path, root_directory=model_directory) + dictionary_config.update(acoustic_model.meta) acoustic_model.log_details(logger) debug = getattr(args, "debug", False) audio_dir = None @@ 
-128,40 +131,21 @@ def adapt_model(args: Namespace, unknown_args: Optional[Collection[str]] = None) corpus = Corpus( args.corpus_directory, data_directory, + dictionary_config, speaker_characters=args.speaker_characters, num_jobs=args.num_jobs, sample_rate=align_config.feature_config.sample_frequency, logger=logger, use_mp=align_config.use_mp, - punctuation=align_config.punctuation, - clitic_markers=align_config.clitic_markers, audio_directory=audio_dir, ) logger.info(corpus.speaker_utterance_info()) - if args.dictionary_path.lower().endswith(".yaml"): - dictionary = MultispeakerDictionary( - args.dictionary_path, - data_directory, - logger=logger, - punctuation=align_config.punctuation, - clitic_markers=align_config.clitic_markers, - compound_markers=align_config.compound_markers, - multilingual_ipa=acoustic_model.meta["multilingual_ipa"], - strip_diacritics=acoustic_model.meta.get("strip_diacritics", None), - digraphs=acoustic_model.meta.get("digraphs", None), - ) - else: - dictionary = Dictionary( - args.dictionary_path, - data_directory, - logger=logger, - punctuation=align_config.punctuation, - clitic_markers=align_config.clitic_markers, - compound_markers=align_config.compound_markers, - multilingual_ipa=acoustic_model.meta["multilingual_ipa"], - strip_diacritics=acoustic_model.meta.get("strip_diacritics", None), - digraphs=acoustic_model.meta.get("digraphs", None), - ) + dictionary = MultispeakerDictionary( + args.dictionary_path, + data_directory, + dictionary_config, + logger=logger, + ) acoustic_model.validate(dictionary) begin = time.time() @@ -175,7 +159,7 @@ def adapt_model(args: Namespace, unknown_args: Optional[Collection[str]] = None) logger=logger, ) if args.full_train: - training_config = acoustic_model.adaptation_config() + training_config, dictionary = acoustic_model.adaptation_config() training_config.training_configs[0].update( {"beam": align_config.beam, "retry_beam": align_config.retry_beam} ) @@ -228,7 +212,7 @@ def adapt_model(args: 
Namespace, unknown_args: Optional[Collection[str]] = None) else: os.makedirs(args.output_directory, exist_ok=True) begin = time.time() - a.adapt() + a.train() logger.debug(f"Mapped adapted model in {time.time() - begin} seconds") if args.output_model_path is not None: a.save(args.output_model_path, root_directory=model_directory) diff --git a/montreal_forced_aligner/command_line/align.py b/montreal_forced_aligner/command_line/align.py index d6fc41ac..33892702 100644 --- a/montreal_forced_aligner/command_line/align.py +++ b/montreal_forced_aligner/command_line/align.py @@ -15,7 +15,7 @@ load_command_configuration, ) from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import Dictionary, MultispeakerDictionary +from montreal_forced_aligner.dictionary import MultispeakerDictionary from montreal_forced_aligner.exceptions import ArgumentError from montreal_forced_aligner.models import AcousticModel from montreal_forced_aligner.utils import log_config, setup_logger @@ -52,12 +52,14 @@ def align_corpus(args: Namespace, unknown_args: Optional[list] = None) -> None: corpus_name = os.path.basename(args.corpus_directory) data_directory = os.path.join(temp_dir, corpus_name) if args.config_path: - align_config = align_yaml_to_config(args.config_path) + align_config, dictionary_config = align_yaml_to_config(args.config_path) else: - align_config = load_basic_align() + align_config, dictionary_config = load_basic_align() align_config.update_from_args(args) + dictionary_config.update_from_args(args) if unknown_args: align_config.update_from_unknown_args(unknown_args) + dictionary_config.update_from_unknown_args(unknown_args) conf_path = os.path.join(data_directory, "config.yml") if getattr(args, "clean", False) and os.path.exists(data_directory): print("Cleaning old directory!") @@ -124,6 +126,7 @@ def align_corpus(args: Namespace, unknown_args: Optional[list] = None) -> None: os.makedirs(model_directory, exist_ok=True) 
os.makedirs(args.output_directory, exist_ok=True) acoustic_model = AcousticModel(args.acoustic_model_path, root_directory=model_directory) + dictionary_config.update(acoustic_model.meta) acoustic_model.log_details(logger) audio_dir = None if args.audio_directory: @@ -132,42 +135,23 @@ def align_corpus(args: Namespace, unknown_args: Optional[list] = None) -> None: corpus = Corpus( args.corpus_directory, data_directory, + dictionary_config, speaker_characters=args.speaker_characters, num_jobs=args.num_jobs, sample_rate=align_config.feature_config.sample_frequency, logger=logger, use_mp=align_config.use_mp, - punctuation=align_config.punctuation, - clitic_markers=align_config.clitic_markers, audio_directory=audio_dir, ) logger.info(corpus.speaker_utterance_info()) - if args.dictionary_path.lower().endswith(".yaml"): - dictionary = MultispeakerDictionary( - args.dictionary_path, - data_directory, - logger=logger, - punctuation=align_config.punctuation, - word_set=corpus.word_set, - clitic_markers=align_config.clitic_markers, - compound_markers=align_config.compound_markers, - multilingual_ipa=acoustic_model.meta["multilingual_ipa"], - strip_diacritics=acoustic_model.meta.get("strip_diacritics", None), - digraphs=acoustic_model.meta.get("digraphs", None), - ) - else: - dictionary = Dictionary( - args.dictionary_path, - data_directory, - logger=logger, - punctuation=align_config.punctuation, - word_set=corpus.word_set, - clitic_markers=align_config.clitic_markers, - compound_markers=align_config.compound_markers, - multilingual_ipa=acoustic_model.meta["multilingual_ipa"], - strip_diacritics=acoustic_model.meta.get("strip_diacritics", None), - digraphs=acoustic_model.meta.get("digraphs", None), - ) + dictionary = MultispeakerDictionary( + args.dictionary_path, + data_directory, + dictionary_config, + logger=logger, + word_set=corpus.word_set, + ) + acoustic_model.validate(dictionary) begin = time.time() diff --git 
a/montreal_forced_aligner/command_line/classify_speakers.py b/montreal_forced_aligner/command_line/classify_speakers.py index 287d4623..39d3ae25 100644 --- a/montreal_forced_aligner/command_line/classify_speakers.py +++ b/montreal_forced_aligner/command_line/classify_speakers.py @@ -15,7 +15,7 @@ ) from montreal_forced_aligner.corpus import Corpus from montreal_forced_aligner.exceptions import ArgumentError -from montreal_forced_aligner.models import IvectorExtractor +from montreal_forced_aligner.models import IvectorExtractorModel from montreal_forced_aligner.speaker_classifier import SpeakerClassifier from montreal_forced_aligner.utils import setup_logger @@ -112,7 +112,7 @@ def classify_speakers(args: Namespace, unknown_args: Optional[list] = None) -> N os.makedirs(data_directory, exist_ok=True) os.makedirs(args.output_directory, exist_ok=True) try: - ivector_extractor = IvectorExtractor( + ivector_extractor = IvectorExtractorModel( args.ivector_extractor_path, root_directory=data_directory ) corpus = Corpus( diff --git a/montreal_forced_aligner/command_line/g2p.py b/montreal_forced_aligner/command_line/g2p.py index f27b5cfb..f5e33184 100644 --- a/montreal_forced_aligner/command_line/g2p.py +++ b/montreal_forced_aligner/command_line/g2p.py @@ -9,7 +9,6 @@ from montreal_forced_aligner.config import TEMP_DIR from montreal_forced_aligner.config.g2p_config import g2p_yaml_to_config, load_basic_g2p_config from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import check_bracketed from montreal_forced_aligner.g2p.generator import PyniniDictionaryGenerator as Generator from montreal_forced_aligner.models import G2PModel from montreal_forced_aligner.utils import setup_logger @@ -42,9 +41,9 @@ def generate_dictionary(args: Namespace, unknown_args: Optional[list] = None) -> shutil.rmtree(os.path.join(temp_dir, "G2P"), ignore_errors=True) shutil.rmtree(os.path.join(temp_dir, "models", "G2P"), ignore_errors=True) if args.config_path: - 
g2p_config = g2p_yaml_to_config(args.config_path) + g2p_config, dictionary_config = g2p_yaml_to_config(args.config_path) else: - g2p_config = load_basic_g2p_config() + g2p_config, dictionary_config = load_basic_g2p_config() g2p_config.use_mp = not args.disable_mp if unknown_args: g2p_config.update_from_unknown_args(unknown_args) @@ -64,16 +63,15 @@ def generate_dictionary(args: Namespace, unknown_args: Optional[list] = None) -> corpus = Corpus( input_dir, data_directory, + dictionary_config=dictionary_config, num_jobs=args.num_jobs, use_mp=g2p_config.use_mp, - punctuation=g2p_config.punctuation, - clitic_markers=g2p_config.clitic_markers, parse_text_only_files=True, ) word_set = corpus.word_set if not args.include_bracketed: - word_set = [x for x in word_set if not check_bracketed(x)] + word_set = [x for x in word_set if not dictionary_config.check_bracketed(x)] else: if getattr(args, "verbose", False): @@ -86,7 +84,7 @@ def generate_dictionary(args: Namespace, unknown_args: Optional[list] = None) -> for line in f: word_set.extend(line.strip().split()) if not args.include_bracketed: - word_set = [x for x in word_set if not check_bracketed(x)] + word_set = [x for x in word_set if not dictionary_config.check_bracketed(x)] logger.info( f"Generating transcriptions for the {len(word_set)} word types found in the corpus..." 
diff --git a/montreal_forced_aligner/command_line/mfa.py b/montreal_forced_aligner/command_line/mfa.py index dc15b45b..7b3fb96a 100644 --- a/montreal_forced_aligner/command_line/mfa.py +++ b/montreal_forced_aligner/command_line/mfa.py @@ -118,7 +118,7 @@ def create_parser() -> ArgumentParser: Returns ------- - ArgumentParser + :class:`~argparse.ArgumentParser` MFA argument parser """ GLOBAL_CONFIG = load_global_config() @@ -129,7 +129,7 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool Parameters ---------- - subparser: argparse.ArgumentParser + subparser: :class:`~argparse.ArgumentParser` Subparser to augment textgrid_output: bool Flag for whether the subparser is used for a command that generates TextGrids @@ -195,7 +195,9 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool _ = subparsers.add_parser("version") - align_parser = subparsers.add_parser("align") + align_parser = subparsers.add_parser( + "align", help="Align a corpus with a pretrained acoustic model" + ) align_parser.add_argument("corpus_directory", help="Full path to the directory to align") align_parser.add_argument( "dictionary_path", help="Full path to the pronunciation dictionary to use" @@ -228,7 +230,7 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool ) add_global_options(align_parser, textgrid_output=True) - adapt_parser = subparsers.add_parser("adapt") + adapt_parser = subparsers.add_parser("adapt", help="Adapt an acoustic model to a new corpus") adapt_parser.add_argument("corpus_directory", help="Full path to the directory to align") adapt_parser.add_argument( "dictionary_path", help="Full path to the pronunciation dictionary to use" @@ -240,7 +242,7 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool adapt_parser.add_argument( "output_paths", nargs="+", - help="Path to directory for aligned TextGrids, zip path to export acoustic model, or both", + help="Path to 
save the new acoustic model, path to export aligned TextGrids, or both", ) adapt_parser.add_argument( "-o", @@ -275,7 +277,9 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool ) add_global_options(adapt_parser, textgrid_output=True) - train_parser = subparsers.add_parser("train") + train_parser = subparsers.add_parser( + "train", help="Train a new acoustic model on a corpus and optionally export alignments" + ) train_parser.add_argument( "corpus_directory", help="Full path to the source directory to align" ) @@ -285,7 +289,7 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool train_parser.add_argument( "output_paths", nargs="+", - help="Path to directory for aligned TextGrids, zip path to export acoustic model, or both", + help="Path to save the new acoustic model, path to export aligned TextGrids, or both", ) train_parser.add_argument( "--config_path", @@ -317,7 +321,7 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool ) add_global_options(train_parser, textgrid_output=True) - validate_parser = subparsers.add_parser("validate") + validate_parser = subparsers.add_parser("validate", help="Validate a corpus for use in MFA") validate_parser.add_argument( "corpus_directory", help="Full path to the source directory to align" ) @@ -348,10 +352,14 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool ) add_global_options(validate_parser) - g2p_model_help_message = f"""Full path to the archive containing pre-trained model or language ({', '.join(g2p_models)}) - If not specified, then orthographic transcription is split into pronunciations.""" - g2p_parser = subparsers.add_parser("g2p") - g2p_parser.add_argument("g2p_model_path", help=g2p_model_help_message, nargs="?") + g2p_parser = subparsers.add_parser( + "g2p", help="Generate a pronunciation dictionary using a G2P model" + ) + g2p_parser.add_argument( + "g2p_model_path", + help=f"Full path to 
the archive containing pre-trained model or language ({', '.join(g2p_models)}). If not specified, then orthographic transcription is split into pronunciations.", + nargs="?", + ) g2p_parser.add_argument( "input_path", @@ -368,7 +376,9 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool ) add_global_options(g2p_parser) - train_g2p_parser = subparsers.add_parser("train_g2p") + train_g2p_parser = subparsers.add_parser( + "train_g2p", help="Train a G2P model from a pronunciation dictionary" + ) train_g2p_parser.add_argument("dictionary_path", help="Location of existing dictionary") train_g2p_parser.add_argument("output_model_path", help="Desired location of generated model") @@ -382,14 +392,19 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool "most of the data and validating on an unseen subset", ) add_global_options(train_g2p_parser) - - model_parser = subparsers.add_parser("model") + help_message = "Inspect, download, and save pretrained MFA models" + model_parser = subparsers.add_parser( + "model", aliases=["models"], description=help_message, help=help_message + ) model_subparsers = model_parser.add_subparsers(dest="action") model_subparsers.required = True - model_download_parser = model_subparsers.add_parser("download") + help_message = "Download a pretrained model from the MFA repository" + model_download_parser = model_subparsers.add_parser( + "download", description=help_message, help=help_message + ) model_download_parser.add_argument( - "model_type", help=f"Type of model to download, options: {', '.join(MODEL_TYPES)}" + "model_type", choices=sorted(MODEL_TYPES), help="Type of model to download" ) model_download_parser.add_argument( "name", @@ -397,24 +412,35 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool "will list all available languages", nargs="?", ) - - model_list_parser = model_subparsers.add_parser("list") + help_message = "List of saved models" + 
model_list_parser = model_subparsers.add_parser( + "list", description=help_message, help=help_message + ) model_list_parser.add_argument( - "model_type", nargs="?", help=f"Type of model to list, options: {', '.join(MODEL_TYPES)}" + "model_type", choices=sorted(MODEL_TYPES), nargs="?", help="Type of model to list" ) - model_inspect_parser = model_subparsers.add_parser("inspect") + help_message = "Inspect a model and output its metadata" + model_inspect_parser = model_subparsers.add_parser( + "inspect", description=help_message, help=help_message + ) model_inspect_parser.add_argument( "model_type", + choices=sorted(MODEL_TYPES), nargs="?", - help=f"Type of model to download, options: {', '.join(MODEL_TYPES)}", + help="Type of model to download", ) model_inspect_parser.add_argument( "name", help="Name of pretrained model or path to MFA model to inspect" ) - model_save_parser = model_subparsers.add_parser("save") - model_save_parser.add_argument("model_type", help="Type of MFA model") + help_message = "Save a MFA model to the pretrained directory for name-based referencing" + model_save_parser = model_subparsers.add_parser( + "save", description=help_message, help=help_message + ) + model_save_parser.add_argument( + "model_type", choices=sorted(MODEL_TYPES), help="Type of MFA model" + ) model_save_parser.add_argument( "path", help="Path to MFA model to save for invoking with just its name" ) @@ -430,7 +456,9 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool action="store_true", ) - train_lm_parser = subparsers.add_parser("train_lm") + train_lm_parser = subparsers.add_parser( + "train_lm", help="Train a language model from a corpus" + ) train_lm_parser.add_argument( "source_path", help="Full path to the source directory to train from, alternatively " @@ -463,7 +491,10 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool ) add_global_options(train_lm_parser) - train_dictionary_parser = 
subparsers.add_parser("train_dictionary") + train_dictionary_parser = subparsers.add_parser( + "train_dictionary", + help="Calculate pronunciation probabilities for a dictionary based on alignment results in a corpus", + ) train_dictionary_parser.add_argument( "corpus_directory", help="Full path to the directory to align" ) @@ -491,10 +522,13 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool ) add_global_options(train_dictionary_parser) - train_ivector_parser = subparsers.add_parser("train_ivector") + train_ivector_parser = subparsers.add_parser( + "train_ivector", + help="Train an ivector extractor from a corpus and pretrained acoustic model", + ) train_ivector_parser.add_argument( "corpus_directory", - help="Full path to the source directory to " "train the ivector extractor", + help="Full path to the source directory to train the ivector extractor", ) train_ivector_parser.add_argument( "dictionary_path", help="Full path to the pronunciation dictionary to use" @@ -524,10 +558,12 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool ) add_global_options(train_ivector_parser) - classify_speakers_parser = subparsers.add_parser("classify_speakers") + classify_speakers_parser = subparsers.add_parser( + "classify_speakers", help="Use an ivector extractor to cluster utterances into speakers" + ) classify_speakers_parser.add_argument( "corpus_directory", - help="Full path to the source directory to " "run speaker classification", + help="Full path to the source directory to run speaker classification", ) classify_speakers_parser.add_argument( "ivector_extractor_path", type=str, default="", help="Full path to ivector extractor model" @@ -551,9 +587,11 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool ) add_global_options(classify_speakers_parser) - create_segments_parser = subparsers.add_parser("create_segments") + create_segments_parser = subparsers.add_parser( + 
"create_segments", help="Create segments based on voice activity dectection (VAD)" + ) create_segments_parser.add_argument( - "corpus_directory", help="Full path to the source directory to " "run VAD segmentation" + "corpus_directory", help="Full path to the source directory to run VAD segmentation" ) create_segments_parser.add_argument( "output_directory", @@ -564,7 +602,10 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool ) add_global_options(create_segments_parser) - transcribe_parser = subparsers.add_parser("transcribe") + transcribe_parser = subparsers.add_parser( + "transcribe", + help="Transcribe utterances using an acoustic model, language model, and pronunciation dictionary", + ) transcribe_parser.add_argument( "corpus_directory", help="Full path to the directory to transcribe" ) @@ -604,7 +645,7 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool transcribe_parser.add_argument( "-e", "--evaluate", - help="Evaluate the transcription " "against golden texts", + help="Evaluate the transcription against golden texts", action="store_true", ) add_global_options(transcribe_parser) @@ -703,15 +744,20 @@ def add_global_options(subparser: argparse.ArgumentParser, textgrid_output: bool type=int, ) - history_parser = subparsers.add_parser("history") + history_parser = subparsers.add_parser("history", help="Show previously run mfa commands") + _ = subparsers.add_parser("thirdparty", help="DEPRECATED: Please install Kaldi via conda.") + _ = subparsers.add_parser( + "download", help="DEPRECATED: Please use mfa model download instead." 
+ ) history_parser.add_argument("depth", help="Number of commands to list", nargs="?", default=10) history_parser.add_argument( "--verbose", help="Flag for whether to output additional information", action="store_true" ) - _ = subparsers.add_parser("annotator") - _ = subparsers.add_parser("anchor") + _ = subparsers.add_parser( + "anchor", aliases=["annotator"], help="Launch Anchor Annotator (if installed)" + ) return parser @@ -795,6 +841,14 @@ def main() -> None: from montreal_forced_aligner.utils import get_mfa_version print(get_mfa_version()) + elif args.subcommand == "thirdparty": # Deprecated command + raise DeprecationWarning( + "Necessary thirdparty executables are now installed via conda. Please refer to the installation docs for the updated commands." + ) + elif args.subcommand == "download": # Deprecated command + raise DeprecationWarning( + "Downloading models is now run through the `mfa model download` command, please use that instead." + ) except MFAError as e: if getattr(args, "debug", False): raise diff --git a/montreal_forced_aligner/command_line/train_acoustic_model.py b/montreal_forced_aligner/command_line/train_acoustic_model.py index d189e017..a02f7735 100644 --- a/montreal_forced_aligner/command_line/train_acoustic_model.py +++ b/montreal_forced_aligner/command_line/train_acoustic_model.py @@ -15,7 +15,7 @@ train_yaml_to_config, ) from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import Dictionary +from montreal_forced_aligner.dictionary import MultispeakerDictionary from montreal_forced_aligner.exceptions import ArgumentError from montreal_forced_aligner.utils import log_config, setup_logger @@ -51,9 +51,9 @@ def train_acoustic_model(args: Namespace, unknown_args: Optional[list] = None) - corpus_name = os.path.basename(args.corpus_directory) data_directory = os.path.join(temp_dir, corpus_name) if args.config_path: - train_config, align_config = train_yaml_to_config(args.config_path) + train_config, 
align_config, dictionary_config = train_yaml_to_config(args.config_path) else: - train_config, align_config = load_basic_train() + train_config, align_config, dictionary_config = load_basic_train() train_config.use_mp = not args.disable_mp align_config.use_mp = not args.disable_mp align_config.debug = args.debug @@ -130,28 +130,22 @@ def train_acoustic_model(args: Namespace, unknown_args: Optional[list] = None) - corpus = Corpus( args.corpus_directory, data_directory, + dictionary_config, speaker_characters=args.speaker_characters, num_jobs=getattr(args, "num_jobs", 3), sample_rate=align_config.feature_config.sample_frequency, debug=getattr(args, "debug", False), logger=logger, use_mp=align_config.use_mp, - punctuation=align_config.punctuation, - clitic_markers=align_config.clitic_markers, audio_directory=audio_dir, ) logger.info(corpus.speaker_utterance_info()) - dictionary = Dictionary( + dictionary = MultispeakerDictionary( args.dictionary_path, data_directory, + dictionary_config, word_set=corpus.word_set, logger=logger, - punctuation=align_config.punctuation, - clitic_markers=align_config.clitic_markers, - compound_markers=align_config.compound_markers, - multilingual_ipa=align_config.multilingual_ipa, - strip_diacritics=align_config.strip_diacritics, - digraphs=align_config.digraphs, ) utt_oov_path = os.path.join(corpus.split_directory, "utterance_oovs.txt") if os.path.exists(utt_oov_path): diff --git a/montreal_forced_aligner/command_line/train_dictionary.py b/montreal_forced_aligner/command_line/train_dictionary.py index 9629b093..af437b59 100644 --- a/montreal_forced_aligner/command_line/train_dictionary.py +++ b/montreal_forced_aligner/command_line/train_dictionary.py @@ -15,7 +15,7 @@ load_command_configuration, ) from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import Dictionary +from montreal_forced_aligner.dictionary import MultispeakerDictionary from montreal_forced_aligner.exceptions import ArgumentError 
from montreal_forced_aligner.models import AcousticModel from montreal_forced_aligner.utils import log_config, setup_logger @@ -53,14 +53,16 @@ def train_dictionary(args: Namespace, unknown_args: Optional[list] = None) -> No data_directory = os.path.join(temp_dir, corpus_name) conf_path = os.path.join(data_directory, "config.yml") if args.config_path: - align_config = align_yaml_to_config(args.config_path) + align_config, dictionary_config = align_yaml_to_config(args.config_path) else: - align_config = load_basic_align() + align_config, dictionary_config = load_basic_align() align_config.use_mp = not args.disable_mp align_config.overwrite = args.overwrite align_config.debug = args.debug + dictionary_config.debug = args.debug if unknown_args: align_config.update_from_unknown_args(unknown_args) + dictionary_config.update_from_unknown_args(unknown_args) if getattr(args, "clean", False) and os.path.exists(data_directory): print("Cleaning old directory!") shutil.rmtree(data_directory, ignore_errors=True) @@ -125,24 +127,21 @@ def train_dictionary(args: Namespace, unknown_args: Optional[list] = None) -> No corpus = Corpus( args.corpus_directory, data_directory, + dictionary_config, speaker_characters=args.speaker_characters, num_jobs=args.num_jobs, sample_rate=align_config.feature_config.sample_frequency, use_mp=align_config.use_mp, logger=logger, - punctuation=align_config.punctuation, - clitic_markers=align_config.clitic_markers, ) logger.info(corpus.speaker_utterance_info()) acoustic_model = AcousticModel(args.acoustic_model_path) - dictionary = Dictionary( + dictionary = MultispeakerDictionary( args.dictionary_path, data_directory, + dictionary_config, word_set=corpus.word_set, logger=logger, - punctuation=align_config.punctuation, - clitic_markers=align_config.clitic_markers, - compound_markers=align_config.compound_markers, ) acoustic_model.validate(dictionary) diff --git a/montreal_forced_aligner/command_line/train_g2p.py 
b/montreal_forced_aligner/command_line/train_g2p.py index 7c4ca426..a3cef78e 100644 --- a/montreal_forced_aligner/command_line/train_g2p.py +++ b/montreal_forced_aligner/command_line/train_g2p.py @@ -11,7 +11,7 @@ load_basic_train_g2p_config, train_g2p_yaml_to_config, ) -from montreal_forced_aligner.dictionary import Dictionary +from montreal_forced_aligner.dictionary import PronunciationDictionary from montreal_forced_aligner.g2p.trainer import PyniniTrainer as Trainer if TYPE_CHECKING: @@ -40,13 +40,13 @@ def train_g2p(args: Namespace, unknown_args: Optional[list] = None) -> None: shutil.rmtree(os.path.join(temp_dir, "G2P"), ignore_errors=True) shutil.rmtree(os.path.join(temp_dir, "models", "G2P"), ignore_errors=True) if args.config_path: - train_config = train_g2p_yaml_to_config(args.config_path) + train_config, dictionary_config = train_g2p_yaml_to_config(args.config_path) else: - train_config = load_basic_train_g2p_config() + train_config, dictionary_config = load_basic_train_g2p_config() train_config.use_mp = not args.disable_mp if unknown_args: train_config.update_from_unknown_args(unknown_args) - dictionary = Dictionary(args.dictionary_path, "") + dictionary = PronunciationDictionary(args.dictionary_path, "", dictionary_config) t = Trainer( dictionary, args.output_model_path, diff --git a/montreal_forced_aligner/command_line/train_ivector_extractor.py b/montreal_forced_aligner/command_line/train_ivector_extractor.py index 074443a4..1335b762 100644 --- a/montreal_forced_aligner/command_line/train_ivector_extractor.py +++ b/montreal_forced_aligner/command_line/train_ivector_extractor.py @@ -15,7 +15,7 @@ train_yaml_to_config, ) from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import Dictionary +from montreal_forced_aligner.dictionary import MultispeakerDictionary from montreal_forced_aligner.exceptions import ArgumentError from montreal_forced_aligner.models import AcousticModel from montreal_forced_aligner.utils 
import log_config, setup_logger @@ -51,9 +51,9 @@ def train_ivector(args: Namespace, unknown_args: Optional[list] = None) -> None: corpus_name = os.path.basename(args.corpus_directory) data_directory = os.path.join(temp_dir, corpus_name) if args.config_path: - train_config, align_config = train_yaml_to_config(args.config_path) + train_config, align_config, dictionary_config = train_yaml_to_config(args.config_path) else: - train_config, align_config = load_basic_train_ivector() + train_config, align_config, dictionary_config = load_basic_train_ivector() if unknown_args: train_config.update_from_unknown_args(unknown_args) align_config.update_from_unknown_args(unknown_args) @@ -130,23 +130,20 @@ def train_ivector(args: Namespace, unknown_args: Optional[list] = None) -> None: corpus = Corpus( args.corpus_directory, data_directory, + dictionary_config, speaker_characters=args.speaker_characters, num_jobs=args.num_jobs, sample_rate=align_config.feature_config.sample_frequency, debug=getattr(args, "debug", False), logger=logger, use_mp=align_config.use_mp, - punctuation=align_config.punctuation, - clitic_markers=align_config.clitic_markers, ) - dictionary = Dictionary( + dictionary = MultispeakerDictionary( args.dictionary_path, data_directory, + dictionary_config, word_set=corpus.word_set, logger=logger, - punctuation=align_config.punctuation, - clitic_markers=align_config.clitic_markers, - compound_markers=align_config.compound_markers, ) acoustic_model = AcousticModel(args.acoustic_model_path, root_directory=model_directory) acoustic_model.log_details(logger) diff --git a/montreal_forced_aligner/command_line/train_lm.py b/montreal_forced_aligner/command_line/train_lm.py index 07ae1e99..48798b39 100644 --- a/montreal_forced_aligner/command_line/train_lm.py +++ b/montreal_forced_aligner/command_line/train_lm.py @@ -9,7 +9,7 @@ from montreal_forced_aligner.command_line.utils import validate_model_arg from montreal_forced_aligner.config import TEMP_DIR, 
load_basic_train_lm, train_lm_yaml_to_config from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import Dictionary +from montreal_forced_aligner.dictionary import PronunciationDictionary from montreal_forced_aligner.exceptions import ArgumentError from montreal_forced_aligner.lm.trainer import LmTrainer from montreal_forced_aligner.utils import setup_logger @@ -38,9 +38,9 @@ def train_lm(args: Namespace, unknown_args: Optional[list] = None) -> None: else: temp_dir = os.path.expanduser(args.temp_directory) if args.config_path: - train_config = train_lm_yaml_to_config(args.config_path) + train_config, dictionary_config = train_lm_yaml_to_config(args.config_path) else: - train_config = load_basic_train_lm() + train_config, dictionary_config = load_basic_train_lm() train_config.use_mp = not args.disable_mp if unknown_args: train_config.update_from_unknown_args(unknown_args) @@ -74,8 +74,8 @@ def train_lm(args: Namespace, unknown_args: Optional[list] = None) -> None: debug=args.debug, ) if args.dictionary_path: - dictionary = Dictionary( - args.dictionary_path, data_directory, debug=args.debug, word_set=source.word_set + dictionary = PronunciationDictionary( + args.dictionary_path, data_directory, dictionary_config, word_set=source.word_set ) dictionary.generate_mappings() else: diff --git a/montreal_forced_aligner/command_line/transcribe.py b/montreal_forced_aligner/command_line/transcribe.py index b04ebdc2..b94ddf5c 100644 --- a/montreal_forced_aligner/command_line/transcribe.py +++ b/montreal_forced_aligner/command_line/transcribe.py @@ -14,7 +14,7 @@ transcribe_yaml_to_config, ) from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import Dictionary, MultispeakerDictionary +from montreal_forced_aligner.dictionary import MultispeakerDictionary from montreal_forced_aligner.exceptions import ArgumentError from montreal_forced_aligner.models import AcousticModel, LanguageModel from 
montreal_forced_aligner.transcriber import Transcriber @@ -51,9 +51,9 @@ def transcribe_corpus(args: Namespace, unknown_args: Optional[list] = None) -> N args.corpus_directory = os.path.dirname(args.corpus_directory) corpus_name = os.path.basename(args.corpus_directory) if args.config_path: - transcribe_config = transcribe_yaml_to_config(args.config_path) + transcribe_config, dictionary_config = transcribe_yaml_to_config(args.config_path) else: - transcribe_config = load_basic_transcribe() + transcribe_config, dictionary_config = load_basic_transcribe() transcribe_config.use_mp = not args.disable_mp transcribe_config.overwrite = args.overwrite if unknown_args: @@ -138,6 +138,7 @@ def transcribe_corpus(args: Namespace, unknown_args: Optional[list] = None) -> N corpus = Corpus( args.corpus_directory, data_directory, + dictionary_config, speaker_characters=args.speaker_characters, sample_rate=transcribe_config.feature_config.sample_frequency, num_jobs=args.num_jobs, @@ -146,6 +147,7 @@ def transcribe_corpus(args: Namespace, unknown_args: Optional[list] = None) -> N ignore_speakers=transcribe_config.ignore_speakers, ) acoustic_model = AcousticModel(args.acoustic_model_path, root_directory=model_directory) + dictionary_config.update(acoustic_model.meta) acoustic_model.log_details(logger) if args.language_model_path.endswith(".arpa"): alternative_name = os.path.splitext(args.language_model_path)[0] + ".zip" @@ -155,26 +157,13 @@ def transcribe_corpus(args: Namespace, unknown_args: Optional[list] = None) -> N f"`mfa train_lm {args.language_model_path} {alternative_name}`." 
) language_model = LanguageModel(args.language_model_path, root_directory=data_directory) - if args.dictionary_path.lower().endswith(".yaml"): - dictionary = MultispeakerDictionary( - args.dictionary_path, - data_directory, - logger=logger, - punctuation=transcribe_config.punctuation, - clitic_markers=transcribe_config.clitic_markers, - compound_markers=transcribe_config.compound_markers, - multilingual_ipa=acoustic_model.meta["multilingual_ipa"], - ) - else: - dictionary = Dictionary( - args.dictionary_path, - data_directory, - logger=logger, - punctuation=transcribe_config.punctuation, - clitic_markers=transcribe_config.clitic_markers, - compound_markers=transcribe_config.compound_markers, - multilingual_ipa=acoustic_model.meta["multilingual_ipa"], - ) + dictionary = MultispeakerDictionary( + args.dictionary_path, + data_directory, + dictionary_config, + logger=logger, + ) + acoustic_model.validate(dictionary) begin = time.time() t = Transcriber( diff --git a/montreal_forced_aligner/command_line/validate.py b/montreal_forced_aligner/command_line/validate.py index 05141ab1..e161e95c 100644 --- a/montreal_forced_aligner/command_line/validate.py +++ b/montreal_forced_aligner/command_line/validate.py @@ -8,8 +8,9 @@ from montreal_forced_aligner.command_line.utils import validate_model_arg from montreal_forced_aligner.config import TEMP_DIR +from montreal_forced_aligner.config.dictionary_config import DictionaryConfig from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import Dictionary +from montreal_forced_aligner.dictionary import MultispeakerDictionary from montreal_forced_aligner.exceptions import ArgumentError from montreal_forced_aligner.models import AcousticModel from montreal_forced_aligner.utils import setup_logger @@ -54,28 +55,30 @@ def validate_corpus(args: Namespace, unknown_args: Optional[List[str]] = None) - else: log_level = "info" logger = setup_logger(command, data_directory, console_level=log_level) + 
dictionary_config = DictionaryConfig() + acoustic_model = None + if args.acoustic_model_path: + acoustic_model = AcousticModel(args.acoustic_model_path, root_directory=model_directory) + acoustic_model.log_details(logger) + dictionary_config.update(acoustic_model.meta) + dictionary = MultispeakerDictionary( + args.dictionary_path, + data_directory, + dictionary_config, + logger=logger, + ) + if acoustic_model: + acoustic_model.validate(dictionary) corpus = Corpus( args.corpus_directory, data_directory, + dictionary_config, speaker_characters=args.speaker_characters, num_jobs=getattr(args, "num_jobs", 3), logger=logger, use_mp=not args.disable_mp, ) - if args.acoustic_model_path: - acoustic_model = AcousticModel(args.acoustic_model_path, root_directory=model_directory) - acoustic_model.log_details(logger) - dictionary = Dictionary( - args.dictionary_path, - data_directory, - logger=logger, - multilingual_ipa=acoustic_model.meta["multilingual_ipa"], - ) - acoustic_model.validate(dictionary) - else: - dictionary = Dictionary(args.dictionary_path, data_directory, logger=logger) - a = CorpusValidator( corpus, dictionary, diff --git a/montreal_forced_aligner/config/__init__.py b/montreal_forced_aligner/config/__init__.py index 06729220..c5355586 100644 --- a/montreal_forced_aligner/config/__init__.py +++ b/montreal_forced_aligner/config/__init__.py @@ -1,4 +1,9 @@ -"""Class definitions for configuring MFA""" +""" +Configuration classes +===================== + + +""" from __future__ import annotations from typing import TYPE_CHECKING, Any, Dict, List @@ -6,14 +11,16 @@ if TYPE_CHECKING: from argparse import Namespace - ConfigDict = Dict[str, Any] import os import yaml from .align_config import AlignConfig, align_yaml_to_config, load_basic_align # noqa -from .command_config import load_command_configuration # noqa +from .base_config import BaseConfig +from .command_config import CommandConfig, load_command_configuration # noqa +from .dictionary_config import 
DictionaryConfig # noqa from .feature_config import FeatureConfig # noqa +from .g2p_config import G2PConfig, g2p_yaml_to_config, load_basic_g2p_config # noqa from .segmentation_config import ( # noqa SegmentationConfig, load_basic_segmentation, @@ -31,6 +38,11 @@ load_test_config, train_yaml_to_config, ) +from .train_g2p_config import ( # noqa + TrainG2PConfig, + load_basic_train_g2p_config, + train_g2p_yaml_to_config, +) from .train_lm_config import TrainLMConfig, load_basic_train_lm, train_lm_yaml_to_config # noqa from .transcribe_config import ( # noqa TranscribeConfig, @@ -38,6 +50,42 @@ transcribe_yaml_to_config, ) +__all__ = [ + "TEMP_DIR", + "align_config", + "base_config", + "command_config", + "dictionary_config", + "feature_config", + "segmentation_config", + "speaker_classification_config", + "train_config", + "train_lm_config", + "transcribe_config", + "generate_config_path", + "generate_command_history_path", + "load_command_history", + "update_command_history", + "update_global_config", + "load_global_config", + "USE_COLORS", + "BLAS_THREADS", +] + +BaseConfig.__module__ = "montreal_forced_aligner.config" +AlignConfig.__module__ = "montreal_forced_aligner.config" +CommandConfig.__module__ = "montreal_forced_aligner.config" +FeatureConfig.__module__ = "montreal_forced_aligner.config" +DictionaryConfig.__module__ = "montreal_forced_aligner.config" +SegmentationConfig.__module__ = "montreal_forced_aligner.config" +SpeakerClassificationConfig.__module__ = "montreal_forced_aligner.config" +TrainingConfig.__module__ = "montreal_forced_aligner.config" +TrainLMConfig.__module__ = "montreal_forced_aligner.config" +TrainG2PConfig.__module__ = "montreal_forced_aligner.config" +G2PConfig.__module__ = "montreal_forced_aligner.config" +TranscribeConfig.__module__ = "montreal_forced_aligner.config" + + TEMP_DIR = os.path.expanduser("~/Documents/MFA") diff --git a/montreal_forced_aligner/config/align_config.py b/montreal_forced_aligner/config/align_config.py index 
ea9cd1c2..e629e4ff 100644 --- a/montreal_forced_aligner/config/align_config.py +++ b/montreal_forced_aligner/config/align_config.py @@ -2,25 +2,19 @@ from __future__ import annotations import os -from typing import TYPE_CHECKING, Collection +from typing import TYPE_CHECKING, Collection, Tuple import yaml -from .base_config import ( - DEFAULT_CLITIC_MARKERS, - DEFAULT_COMPOUND_MARKERS, - DEFAULT_DIGRAPHS, - DEFAULT_PUNCTUATION, - DEFAULT_STRIP_DIACRITICS, - BaseConfig, - ConfigError, -) +from ..exceptions import ConfigError +from .base_config import BaseConfig +from .dictionary_config import DictionaryConfig from .feature_config import FeatureConfig if TYPE_CHECKING: from argparse import Namespace - from . import ConfigDict + from ..abc import MetaDict __all__ = ["AlignConfig", "align_yaml_to_config", "load_basic_align"] @@ -39,7 +33,7 @@ class AlignConfig(BaseConfig): Self-loop scale, defaults to 0.1 disable_sat : bool Flag for disabling speaker adaptation, defaults to False - feature_config : :class:`~montreal_forced_aligner.features.config.FeatureConfig` + feature_config : :class:`~montreal_forced_aligner.config.FeatureConfig` Configuration object for feature generation boost_silence : float Factor to boost silence probabilities, 1.0 is no boost or reduction @@ -66,14 +60,8 @@ def __init__(self, feature_config: FeatureConfig): self.retry_beam = 40 self.data_directory = None # Gets set later self.fmllr_update_type = "full" - self.punctuation = DEFAULT_PUNCTUATION - self.clitic_markers = DEFAULT_CLITIC_MARKERS - self.compound_markers = DEFAULT_COMPOUND_MARKERS - self.strip_diacritics = DEFAULT_STRIP_DIACRITICS - self.digraphs = DEFAULT_DIGRAPHS self.use_mp = True self.use_fmllr_mp = False - self.multilingual_ipa = False self.debug = False self.overwrite = False self.cleanup_textgrids = True @@ -81,7 +69,7 @@ def __init__(self, feature_config: FeatureConfig): self.iteration = None @property - def align_options(self) -> ConfigDict: + def align_options(self) -> 
MetaDict: """Options for use in aligning""" return { "transition_scale": self.transition_scale, @@ -94,7 +82,7 @@ def align_options(self) -> ConfigDict: } @property - def fmllr_options(self) -> ConfigDict: + def fmllr_options(self) -> MetaDict: """Options for use in calculating fMLLR transforms""" return { "fmllr_update_type": self.fmllr_update_type, @@ -105,15 +93,8 @@ def update(self, data: dict) -> None: for k, v in data.items(): if k == "use_mp": self.feature_config.use_mp = v - elif k in ["punctuation", "clitic_markers", "compound_markers"]: - if not v: - continue - if "-" in v: - v = "-" + v.replace("-", "") - if "]" in v and r"\]" not in v: - v = v.replace("]", r"\]") elif not hasattr(self, k): - raise ConfigError(f"No field found for key {k}") + continue setattr(self, k, v) def update_from_args(self, args: Namespace): @@ -129,7 +110,7 @@ def update_from_unknown_args(self, args: Collection[str]): self.retry_beam = self.beam * 4 -def align_yaml_to_config(path: str) -> AlignConfig: +def align_yaml_to_config(path: str) -> Tuple[AlignConfig, DictionaryConfig]: """ Helper function to load alignment configurations @@ -140,9 +121,12 @@ def align_yaml_to_config(path: str) -> AlignConfig: Returns ------- - :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + :class:`~montreal_forced_aligner.config.AlignConfig` Alignment configuration + :class:`~montreal_forced_aligner.config.dictionary_config.DictionaryConfig` + Dictionary configuration """ + dictionary_config = DictionaryConfig() with open(path, "r", encoding="utf8") as f: data = yaml.load(f, Loader=yaml.SafeLoader) global_params = {} @@ -154,20 +138,25 @@ def align_yaml_to_config(path: str) -> AlignConfig: global_params[k] = v align_config = AlignConfig(feature_config) align_config.update(global_params) + dictionary_config.update(global_params) if align_config.beam >= align_config.retry_beam: raise ConfigError("Retry beam must be greater than beam.") - return align_config + return align_config, 
dictionary_config -def load_basic_align() -> AlignConfig: +def load_basic_align() -> Tuple[AlignConfig, DictionaryConfig]: """ Helper function to load the default parameters Returns ------- - :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + :class:`~montreal_forced_aligner.config.AlignConfig` Default alignment configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ base_dir = os.path.dirname(os.path.abspath(__file__)) - align_config = align_yaml_to_config(os.path.join(base_dir, "basic_align.yaml")) - return align_config + align_config, dictionary_config = align_yaml_to_config( + os.path.join(base_dir, "basic_align.yaml") + ) + return align_config, dictionary_config diff --git a/montreal_forced_aligner/config/base_config.py b/montreal_forced_aligner/config/base_config.py index 6fb87f2f..4e132046 100644 --- a/montreal_forced_aligner/config/base_config.py +++ b/montreal_forced_aligner/config/base_config.py @@ -5,18 +5,9 @@ import yaml -from ..exceptions import ConfigError - if TYPE_CHECKING: from argparse import Namespace -DEFAULT_PUNCTUATION = r'、。।,@<>"(),.:;¿?¡!\\&%#*~【】,…‥「」『』〝〟″⟨⟩♪・‹›«»~′$+=‘' - -DEFAULT_CLITIC_MARKERS = "'’" -DEFAULT_COMPOUND_MARKERS = "-/" -DEFAULT_STRIP_DIACRITICS = ["ː", "ˑ", "̩", "̆", "̑", "̯", "͡", "‿", "͜"] -DEFAULT_DIGRAPHS = ["[dt][szʒʃʐʑʂɕç]", "[aoɔe][ʊɪ]"] - PARSING_KEYS = [ "punctuation", @@ -39,7 +30,7 @@ def update(self, data: dict) -> None: """Update configuration parameters""" for k, v in data.items(): if not hasattr(self, k): - raise ConfigError(f"No field found for key {k}") + continue setattr(self, k, v) def update_from_args(self, args: Namespace) -> None: diff --git a/montreal_forced_aligner/config/dictionary_config.py b/montreal_forced_aligner/config/dictionary_config.py new file mode 100644 index 00000000..deb03816 --- /dev/null +++ b/montreal_forced_aligner/config/dictionary_config.py @@ -0,0 +1,308 @@ +"""Class definitions for configuring pronunciation 
dictionaries""" +from __future__ import annotations + +import re +from typing import Collection, Dict, List, Optional, Set, Tuple, Union + +from .base_config import BaseConfig + +DEFAULT_PUNCTUATION = list(r'、。।,@<>"(),.:;¿?¡!\\&%#*~【】,…‥「」『』〝〟″⟨⟩♪・‹›«»~′$+=‘') + +DEFAULT_CLITIC_MARKERS = list("'’") +DEFAULT_COMPOUND_MARKERS = list("-/") +DEFAULT_STRIP_DIACRITICS = ["ː", "ˑ", "̩", "̆", "̑", "̯", "͡", "‿", "͜"] +DEFAULT_DIGRAPHS = ["[dt][szʒʃʐʑʂɕç]", "[aoɔe][ʊɪ]"] +DEFAULT_BRACKETS = [("[", "]"), ("{", "}"), ("<", ">"), ("(", ")")] + +__all__ = ["DictionaryConfig"] + + +class DictionaryConfig(BaseConfig): + """ + Class for storing configuration information about pronunciation dictionaries + Path to a directory to store files for Kaldi + oov_code : str, optional + What to label words not in the dictionary, defaults to ``''`` + position_dependent_phones : bool, optional + Specifies whether phones should be represented as dependent on their + position in the word (beginning, middle or end), defaults to True + num_sil_states : int, optional + Number of states to use for silence phones, defaults to 5 + num_nonsil_states : int, optional + Number of states to use for non-silence phones, defaults to 3 + shared_silence_phones : bool, optional + Specify whether to share states across all silence phones, defaults + to True + sil_prob : float, optional + Probability of optional silences following words, defaults to 0.5 + word_set : Collection[str], optional + Word set to limit output files + debug: bool, optional + Flag for whether to perform debug steps and prevent intermediate cleanup + logger: :class:`~logging.Logger`, optional + Logger to output information to + punctuation: str, optional + Punctuation to use when parsing text + clitic_markers: str, optional + Clitic markers to use when parsing text + compound_markers: str, optional + Compound markers to use when parsing text + multilingual_ipa: bool, optional + Flag for multilingual IPA mode, defaults to False + 
strip_diacritics: List[str], optional + Diacritics to strip in multilingual IPA mode + digraphs: List[str], optional + Digraphs to split up in multilingual IPA mode + """ + + topo_template = " {cur_state} {cur_state} {cur_state} 0.75 {next_state} 0.25 " + topo_sil_template = " {cur_state} {cur_state} {transitions} " + topo_transition_template = " {} {}" + positions: List[str] = ["_B", "_E", "_I", "_S"] + + def __init__( + self, + oov_word: str = "", + silence_word: str = "!sil", + nonoptional_silence_phone: str = "sil", + optional_silence_phone: str = "sp", + oov_phone: str = "spn", + other_noise_phone: str = "spn", + position_dependent_phones: bool = True, + num_silence_states: int = 5, + num_non_silence_states: int = 3, + shared_silence_phones: bool = True, + silence_probability: float = 0.5, + debug: bool = False, + punctuation: Optional[Union[str, Collection[str]]] = None, + clitic_markers: Optional[Union[str, Collection[str]]] = None, + compound_markers: Optional[Collection[str]] = None, + multilingual_ipa: bool = False, + strip_diacritics: Optional[Collection[str]] = None, + digraphs: Optional[Collection[str]] = None, + brackets: Optional[Collection[Tuple[str, str]]] = None, + ): + self.strip_diacritics = DEFAULT_STRIP_DIACRITICS + self.digraphs = DEFAULT_DIGRAPHS + self.punctuation = DEFAULT_PUNCTUATION + self.clitic_markers = DEFAULT_CLITIC_MARKERS + self.compound_markers = DEFAULT_COMPOUND_MARKERS + self.brackets = DEFAULT_BRACKETS + if strip_diacritics is not None: + self.strip_diacritics = strip_diacritics + if digraphs is not None: + self.digraphs = digraphs + if punctuation is not None: + self.punctuation = punctuation + if clitic_markers is not None: + self.clitic_markers = clitic_markers + if compound_markers is not None: + self.compound_markers = compound_markers + if brackets is not None: + self.brackets = brackets + + self.multilingual_ipa = multilingual_ipa + self.num_silence_states = num_silence_states + self.num_non_silence_states = 
num_non_silence_states + self.shared_silence_phones = shared_silence_phones + self.silence_probability = silence_probability + self.oov_word = oov_word + self.silence_word = silence_word + self.position_dependent_phones = position_dependent_phones + self.optional_silence_phone = optional_silence_phone + self.nonoptional_silence_phone = nonoptional_silence_phone + self.oov_phone = oov_phone + self.other_noise_phone = other_noise_phone + self.debug = debug + self.non_silence_phones: Set[str] = set() + self.max_disambiguation_symbol = 0 + self.disambiguation_symbols = set() + self.clitic_set: Set[str] = set() + + @property + def silence_phones(self): + return { + self.oov_phone, + self.optional_silence_phone, + self.nonoptional_silence_phone, + self.other_noise_phone, + } + + @property + def specials_set(self): + return {self.oov_word, self.silence_word, "", "", ""} + + def update(self, data: dict) -> None: + for k, v in data.items(): + if not hasattr(self, k): + continue + if k == "phones": + continue + if k in ["punctuation", "clitic_markers", "compound_markers"]: + if not v: + continue + if "-" in v: + v = "-" + v.replace("-", "") + if "]" in v and r"\]" not in v: + v = v.replace("]", r"\]") + print(k, v) + setattr(self, k, v) + + @property + def phone_mapping(self) -> Dict[str, int]: + phone_mapping = {} + i = 0 + phone_mapping[""] = i + if self.position_dependent_phones: + for p in self.positional_silence_phones: + i += 1 + phone_mapping[p] = i + for p in self.positional_non_silence_phones: + i += 1 + phone_mapping[p] = i + else: + for p in sorted(self.silence_phones): + i += 1 + phone_mapping[p] = i + for p in sorted(self.non_silence_phones): + i += 1 + phone_mapping[p] = i + i = max(phone_mapping.values()) + for x in range(self.max_disambiguation_symbol + 2): + p = f"#{x}" + self.disambiguation_symbols.add(p) + i += 1 + phone_mapping[p] = i + return phone_mapping + + @property + def positional_silence_phones(self) -> List[str]: + """ + List of silence phones 
with positions + """ + silence_phones = [] + for p in sorted(self.silence_phones): + silence_phones.append(p) + for pos in self.positions: + silence_phones.append(p + pos) + return silence_phones + + @property + def positional_non_silence_phones(self) -> List[str]: + """ + List of non-silence phones with positions + """ + non_silence_phones = [] + for p in sorted(self.non_silence_phones): + for pos in self.positions: + non_silence_phones.append(p + pos) + return non_silence_phones + + @property + def kaldi_silence_phones(self): + if self.position_dependent_phones: + return self.positional_silence_phones + return sorted(self.silence_phones) + + @property + def kaldi_non_silence_phones(self): + if self.position_dependent_phones: + return self.positional_non_silence_phones + return sorted(self.non_silence_phones) + + @property + def optional_silence_csl(self) -> str: + """ + Phone id of the optional silence phone + """ + return str(self.phone_mapping[self.optional_silence_phone]) + + @property + def silence_csl(self) -> str: + """ + A colon-separated list (as a string) of silence phone ids + """ + return ":".join(map(str, (self.phone_mapping[x] for x in self.kaldi_silence_phones))) + + @property + def phones(self) -> set: + """ + The set of all phones (silence and non-silence) + """ + return self.silence_phones | self.non_silence_phones + + def check_bracketed(self, word: str) -> bool: + """ + Checks whether a given string is surrounded by brackets. 
+ + Parameters + ---------- + word : str + Text to check for final brackets + + Returns + ------- + bool + True if the word is fully bracketed, false otherwise + """ + for b in self.brackets: + if word.startswith(b[0]) and word.endswith(b[-1]): + return True + return False + + def sanitize(self, item: str) -> str: + """ + Sanitize an item according to punctuation and clitic markers + + Parameters + ---------- + item: str + Word to sanitize + + Returns + ------- + str + Sanitized form + """ + for c in self.clitic_markers: + item = item.replace(c, self.clitic_markers[0]) + if not item: + return item + if self.check_bracketed(item): + return item + sanitized = re.sub(rf"^[{''.join(self.punctuation)}]+", "", item) + sanitized = re.sub(rf"[{''.join(self.punctuation)}]+$", "", sanitized) + + return sanitized + + def parse_ipa(self, transcription: List[str]) -> Tuple[str, ...]: + """ + Parse a transcription in a multilingual IPA format (strips out diacritics and splits digraphs). + + Parameters + ---------- + transcription: List[str] + Transcription to parse + + Returns + ------- + Tuple[str, ...] 
+ Parsed transcription + """ + new_transcription = [] + for t in transcription: + new_t = t + for d in self.strip_diacritics: + new_t = new_t.replace(d, "") + if "g" in new_t: + new_t = new_t.replace("g", "ɡ") + + found = False + for digraph in self.digraphs: + if re.match(rf"^{digraph}$", new_t): + found = True + if found: + new_transcription.extend(new_t) + continue + new_transcription.append(new_t) + return tuple(new_transcription) diff --git a/montreal_forced_aligner/config/g2p_config.py b/montreal_forced_aligner/config/g2p_config.py index d01d324c..fe4d95ca 100644 --- a/montreal_forced_aligner/config/g2p_config.py +++ b/montreal_forced_aligner/config/g2p_config.py @@ -1,15 +1,12 @@ """Class definitions for configuring G2P generation""" from __future__ import annotations +from typing import Tuple + import yaml -from .base_config import ( - DEFAULT_CLITIC_MARKERS, - DEFAULT_COMPOUND_MARKERS, - DEFAULT_PUNCTUATION, - BaseConfig, - ConfigError, -) +from .base_config import BaseConfig +from .dictionary_config import DictionaryConfig __all__ = ["G2PConfig", "g2p_yaml_to_config", "load_basic_g2p_config"] @@ -21,9 +18,6 @@ class G2PConfig(BaseConfig): """ def __init__(self): - self.punctuation = DEFAULT_PUNCTUATION - self.clitic_markers = DEFAULT_CLITIC_MARKERS - self.compound_markers = DEFAULT_COMPOUND_MARKERS self.num_pronunciations = 1 self.use_mp = True @@ -38,11 +32,11 @@ def update(self, data: dict) -> None: if "]" in v and r"\]" not in v: v = v.replace("]", r"\]") elif not hasattr(self, k): - raise ConfigError("No field found for key {}".format(k)) + continue setattr(self, k, v) -def g2p_yaml_to_config(path: str) -> G2PConfig: +def g2p_yaml_to_config(path: str) -> Tuple[G2PConfig, DictionaryConfig]: """ Helper function to load G2P configurations @@ -53,9 +47,12 @@ def g2p_yaml_to_config(path: str) -> G2PConfig: Returns ------- - :class:`~montreal_forced_aligner.config.g2p_config.G2PConfig` + :class:`~montreal_forced_aligner.config.G2PConfig` G2P configuration + 
:class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ + dictionary_config = DictionaryConfig() with open(path, "r", encoding="utf8") as f: data = yaml.load(f, Loader=yaml.SafeLoader) global_params = {} @@ -63,16 +60,19 @@ def g2p_yaml_to_config(path: str) -> G2PConfig: global_params[k] = v g2p_config = G2PConfig() g2p_config.update(global_params) - return g2p_config + dictionary_config.update(global_params) + return g2p_config, dictionary_config -def load_basic_g2p_config() -> G2PConfig: +def load_basic_g2p_config() -> Tuple[G2PConfig, DictionaryConfig]: """ Helper function to load the default parameters Returns ------- - :class:`~montreal_forced_aligner.config.g2p_config.G2PConfig` + :class:`~montreal_forced_aligner.config.G2PConfig` Default G2P configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ - return G2PConfig() + return G2PConfig(), DictionaryConfig() diff --git a/montreal_forced_aligner/config/segmentation_config.py b/montreal_forced_aligner/config/segmentation_config.py index ccd7dd49..698abc4a 100644 --- a/montreal_forced_aligner/config/segmentation_config.py +++ b/montreal_forced_aligner/config/segmentation_config.py @@ -5,7 +5,7 @@ import yaml -from .base_config import BaseConfig, ConfigError +from .base_config import BaseConfig from .feature_config import FeatureConfig __all__ = ["SegmentationConfig", "segmentation_yaml_to_config", "load_basic_segmentation"] @@ -33,7 +33,7 @@ def update(self, data: dict) -> None: if k == "use_mp": self.feature_config.use_mp = v if not hasattr(self, k): - raise ConfigError("No field found for key {}".format(k)) + continue setattr(self, k, v) @property diff --git a/montreal_forced_aligner/config/speaker_classification_config.py b/montreal_forced_aligner/config/speaker_classification_config.py index a82bd08f..4014734c 100644 --- a/montreal_forced_aligner/config/speaker_classification_config.py +++ 
b/montreal_forced_aligner/config/speaker_classification_config.py @@ -42,7 +42,7 @@ def classification_yaml_to_config(path: str) -> SpeakerClassificationConfig: Returns ------- - :class:`~montreal_forced_aligner.config.speaker_classification_config.SpeakerClassificationConfig` + :class:`~montreal_forced_aligner.config.SpeakerClassificationConfig` Speaker classification configuration """ with open(path, "r", encoding="utf8") as f: @@ -59,7 +59,7 @@ def load_basic_classification() -> SpeakerClassificationConfig: Returns ------- - :class:`~montreal_forced_aligner.config.speaker_classification_config.SpeakerClassificationConfig` + :class:`~montreal_forced_aligner.config.SpeakerClassificationConfig` Default speaker classification configuration """ base_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/montreal_forced_aligner/config/train_config.py b/montreal_forced_aligner/config/train_config.py index 13256cc6..9af857f1 100644 --- a/montreal_forced_aligner/config/train_config.py +++ b/montreal_forced_aligner/config/train_config.py @@ -7,6 +7,7 @@ import yaml +from ..exceptions import ConfigError from ..trainers import ( BaseTrainer, IvectorExtractorTrainer, @@ -16,14 +17,8 @@ TriphoneTrainer, ) from .align_config import AlignConfig -from .base_config import ( - DEFAULT_CLITIC_MARKERS, - DEFAULT_COMPOUND_MARKERS, - DEFAULT_PUNCTUATION, - PARSING_KEYS, - BaseConfig, - ConfigError, -) +from .base_config import BaseConfig +from .dictionary_config import DictionaryConfig from .feature_config import FeatureConfig __all__ = [ @@ -54,10 +49,6 @@ def __init__(self, training_configs): curs[t.train_type] += 1 self.training_identifiers.append(i) - self.punctuation = DEFAULT_PUNCTUATION - self.clitic_markers = DEFAULT_CLITIC_MARKERS - self.compound_markers = DEFAULT_COMPOUND_MARKERS - def update_from_align(self, align_config: AlignConfig) -> None: """Update parameters from an AlignConfig""" for tc in self.training_configs: @@ -67,15 +58,8 @@ def update_from_align(self, 
align_config: AlignConfig) -> None: def update(self, data: dict) -> None: """Update parameters""" for k, v in data.items(): - if k in PARSING_KEYS: - if not v: - continue - if "-" in v: - v = "-" + v.replace("-", "") - if "]" in v and r"\]" not in v: - v = v.replace("]", r"\]") if not hasattr(self, k): - raise ConfigError("No field found for key {}".format(k)) + continue setattr(self, k, v) for trainer in self.values(): trainer.update(data) @@ -109,7 +93,7 @@ def uses_sat(self) -> bool: def train_yaml_to_config( path: str, require_mono: bool = True -) -> Tuple[TrainingConfig, AlignConfig]: +) -> Tuple[TrainingConfig, AlignConfig, DictionaryConfig]: """ Helper function to load acoustic model training configurations @@ -120,11 +104,14 @@ def train_yaml_to_config( Returns ------- - :class:`~montreal_forced_aligner.config.train_config.TrainingConfig` + :class:`~montreal_forced_aligner.config.TrainingConfig` Training configuration - :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + :class:`~montreal_forced_aligner.config.AlignConfig` Alignment configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ + dictionary_config = DictionaryConfig() with open(path, "r", encoding="utf8") as f: data = yaml.load(f, Loader=yaml.SafeLoader) global_params = {} @@ -155,6 +142,7 @@ def train_yaml_to_config( feature_config.update(global_feature_params) align_config = AlignConfig(feature_config) align_config.update(global_params) + dictionary_config.update(global_params) training_config = None if training: for i, t in enumerate(training): @@ -167,40 +155,44 @@ def train_yaml_to_config( align_config.feature_config.fmllr = training_config.uses_sat if align_config.beam >= align_config.retry_beam: raise ConfigError("Retry beam must be greater than beam.") - return training_config, align_config + return training_config, align_config, dictionary_config -def load_basic_train() -> Tuple[TrainingConfig, AlignConfig]: +def 
load_basic_train() -> Tuple[TrainingConfig, AlignConfig, DictionaryConfig]: """ Helper function to load the default parameters Returns ------- - :class:`~montreal_forced_aligner.config.train_config.TrainingConfig` + :class:`~montreal_forced_aligner.config.TrainingConfig` Training configuration - :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + :class:`~montreal_forced_aligner.config.AlignConfig` Alignment configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ base_dir = os.path.dirname(os.path.abspath(__file__)) - training_config, align_config = train_yaml_to_config( + training_config, align_config, dictionary_config = train_yaml_to_config( os.path.join(base_dir, "basic_train.yaml") ) - return training_config, align_config + return training_config, align_config, dictionary_config -def load_sat_adapt() -> Tuple[TrainingConfig, AlignConfig]: +def load_sat_adapt() -> Tuple[TrainingConfig, AlignConfig, DictionaryConfig]: """ Helper function to load the default speaker adaptation parameters for adapting an acoustic model to new data Returns ------- - :class:`~montreal_forced_aligner.config.train_config.TrainingConfig` + :class:`~montreal_forced_aligner.config.TrainingConfig` Training configuration - :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + :class:`~montreal_forced_aligner.config.AlignConfig` Alignment configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ base_dir = os.path.dirname(os.path.abspath(__file__)) - training_config, align_config = train_yaml_to_config( + training_config, align_config, dictionary_config = train_yaml_to_config( os.path.join(base_dir, "adapt_sat.yaml"), require_mono=False ) training_config.training_configs[0].fmllr_iterations = range( @@ -209,61 +201,67 @@ def load_sat_adapt() -> Tuple[TrainingConfig, AlignConfig]: training_config.training_configs[0].realignment_iterations = range( 0, 
training_config.training_configs[0].num_iterations ) - return training_config, align_config + return training_config, align_config, dictionary_config -def load_no_sat_adapt() -> Tuple[TrainingConfig, AlignConfig]: +def load_no_sat_adapt() -> Tuple[TrainingConfig, AlignConfig, DictionaryConfig]: """ Helper function to load the default parameters for adapting an acoustic model to new data without speaker adaptation Returns ------- - :class:`~montreal_forced_aligner.config.train_config.TrainingConfig` + :class:`~montreal_forced_aligner.config.TrainingConfig` Training configuration - :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + :class:`~montreal_forced_aligner.config.AlignConfig` Alignment configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ base_dir = os.path.dirname(os.path.abspath(__file__)) - training_config, align_config = train_yaml_to_config( + training_config, align_config, dictionary_config = train_yaml_to_config( os.path.join(base_dir, "adapt_nosat.yaml"), require_mono=False ) training_config.training_configs[0].realignment_iterations = range( 0, training_config.training_configs[0].num_iterations ) - return training_config, align_config + return training_config, align_config, dictionary_config -def load_basic_train_ivector() -> Tuple[TrainingConfig, AlignConfig]: +def load_basic_train_ivector() -> Tuple[TrainingConfig, AlignConfig, DictionaryConfig]: """ Helper function to load the default parameters for training ivector extractors Returns ------- - :class:`~montreal_forced_aligner.config.train_config.TrainingConfig` + :class:`~montreal_forced_aligner.config.TrainingConfig` Training configuration - :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + :class:`~montreal_forced_aligner.config.AlignConfig` Alignment configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ base_dir = os.path.dirname(os.path.abspath(__file__)) - 
training_config, align_config = train_yaml_to_config( + training_config, align_config, dictionary_config = train_yaml_to_config( os.path.join(base_dir, "basic_train_ivector.yaml") ) - return training_config, align_config + return training_config, align_config, dictionary_config -def load_test_config() -> Tuple[TrainingConfig, AlignConfig]: +def load_test_config() -> Tuple[TrainingConfig, AlignConfig, DictionaryConfig]: """ Helper function to load the default parameters for validating corpora Returns ------- - :class:`~montreal_forced_aligner.config.train_config.TrainingConfig` + :class:`~montreal_forced_aligner.config.TrainingConfig` Training configuration - :class:`~montreal_forced_aligner.config.align_config.AlignConfig` + :class:`~montreal_forced_aligner.config.AlignConfig` Alignment configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ base_dir = os.path.dirname(os.path.abspath(__file__)) - training_config, align_config = train_yaml_to_config( + training_config, align_config, dictionary_config = train_yaml_to_config( os.path.join(base_dir, "test_config.yaml") ) - return training_config, align_config + return training_config, align_config, dictionary_config diff --git a/montreal_forced_aligner/config/train_g2p_config.py b/montreal_forced_aligner/config/train_g2p_config.py index 3e118039..0e8a366e 100644 --- a/montreal_forced_aligner/config/train_g2p_config.py +++ b/montreal_forced_aligner/config/train_g2p_config.py @@ -1,15 +1,12 @@ """Class definitions for configuring G2P model training""" from __future__ import annotations +from typing import Tuple + import yaml -from .base_config import ( - DEFAULT_CLITIC_MARKERS, - DEFAULT_COMPOUND_MARKERS, - DEFAULT_PUNCTUATION, - BaseConfig, - ConfigError, -) +from .base_config import BaseConfig +from .dictionary_config import DictionaryConfig __all__ = ["TrainG2PConfig", "train_g2p_yaml_to_config", "load_basic_train_g2p_config"] @@ -20,9 +17,6 @@ class 
TrainG2PConfig(BaseConfig): """ def __init__(self): - self.punctuation = DEFAULT_PUNCTUATION - self.clitic_markers = DEFAULT_CLITIC_MARKERS - self.compound_markers = DEFAULT_COMPOUND_MARKERS self.num_pronunciations = 1 self.order = 7 self.random_starts = 25 @@ -36,22 +30,8 @@ def __init__(self): self.model_size = 1000000 self.use_mp = True - def update(self, data: dict) -> None: - """Update configuration parameters""" - for k, v in data.items(): - if k in ["punctuation", "clitic_markers", "compound_markers"]: - if not v: - continue - if "-" in v: - v = "-" + v.replace("-", "") - if "]" in v and r"\]" not in v: - v = v.replace("]", r"\]") - elif not hasattr(self, k): - raise ConfigError("No field found for key {}".format(k)) - setattr(self, k, v) - -def train_g2p_yaml_to_config(path: str) -> TrainG2PConfig: +def train_g2p_yaml_to_config(path: str) -> Tuple[TrainG2PConfig, DictionaryConfig]: """ Helper function to load G2P training configurations @@ -62,9 +42,12 @@ def train_g2p_yaml_to_config(path: str) -> TrainG2PConfig: Returns ------- - :class:`~montreal_forced_aligner.config.train_g2p_config.TrainG2PConfig` + :class:`~montreal_forced_aligner.config.TrainG2PConfig` G2P training configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ + dictionary_config = DictionaryConfig() with open(path, "r", encoding="utf8") as f: data = yaml.load(f, Loader=yaml.SafeLoader) global_params = {} @@ -72,16 +55,19 @@ def train_g2p_yaml_to_config(path: str) -> TrainG2PConfig: global_params[k] = v g2p_config = TrainG2PConfig() g2p_config.update(global_params) - return g2p_config + dictionary_config.update(global_params) + return g2p_config, dictionary_config -def load_basic_train_g2p_config() -> TrainG2PConfig: +def load_basic_train_g2p_config() -> Tuple[TrainG2PConfig, DictionaryConfig]: """ Helper function to load the default parameters Returns ------- - :class:`~montreal_forced_aligner.config.train_g2p_config.TrainG2PConfig` + 
:class:`~montreal_forced_aligner.config.TrainG2PConfig` Default G2P training configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ - return TrainG2PConfig() + return TrainG2PConfig(), DictionaryConfig() diff --git a/montreal_forced_aligner/config/train_lm_config.py b/montreal_forced_aligner/config/train_lm_config.py index 9dcab417..802b706a 100644 --- a/montreal_forced_aligner/config/train_lm_config.py +++ b/montreal_forced_aligner/config/train_lm_config.py @@ -2,10 +2,12 @@ from __future__ import annotations import os +from typing import Tuple import yaml from .base_config import BaseConfig +from .dictionary_config import DictionaryConfig __all__ = ["TrainLMConfig", "train_lm_yaml_to_config", "load_basic_train_lm"] @@ -35,7 +37,7 @@ def __init__(self): self.use_mp = True -def train_lm_yaml_to_config(path: str) -> TrainLMConfig: +def train_lm_yaml_to_config(path: str) -> Tuple[TrainLMConfig, DictionaryConfig]: """ Helper function to load language model training configurations @@ -46,25 +48,33 @@ def train_lm_yaml_to_config(path: str) -> TrainLMConfig: Returns ------- - :class:`~montreal_forced_aligner.config.train_lm_config.TrainLMConfig` + :class:`~montreal_forced_aligner.config.TrainLMConfig` Language model training configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ + dictionary_config = DictionaryConfig() with open(path, "r", encoding="utf8") as f: data = yaml.load(f, Loader=yaml.SafeLoader) config = TrainLMConfig() config.update(data) - return config + dictionary_config.update(data) + return config, dictionary_config -def load_basic_train_lm() -> TrainLMConfig: +def load_basic_train_lm() -> Tuple[TrainLMConfig, DictionaryConfig]: """ Helper function to load the default parameters Returns ------- - :class:`~montreal_forced_aligner.config.train_lm_config.TrainLMConfig` + :class:`~montreal_forced_aligner.config.TrainLMConfig` Default language model training 
configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ base_dir = os.path.dirname(os.path.abspath(__file__)) - training_config = train_lm_yaml_to_config(os.path.join(base_dir, "basic_train_lm.yaml")) - return training_config + training_config, dictionary_config = train_lm_yaml_to_config( + os.path.join(base_dir, "basic_train_lm.yaml") + ) + return training_config, dictionary_config diff --git a/montreal_forced_aligner/config/transcribe_config.py b/montreal_forced_aligner/config/transcribe_config.py index fdad417d..f41ae700 100644 --- a/montreal_forced_aligner/config/transcribe_config.py +++ b/montreal_forced_aligner/config/transcribe_config.py @@ -2,23 +2,16 @@ from __future__ import annotations import os -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Tuple import yaml -from .base_config import ( - DEFAULT_CLITIC_MARKERS, - DEFAULT_COMPOUND_MARKERS, - DEFAULT_DIGRAPHS, - DEFAULT_PUNCTUATION, - DEFAULT_STRIP_DIACRITICS, - BaseConfig, - ConfigError, -) +from .base_config import BaseConfig +from .dictionary_config import DictionaryConfig from .feature_config import FeatureConfig if TYPE_CHECKING: - from ..config import ConfigDict + from ..abc import MetaDict __all__ = ["TranscribeConfig", "transcribe_yaml_to_config", "load_basic_transcribe"] @@ -29,7 +22,7 @@ class TranscribeConfig(BaseConfig): Parameters ---------- - feature_config: :class:`~montreal_forced_aligner.config.feature.FeatureConfig` + feature_config: :class:`~montreal_forced_aligner.config.FeatureConfig` Feature configuration to use in transcription Attributes @@ -64,16 +57,10 @@ def __init__(self, feature_config: FeatureConfig): self.data_directory = None # Gets set later self.use_mp = True self.use_fmllr_mp = False - self.multilingual_ipa = False self.ignore_speakers = False - self.punctuation = DEFAULT_PUNCTUATION - self.clitic_markers = DEFAULT_CLITIC_MARKERS - self.compound_markers = DEFAULT_COMPOUND_MARKERS - 
self.strip_diacritics = DEFAULT_STRIP_DIACRITICS - self.digraphs = DEFAULT_DIGRAPHS self.overwrite = False - def params(self) -> ConfigDict: + def params(self) -> MetaDict: """Metadata parameters for the configuration""" return { "transition_scale": self.transition_scale, @@ -93,7 +80,7 @@ def params(self) -> ConfigDict: } @property - def decode_options(self) -> ConfigDict: + def decode_options(self) -> MetaDict: """Options needed for decoding""" return { "fmllr": self.fmllr, @@ -107,7 +94,7 @@ def decode_options(self) -> ConfigDict: } @property - def score_options(self) -> ConfigDict: + def score_options(self) -> MetaDict: """Options needed for scoring lattices""" return { "language_model_weight": self.language_model_weight, @@ -115,7 +102,7 @@ def score_options(self) -> ConfigDict: } @property - def fmllr_options(self) -> ConfigDict: + def fmllr_options(self) -> MetaDict: """Options needed for calculating fMLLR transformations""" return { "fmllr_update_type": self.fmllr_update_type, @@ -125,7 +112,7 @@ def fmllr_options(self) -> ConfigDict: } @property - def lm_rescore_options(self) -> ConfigDict: + def lm_rescore_options(self) -> MetaDict: """Options needed for rescoring the language model""" return { "acoustic_scale": self.acoustic_scale, @@ -137,11 +124,11 @@ def update(self, data: dict) -> None: if k == "use_mp": self.feature_config.use_mp = v if not hasattr(self, k): - raise ConfigError("No field found for key {}".format(k)) + continue setattr(self, k, v) -def transcribe_yaml_to_config(path: str) -> TranscribeConfig: +def transcribe_yaml_to_config(path: str) -> Tuple[TranscribeConfig, DictionaryConfig]: """ Helper function to load transcription configurations @@ -152,9 +139,12 @@ def transcribe_yaml_to_config(path: str) -> TranscribeConfig: Returns ------- - :class:`~montreal_forced_aligner.config.transcribe_config.TranscribeConfig` + :class:`~montreal_forced_aligner.config.TranscribeConfig` Transcription configuration + 
:class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ + dictionary_config = DictionaryConfig() with open(path, "r", encoding="utf8") as f: data = yaml.load(f, Loader=yaml.SafeLoader) global_params = {} @@ -166,18 +156,23 @@ def transcribe_yaml_to_config(path: str) -> TranscribeConfig: global_params[k] = v config = TranscribeConfig(feature_config) config.update(global_params) - return config + dictionary_config.update(global_params) + return config, dictionary_config -def load_basic_transcribe() -> TranscribeConfig: +def load_basic_transcribe() -> Tuple[TranscribeConfig, DictionaryConfig]: """ Helper function to load the default parameters Returns ------- - :class:`~montreal_forced_aligner.config.transcribe_config.TranscribeConfig` + :class:`~montreal_forced_aligner.config.TranscribeConfig` Default transcription configuration + :class:`~montreal_forced_aligner.config.DictionaryConfig` + Dictionary configuration """ base_dir = os.path.dirname(os.path.abspath(__file__)) - config = transcribe_yaml_to_config(os.path.join(base_dir, "basic_transcribe.yaml")) - return config + config, dictionary_config = transcribe_yaml_to_config( + os.path.join(base_dir, "basic_transcribe.yaml") + ) + return config, dictionary_config diff --git a/montreal_forced_aligner/corpus/__init__.py b/montreal_forced_aligner/corpus/__init__.py index 3a3d952c..50cb731c 100644 --- a/montreal_forced_aligner/corpus/__init__.py +++ b/montreal_forced_aligner/corpus/__init__.py @@ -1,20 +1,17 @@ -"""Class definitions for corpora""" -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union +""" +Corpora +======= -from .base import Corpus # noqa -if TYPE_CHECKING: +""" +from __future__ import annotations - SegmentsType = Dict[str, Dict[str, Union[str, float, int]]] - OneToOneMappingType = Dict[str, str] - OneToManyMappingType = Dict[str, List[str]] +from .base import Corpus # noqa +from .classes import File, Speaker, Utterance - 
CorpusMappingType = Union[OneToOneMappingType, OneToManyMappingType] - ScpType = Union[List[Tuple[str, str]], List[Tuple[str, List[Any]]]] - CorpusGroupedOneToOne = List[List[Tuple[str, str]]] - CorpusGroupedOneToMany = List[List[Tuple[str, List[Any]]]] - CorpusGroupedType = Union[CorpusGroupedOneToMany, CorpusGroupedOneToOne] +__all__ = ["Corpus", "Speaker", "Utterance", "File", "base", "helper", "classes"] -__all__ = ["Corpus"] +Corpus.__module__ = "montreal_forced_aligner.corpus" +Speaker.__module__ = "montreal_forced_aligner.corpus" +Utterance.__module__ = "montreal_forced_aligner.corpus" +File.__module__ = "montreal_forced_aligner.corpus" diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py index 8c1f60eb..cebac349 100644 --- a/montreal_forced_aligner/corpus/base.py +++ b/montreal_forced_aligner/corpus/base.py @@ -15,9 +15,10 @@ import yaml from ..config import FeatureConfig +from ..config.dictionary_config import DictionaryConfig from ..exceptions import CorpusError, KaldiProcessingError, TextGridParseError, TextParseError from ..helper import output_mapping -from ..multiprocessing.classes import Job +from ..multiprocessing import Job from ..multiprocessing.corpus import CorpusProcessWorker from ..multiprocessing.features import calc_cmvn, compute_vad, mfcc from ..multiprocessing.helper import Stopped @@ -28,7 +29,7 @@ if TYPE_CHECKING: from logging import Logger - from ..dictionary import DictionaryType + from ..dictionary import MultispeakerDictionary __all__ = ["Corpus"] @@ -80,22 +81,20 @@ def __init__( self, directory: str, output_directory: str, + dictionary_config: Optional[DictionaryConfig] = None, speaker_characters: Union[int, str] = 0, num_jobs: int = 3, sample_rate: int = 16000, debug: bool = False, logger: Optional[Logger] = None, use_mp: bool = True, - punctuation: str = None, - clitic_markers: str = None, audio_directory: Optional[str] = None, skip_load: bool = False, parse_text_only_files: bool = False, 
ignore_speakers: bool = False, ): self.audio_directory = audio_directory - self.punctuation = punctuation - self.clitic_markers = clitic_markers + self.dictionary_config = dictionary_config self.debug = debug self.use_mp = use_mp log_dir = os.path.join(output_directory, "logging") @@ -163,16 +162,12 @@ def __init__( if not self.skip_load: self.load() - def normalized_text_iter( - self, dictionary: Optional[DictionaryType] = None, min_count: int = 1 - ) -> Generator: + def normalized_text_iter(self, min_count: int = 1) -> Generator: """ Construct an iterator over the normalized texts in the corpus Parameters ---------- - dictionary: :class:`~montreal_forced_aligner.dictionary.Dictionary` - Dictionary to use for normalization min_count: int Minimum word count to include in the output, otherwise will use OOV code, defaults to 1 @@ -186,9 +181,9 @@ def normalized_text_iter( text = u.text.split() new_text = [] for t in text: - if dictionary is not None: - dictionary.to_int(t) - lookup = dictionary.split_clitics(t) + if u.speaker.dictionary is not None: + u.speaker.dictionary.to_int(t) + lookup = u.speaker.dictionary.split_clitics(t) if lookup is None: continue else: @@ -196,7 +191,9 @@ def normalized_text_iter( for item in lookup: if item in unk_words: new_text.append("") - elif dictionary is not None and item not in dictionary.words: + elif ( + u.speaker.dictionary is not None and item not in u.speaker.dictionary.words + ): new_text.append("") else: new_text.append(item) @@ -252,7 +249,7 @@ def create_subset(self, subset: Optional[int]) -> None: ) larger_subset = utts[:larger_subset_num] else: - larger_subset = self.utterances.values() + larger_subset = sorted(self.utterances.values()) random.seed(1234) # make it deterministic sampling subset_utts = set(random.sample(larger_subset, subset)) log_dir = os.path.join(subset_directory, "log") @@ -445,8 +442,7 @@ def _load_from_source_mp(self) -> None: relative_path, self.speaker_characters, self.sample_rate, - 
self.punctuation, - self.clitic_markers, + self.dictionary_config, ) ) @@ -576,8 +572,7 @@ def _load_from_source(self) -> None: relative_path, self.speaker_characters, self.sample_rate, - self.punctuation, - self.clitic_markers, + self.dictionary_config, ) self.add_file(file) except TextParseError as e: @@ -609,7 +604,7 @@ def add_file(self, file: File) -> None: Parameters ---------- - file: :class:`~montreal_forced_aligner.corpus.classes.File` + file: :class:`~montreal_forced_aligner.corpus.File` File to be added """ self.files[file.name] = file @@ -623,25 +618,20 @@ def add_file(self, file: File) -> None: if u.text: self.word_counts.update(u.text.split()) - def get_word_frequency(self, dictionary: DictionaryType) -> Dict[str, float]: + def get_word_frequency(self) -> Dict[str, float]: """ Calculate the word frequency across all the texts in the corpus - Parameters - ---------- - dictionary: :class:`~montreal_forced_aligner.dictionary.Dictionary` - Dictionary to use for looking up subwords - Returns ------- Dict[str, float] - Dictionary of words and their relative frequencies + PronunciationDictionary of words and their relative frequencies """ word_counts = Counter() for u in self.utterances.values(): text = u.text speaker = u.speaker - d = dictionary.get_dictionary(speaker) + d = speaker.dictionary new_text = [] text = text.split() for t in text: @@ -664,7 +654,7 @@ def add_utterance(self, utterance: Utterance) -> None: Parameters ---------- - utterance: :class:`~montreal_forced_aligner.corpus.classes.Utterance` + utterance: :class:`~montreal_forced_aligner.corpus.Utterance` Utterance to add """ self.utterances[utterance.name] = utterance @@ -679,7 +669,7 @@ def delete_utterance(self, utterance: Union[str, Utterance]) -> None: Parameters ---------- - utterance: :class:`~montreal_forced_aligner.corpus.classes.Utterance` + utterance: :class:`~montreal_forced_aligner.corpus.Utterance` Utterance to delete """ if isinstance(utterance, str): @@ -704,7 +694,7 @@ def 
initialize_jobs(self) -> None: def initialize_corpus( self, - dictionary: Optional[DictionaryType] = None, + dictionary: Optional[MultispeakerDictionary] = None, feature_config: Optional[FeatureConfig] = None, ) -> None: """ @@ -712,9 +702,9 @@ def initialize_corpus( Parameters ---------- - dictionary: :class:`~montreal_forced_aligner.dictionary.Dictionary`, optional - Dictionary to use - feature_config: :class:`~montreal_forced_aligner.config.feature.FeatureConfig`, optional + dictionary: :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary`, optional + PronunciationDictionary to use + feature_config: :class:`~montreal_forced_aligner.config.FeatureConfig`, optional Feature configuration to use """ if not self.files: diff --git a/montreal_forced_aligner/corpus/classes.py b/montreal_forced_aligner/corpus/classes.py index 1ff3640d..f82fbea9 100644 --- a/montreal_forced_aligner/corpus/classes.py +++ b/montreal_forced_aligner/corpus/classes.py @@ -13,8 +13,8 @@ from .helper import get_wav_info, load_text, parse_transcription if TYPE_CHECKING: - from ..aligner.base import BaseAligner from ..config.align_config import AlignConfig + from ..config.dictionary_config import DictionaryConfig from ..dictionary import Dictionary, DictionaryData from ..textgrid import CtmType from ..trainers import BaseTrainer, LdaTrainer, SatTrainer @@ -23,10 +23,6 @@ FmllrConfigType = Union[SatTrainer, AlignConfig] LdaConfigType = Union[LdaTrainer, AlignConfig] - IterationType = Union[str, int] - - AlignerType = Union[BaseTrainer, BaseAligner] - __all__ = ["parse_file", "File", "Speaker", "Utterance"] @@ -38,8 +34,7 @@ def parse_file( relative_path: str, speaker_characters: Union[int, str], sample_rate: int = 16000, - punctuation: Optional[str] = None, - clitic_markers: Optional[str] = None, + dictionary_config: Optional[DictionaryConfig] = None, stop_check: Optional[Callable] = None, ) -> File: """ @@ -68,7 +63,7 @@ def parse_file( Returns ------- - 
:class:`~montreal_forced_aligner.corpus.classes.File` + :class:`~montreal_forced_aligner.corpus.File` Parsed file """ file = File(wav_path, text_path, relative_path=relative_path) @@ -90,8 +85,7 @@ def parse_file( root_speaker = Speaker(speaker_name) file.load_text( root_speaker=root_speaker, - punctuation=punctuation, - clitic_markers=clitic_markers, + dictionary_config=dictionary_config, stop_check=stop_check, ) return file @@ -108,19 +102,19 @@ class Speaker: Attributes ---------- - utterances: Dict[str, :class:`~montreal_forced_aligner.corpus.classes.Utterance`] + utterances: Dict[str, :class:`~montreal_forced_aligner.corpus.Utterance`] Utterances that the speaker is associated with cmvn: str, optional String pointing to any CMVN that has been calculated for this speaker - dictionary: Dictionary, optional - Dictionary that the speaker is associated with + dictionary: :class:`~montreal_forced_aligner.dictionary.PronunciationDictionary`, optional + Pronunciation dictionary that the speaker is associated with dictionary_data: DictionaryData, optional Dictionary data from the speaker's dictionary """ def __init__(self, name): self.name = name - self.utterances: Dict[str, Utterance] = {} + self.utterances = {} self.cmvn = None self.dictionary: Optional[Dictionary] = None self.dictionary_data: Optional[DictionaryData] = None @@ -193,7 +187,7 @@ def add_utterance(self, utterance: Utterance): Parameters ---------- - utterance: :class:`~montreal_forced_aligner.corpus.classes.Utterance` + utterance: :class:`~montreal_forced_aligner.corpus.Utterance` Utterance """ utterance.speaker = self @@ -205,7 +199,7 @@ def delete_utterance(self, utterance: Utterance): Parameters ---------- - utterance: :class:`~montreal_forced_aligner.corpus.classes.Utterance` + utterance: :class:`~montreal_forced_aligner.corpus.Utterance` Utterance to be deleted """ identifier = utterance.name @@ -218,7 +212,7 @@ def merge(self, speaker: Speaker): Parameters ---------- - speaker: 
:class:`~montreal_forced_aligner.corpus.classes.Speaker` + speaker: :class:`~montreal_forced_aligner.corpus.Speaker` Other speaker to take utterances from """ for u in speaker.utterances.values(): @@ -246,14 +240,14 @@ def set_dictionary(self, dictionary: Dictionary) -> None: Parameters ---------- - dictionary: :class:`~montreal_forced_aligner.dictionary.Dictionary` - Dictionary to associate with the speaker + dictionary: :class:`~montreal_forced_aligner.dictionary.PronunciationDictionary` + Pronunciation dictionary to associate with the speaker """ self.dictionary = dictionary self.dictionary_data = dictionary.data(self.word_set()) @property - def files(self) -> Set[File]: + def files(self) -> Set["File"]: """Files that the speaker is associated with""" files = set() for u in self.utterances.values(): @@ -490,8 +484,7 @@ def construct_output_path( def load_text( self, root_speaker: Optional[Speaker] = None, - punctuation: Optional[str] = None, - clitic_markers: Optional[str] = None, + dictionary_config: Optional[DictionaryConfig] = None, stop_check: Optional[Callable] = None, ) -> None: """ @@ -499,7 +492,7 @@ def load_text( Parameters ---------- - root_speaker: :class:`~montreal_forced_aligner.corpus.classes.Speaker`, optional + root_speaker: :class:`~montreal_forced_aligner.corpus.Speaker`, optional Speaker derived from the root directory, ignored for TextGrids punctuation: str Orthographic characters to treat as punctuation @@ -513,7 +506,7 @@ def load_text( text = load_text(self.text_path) except UnicodeDecodeError: raise TextParseError(self.text_path) - words = parse_transcription(text, punctuation, clitic_markers) + words = parse_transcription(text, dictionary_config) utterance = Utterance(speaker=root_speaker, file=self, text=" ".join(words)) self.add_utterance(utterance) elif self.text_type == "textgrid": @@ -547,7 +540,7 @@ def load_text( if stop_check is not None and stop_check(): return text = text.lower().strip() - words = parse_transcription(text, 
punctuation, clitic_markers) + words = parse_transcription(text, dictionary_config) if not words: continue begin, end = round(begin, 4), round(end, 4) @@ -566,7 +559,7 @@ def add_speaker(self, speaker: Speaker) -> None: Parameters ---------- - speaker: :class:`~montreal_forced_aligner.corpus.classes.Speaker` + speaker: :class:`~montreal_forced_aligner.corpus.Speaker` Speaker to add """ if speaker not in self.speaker_ordering: @@ -578,7 +571,7 @@ def add_utterance(self, utterance: Utterance) -> None: Parameters ---------- - utterance: :class:`~montreal_forced_aligner.corpus.classes.Utterance` + utterance: :class:`~montreal_forced_aligner.corpus.Utterance` Utterance to add """ utterance.file = self @@ -591,7 +584,7 @@ def delete_utterance(self, utterance: Utterance) -> None: Parameters ---------- - utterance: :class:`~montreal_forced_aligner.corpus.classes.Utterance` + utterance: :class:`~montreal_forced_aligner.corpus.Utterance` Utterance to remove """ identifier = utterance.name @@ -657,7 +650,7 @@ class Utterance: Parameters ---------- - speaker: :class:`~montreal_forced_aligner.corpus.classes.Speaker` + speaker: :class:`~montreal_forced_aligner.corpus.Speaker` Speaker of the utterance file: File File that the utterance belongs to @@ -774,7 +767,7 @@ def __eq__(self, other) -> bool: Parameters ---------- - other: :class:`~montreal_forced_aligner.corpus.classes.Utterance` or str + other: :class:`~montreal_forced_aligner.corpus.Utterance` or str Utterance to compare against Returns @@ -799,7 +792,7 @@ def __lt__(self, other) -> bool: Parameters ---------- - other: :class:`~montreal_forced_aligner.corpus.classes.Utterance` or str + other: :class:`~montreal_forced_aligner.corpus.Utterance` or str Utterance to compare against Returns @@ -823,7 +816,7 @@ def __lte__(self, other) -> bool: Parameters ---------- - other: :class:`~montreal_forced_aligner.corpus.classes.Utterance` or str + other: :class:`~montreal_forced_aligner.corpus.Utterance` or str Utterance to compare 
against Returns @@ -847,7 +840,7 @@ def __gt__(self, other) -> bool: Parameters ---------- - other: :class:`~montreal_forced_aligner.corpus.classes.Utterance` or str + other: :class:`~montreal_forced_aligner.corpus.Utterance` or str Utterance to compare against Returns @@ -872,7 +865,7 @@ def __gte__(self, other) -> bool: Parameters ---------- - other: :class:`~montreal_forced_aligner.corpus.classes.Utterance` or str + other: :class:`~montreal_forced_aligner.corpus.Utterance` or str Utterance to compare against Returns @@ -922,7 +915,7 @@ def set_speaker(self, speaker: Speaker): Parameters ---------- - speaker: :class:`~montreal_forced_aligner.corpus.classes.Speaker` + speaker: :class:`~montreal_forced_aligner.corpus.Speaker` New speaker """ self.speaker = speaker diff --git a/montreal_forced_aligner/corpus/helper.py b/montreal_forced_aligner/corpus/helper.py index 645a697e..ed3941d3 100644 --- a/montreal_forced_aligner/corpus/helper.py +++ b/montreal_forced_aligner/corpus/helper.py @@ -4,11 +4,13 @@ import os import shutil import subprocess -from typing import Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + +if TYPE_CHECKING: + from ..config.dictionary_config import DictionaryConfig import soundfile -from ..dictionary import sanitize from ..exceptions import SoxError SoundFileInfoDict = Dict[str, Union[int, float, str]] @@ -38,7 +40,7 @@ def load_text(path: str) -> str: def parse_transcription( - text: str, punctuation: Optional[str] = None, clitic_markers: Optional[str] = None + text: str, dictionary_config: Optional[DictionaryConfig] = None ) -> List[str]: """ Parse an orthographic transcription given punctuation and clitic markers @@ -47,18 +49,25 @@ def parse_transcription( ---------- text: str Orthographic text to parse - punctuation: str + dictionary_config: Optional[DictionaryConfig] Characters to treat as punctuation - clitic_markers: str - Characters that mark clitics Returns ------- List Parsed 
orthographic transcript """ - words = [sanitize(x, punctuation, clitic_markers) for x in text.split()] - words = [x for x in words if x not in ["", "-", "'"]] + if dictionary_config is not None: + words = [dictionary_config.sanitize(x) for x in text.split()] + words = [ + x + for x in words + if x + and x not in dictionary_config.clitic_markers + and x not in dictionary_config.compound_markers + ] + else: + words = text.split() return words diff --git a/montreal_forced_aligner/data.py b/montreal_forced_aligner/data.py new file mode 100644 index 00000000..80f6380c --- /dev/null +++ b/montreal_forced_aligner/data.py @@ -0,0 +1,61 @@ +""" +Data classes +============ + +""" +from dataclasses import dataclass +from typing import List + +from praatio.utilities.constants import Interval + +__all__ = ["CtmInterval"] + + +@dataclass +class CtmInterval: + """ + Data class for intervals derived from CTM files + + Attributes + ---------- + begin: float + Start time of interval + end: float + End time of interval + label: str + Text of interval + utterance: str + Utterance ID that the interval belongs to + """ + + begin: float + end: float + label: str + utterance: str + + def shift_times(self, offset: float): + """ + Shift times of the interval based on some offset (i.e., segments in Kaldi) + + Parameters + ---------- + offset: float + Offset to add to the interval's begin and end + + """ + self.begin += offset + self.end += offset + + def to_tg_interval(self) -> Interval: + """ + Converts the CTMInterval to `PraatIO's Interval class `_ + + Returns + ------- + :class:`~praatio.utilities.constants.Interval` + Derived PraatIO Interval + """ + return Interval(self.begin, self.end, self.label) + + +CtmType = List[CtmInterval] diff --git a/montreal_forced_aligner/dictionary.py b/montreal_forced_aligner/dictionary.py deleted file mode 100644 index 5683c5e5..00000000 --- a/montreal_forced_aligner/dictionary.py +++ /dev/null @@ -1,1867 +0,0 @@ -"""Pronunciation dictionaries for use in 
alignment and transcription""" - -from __future__ import annotations - -import logging -import math -import os -import re -import subprocess -import sys -from collections import Counter, defaultdict -from typing import ( - TYPE_CHECKING, - Any, - Collection, - Dict, - List, - NamedTuple, - Optional, - Set, - Tuple, - Union, -) - -import yaml - -from .config.base_config import ( - DEFAULT_CLITIC_MARKERS, - DEFAULT_COMPOUND_MARKERS, - DEFAULT_DIGRAPHS, - DEFAULT_PUNCTUATION, - DEFAULT_STRIP_DIACRITICS, -) -from .exceptions import DictionaryError, DictionaryFileError, DictionaryPathError -from .utils import get_available_dictionaries, get_dictionary_path, thirdparty_binary - -if TYPE_CHECKING: - IpaType = Optional[List[str]] - PunctuationType = Optional[str] - from logging import Logger - - from .corpus.classes import Speaker - -DictionaryEntryType = List[Dict[str, Union[Tuple[str], float, None, int]]] -ReversedMappingType = Dict[int, str] -WordsType = Dict[str, DictionaryEntryType] -MappingType = Dict[str, int] -MultiSpeakerMappingType = Dict[str, str] - -__all__ = [ - "compile_graphemes", - "sanitize", - "check_format", - "check_bracketed", - "parse_ipa", - "DictionaryData", - "Dictionary", - "MultispeakerDictionary", -] - - -def compile_graphemes(graphemes: Set[str]) -> re.Pattern: - """ - Compiles the list of graphemes into a regular expression pattern. - - Parameters - ---------- - graphemes: Set[str] - Set of characters to treat as orthographic text - - Returns - ------- - re.Pattern - Compiled pattern that matches all graphemes - """ - base = r"^\W*([{}]+)\W*" - string = re.escape("".join(graphemes)) - try: - return re.compile(base.format(string)) - except Exception: - print(graphemes) - raise - - -def check_bracketed(word: str, brackets: Optional[List[Tuple[str, str]]] = None) -> bool: - """ - Checks whether a given string is surrounded by brackets. 
- - Parameters - ---------- - word : str - Text to check for final brackets - brackets: List[Tuple[str, str]]], optional - Brackets to check, defaults to [('[', ']'), ('{', '}'), ('<', '>'), ('(', ')')] - - Returns - ------- - bool - True if the word is fully bracketed, false otherwise - """ - if brackets is None: - brackets = [("[", "]"), ("{", "}"), ("<", ">"), ("(", ")")] - for b in brackets: - if word.startswith(b[0]) and word.endswith(b[-1]): - return True - return False - - -def sanitize( - item: str, punctuation: Optional[str] = None, clitic_markers: Optional[str] = None -) -> str: - """ - Sanitize an item according to punctuation and clitic markers - - Parameters - ---------- - item: str - Word to sanitize - punctuation: str - Characters to treat as punctuation - clitic_markers: str - Characters to treat as clitic markers, will be collapsed to the first marker - - Returns - ------- - str - Sanitized form - """ - if punctuation is None: - punctuation = DEFAULT_PUNCTUATION - if clitic_markers is None: - clitic_markers = DEFAULT_CLITIC_MARKERS - for c in clitic_markers: - item = item.replace(c, clitic_markers[0]) - if not item: - return item - if check_bracketed(item): - return item - sanitized = re.sub(rf"^[{punctuation}]+", "", item) - sanitized = re.sub(rf"[{punctuation}]+$", "", sanitized) - - return sanitized - - -def check_format(path: str) -> Tuple[bool, bool]: - """ - Check the pronunciation dictionary format - - Parameters - ---------- - path: str - Path of pronunciation dictionary - - Returns - ------- - bool - Flag for whether the dictionary has pronunciation probabilities - bool - Flag for whether the dictionary includes silence probabilities - """ - count = 0 - pronunciation_probabilities = True - silence_probabilities = True - with open(path, "r", encoding="utf8") as f: - for line in f: - line = line.strip() - if not line: - continue - line = line.split() - _ = line.pop(0) # word - next_item = line.pop(0) - if pronunciation_probabilities: - try: 
- prob = float(next_item) - if prob > 1 or prob < 0: - raise ValueError - except ValueError: - pronunciation_probabilities = False - try: - next_item = line.pop(0) - except IndexError: - silence_probabilities = False - if silence_probabilities: - try: - prob = float(next_item) - if prob > 1 or prob < 0: - raise ValueError - except ValueError: - silence_probabilities = False - count += 1 - if count > 10: - break - return pronunciation_probabilities, silence_probabilities - - -def parse_ipa( - transcription: List[str], strip_diacritics: IpaType = None, digraphs: IpaType = None -) -> Tuple[str, ...]: - """ - Parse a transcription in a multilingual IPA format (strips out diacritics and splits digraphs). - - Parameters - ---------- - transcription: List[str] - Transcription to parse - strip_diacritics: List[str] - List of diacritics to remove from characters in the transcription - digraphs: List[str] - List of digraphs to split up into separate characters - - Returns - ------- - Tuple[str, ...] 
- Parsed transcription - """ - if strip_diacritics is None: - strip_diacritics = DEFAULT_STRIP_DIACRITICS - if digraphs is None: - digraphs = DEFAULT_DIGRAPHS - new_transcription = [] - for t in transcription: - new_t = t - for d in strip_diacritics: - new_t = new_t.replace(d, "") - if "g" in new_t: - new_t = new_t.replace("g", "ɡ") - - found = False - for digraph in digraphs: - if re.match(r"^{}$".format(digraph), new_t): - found = True - if found: - new_transcription.extend(new_t) - continue - new_transcription.append(new_t) - return tuple(new_transcription) - - -class DictionaryData(NamedTuple): - """ - Information required for parsing Kaldi-internal ids to text - """ - - silences: Set[str] - multilingual_ipa: bool - words_mapping: MappingType - reversed_words_mapping: ReversedMappingType - reversed_phone_mapping: ReversedMappingType - punctuation: PunctuationType - clitic_set: Set[str] - clitic_markers: PunctuationType - compound_markers: PunctuationType - strip_diacritics: IpaType - oov_int: int - oov_code: str - words: WordsType - - -class Dictionary: - """ - Class containing information about a pronunciation dictionary - - Parameters - ---------- - input_path : str - Path to an input pronunciation dictionary - output_directory : str - Path to a directory to store files for Kaldi - oov_code : str, optional - What to label words not in the dictionary, defaults to ``''`` - position_dependent_phones : bool, optional - Specifies whether phones should be represented as dependent on their - position in the word (beginning, middle or end), defaults to True - num_sil_states : int, optional - Number of states to use for silence phones, defaults to 5 - num_nonsil_states : int, optional - Number of states to use for non-silence phones, defaults to 3 - shared_silence_phones : bool, optional - Specify whether to share states across all silence phones, defaults - to True - sil_prob : float, optional - Probability of optional silences following words, defaults to 0.5 - 
word_set : Collection[str], optional - Word set to limit output files - debug: bool, optional - Flag for whether to perform debug steps and prevent intermediate cleanup - logger: :class:`~logging.Logger`, optional - Logger to output information to - punctuation: str, optional - Punctuation to use when parsing text - clitic_markers: str, optional - Clitic markers to use when parsing text - compound_markers: str, optional - Compound markers to use when parsing text - multilingual_ipa: bool, optional - Flag for multilingual IPA mode, defaults to False - strip_diacritics: List[str], optional - Diacritics to strip in multilingual IPA mode - digraphs: List[str], optional - Digraphs to split up in multilingual IPA mode - """ - - topo_template = " {cur_state} {cur_state} {cur_state} 0.75 {next_state} 0.25 " - topo_sil_template = " {cur_state} {cur_state} {transitions} " - topo_transition_template = " {} {}" - positions: List[str] = ["_B", "_E", "_I", "_S"] - has_multiple = False - - def __init__( - self, - input_path: str, - output_directory: str, - oov_code: str = "", - position_dependent_phones: bool = True, - num_sil_states: int = 5, - num_nonsil_states: int = 3, - shared_silence_phones: bool = True, - sil_prob: float = 0.5, - word_set: Optional[Collection[str]] = None, - debug: bool = False, - logger: Optional[logging.Logger] = None, - punctuation: PunctuationType = None, - clitic_markers: PunctuationType = None, - compound_markers: PunctuationType = None, - multilingual_ipa: bool = False, - strip_diacritics: IpaType = None, - digraphs: IpaType = None, - ): - self.multilingual_ipa = multilingual_ipa - self.strip_diacritics = DEFAULT_STRIP_DIACRITICS - self.digraphs = DEFAULT_DIGRAPHS - if strip_diacritics is not None: - self.strip_diacritics = strip_diacritics - if digraphs is not None: - self.digraphs = digraphs - self.punctuation = DEFAULT_PUNCTUATION - self.clitic_markers = DEFAULT_CLITIC_MARKERS - self.compound_markers = DEFAULT_COMPOUND_MARKERS - if punctuation is 
not None: - self.punctuation = punctuation - if clitic_markers is not None: - self.clitic_markers = clitic_markers - if compound_markers is not None: - self.compound_markers = compound_markers - - if not os.path.exists(input_path): - raise (DictionaryPathError(input_path)) - if not os.path.isfile(input_path): - raise (DictionaryFileError(input_path)) - self.input_path = input_path - self.name = os.path.splitext(os.path.basename(input_path))[0] - self.debug = debug - self.output_directory = os.path.join(output_directory, self.name) - os.makedirs(self.output_directory, exist_ok=True) - self.log_file = os.path.join(self.output_directory, f"{self.name}.log") - if logger is None: - self.logger = logging.getLogger("dictionary_setup") - self.logger.setLevel(logging.INFO) - handler = logging.FileHandler(self.log_file, "w", "utf-8") - handler.setFormatter = logging.Formatter("%(name)s %(message)s") - self.logger.addHandler(handler) - self.individual_logger = True - else: - self.logger = logger - self.individual_logger = False - self.num_sil_states = num_sil_states - self.num_nonsil_states = num_nonsil_states - self.shared_silence_phones = shared_silence_phones - self.sil_prob = sil_prob - self.oov_code = oov_code - self.sil_code = "!sil" - self.oovs_found = Counter() - self.position_dependent_phones = position_dependent_phones - - self.words = {} - self.nonsil_phones: Set[str] = set() - self.sil_phones = {"sp", "spn", "sil"} - self.optional_silence = "sp" - self.nonoptional_silence = "sil" - self.graphemes = set() - self.max_disambiguation_symbol = 0 - self.disambiguation_symbols = set() - self.all_words = defaultdict(list) - self.clitic_set = set() - self.specials_set = {self.oov_code, self.sil_code, "", "", ""} - self.words[self.sil_code] = [{"pronunciation": ("sp",), "probability": 1}] - self.words[self.oov_code] = [{"pronunciation": ("spn",), "probability": 1}] - self.pronunciation_probabilities, self.silence_probabilities = check_format(input_path) - progress = 
f'Parsing dictionary "{self.name}"' - if self.pronunciation_probabilities: - progress += " with pronunciation probabilities" - else: - progress += " without pronunciation probabilities" - if self.silence_probabilities: - progress += " with silence probabilities" - else: - progress += " without silence probabilities" - self.logger.info(progress) - with open(input_path, "r", encoding="utf8") as inf: - for i, line in enumerate(inf): - line = line.strip() - if not line: - continue - line = line.split() - word = sanitize(line.pop(0).lower(), self.punctuation, self.clitic_markers) - if not line: - raise DictionaryError( - f"Line {i} of {input_path} does not have a pronunciation." - ) - if word in ["!sil", oov_code]: - continue - self.graphemes.update(word) - prob = None - if self.pronunciation_probabilities: - prob = float(line.pop(0)) - if prob > 1 or prob < 0: - raise ValueError - if self.silence_probabilities: - right_sil_prob = float(line.pop(0)) - left_sil_prob = float(line.pop(0)) - left_nonsil_prob = float(line.pop(0)) - else: - right_sil_prob = None - left_sil_prob = None - left_nonsil_prob = None - if self.multilingual_ipa: - pron = parse_ipa(line, self.strip_diacritics, self.digraphs) - else: - pron = tuple(line) - pronunciation = { - "pronunciation": pron, - "probability": prob, - "disambiguation": None, - "right_sil_prob": right_sil_prob, - "left_sil_prob": left_sil_prob, - "left_nonsil_prob": left_nonsil_prob, - } - if self.multilingual_ipa: - pronunciation["original_pronunciation"] = tuple(line) - if not any(x in self.sil_phones for x in pron): - self.nonsil_phones.update(pron) - if word in self.words and pron in {x["pronunciation"] for x in self.words[word]}: - continue - if word not in self.words: - self.words[word] = [] - self.words[word].append(pronunciation) - # test whether a word is a clitic - is_clitic = False - for cm in self.clitic_markers: - if word.startswith(cm) or word.endswith(cm): - is_clitic = True - if is_clitic: - 
self.clitic_set.add(word) - if word_set is not None: - word_set = {y for x in word_set for y in self._lookup(x)} - word_set.add("!sil") - word_set.add(self.oov_code) - self.word_set = word_set - if self.word_set is not None: - self.word_set = self.word_set | self.clitic_set - if not self.graphemes: - raise DictionaryFileError(f"No words were found in the dictionary path {input_path}") - self.word_pattern = compile_graphemes(self.graphemes) - self.log_info() - self.phone_mapping = {} - self.words_mapping = {} - - def __hash__(self) -> Any: - """Return the hash of a given dictionary""" - return hash(self.input_path) - - @property - def output_paths(self) -> Dict[str, str]: - """ - Mapping of output directory for this dictionary - """ - return {self.name: self.output_directory} - - @property - def silences(self) -> Set[str]: - """ - Set of symbols that correspond to silence - """ - return {self.optional_silence, self.nonoptional_silence} - - def get_dictionary(self, speaker: Union[Speaker, str]) -> Dictionary: - """ - Wrapper function to return this dictionary for any arbitrary speaker - - Parameters - ---------- - speaker: Union[Speaker, str] - Speaker to look up dictionary for - - Returns - ------- - Dictionary - This dictionary - """ - return self - - def data(self, word_set: Optional[Collection[str]] = None) -> DictionaryData: - """ - Generates a dictionary data for use in parsing utilities - - Parameters - ---------- - word_set: Collection[str], optional - Word set to limit data to - - Returns - ------- - DictionaryData - Data necessary for parsing text - """ - - def word_check(word): - """Check whether a word should be included in the output""" - if word in word_set: - return True - if word in self.clitic_set: - return True - if word in self.specials_set: - return True - return False - - if word_set: - words_mapping = {k: v for k, v in self.words_mapping.items() if word_check(k)} - reversed_word_mapping = { - k: v for k, v in self.reversed_word_mapping.items() 
if word_check(v) - } - words = {k: v for k, v in self.words.items() if word_check(k)} - else: - words_mapping = self.words_mapping - reversed_word_mapping = self.reversed_word_mapping - words = self.words - return DictionaryData( - self.silences, - self.multilingual_ipa, - words_mapping, - reversed_word_mapping, - self.reversed_phone_mapping, - self.punctuation, - self.clitic_set, - self.clitic_markers, - self.compound_markers, - self.strip_diacritics, - self.oov_int, - self.oov_code, - words, - ) - - def cleanup_logger(self) -> None: - """ - Clean up and detach logger from handles - """ - if not self.individual_logger: - return - handlers = self.logger.handlers[:] - for handler in handlers: - handler.close() - self.logger.removeHandler(handler) - - def log_info(self) -> None: - """ - Dump debugging information to the logger - """ - self.logger.debug(f'"{self.name}" DICTIONARY INFORMATION') - if self.pronunciation_probabilities: - self.logger.debug("Has pronunciation probabilities") - else: - self.logger.debug("Has NO pronunciation probabilities") - if self.silence_probabilities: - self.logger.debug("Has silence probabilities") - else: - self.logger.debug("Has NO silence probabilities") - - self.logger.debug(f"Grapheme set: {', '.join(sorted(self.graphemes))}") - self.logger.debug(f"Phone set: {', '.join(sorted(self.nonsil_phones))}") - self.logger.debug(f"Punctuation: {self.punctuation}") - self.logger.debug(f"Clitic markers: {self.clitic_markers}") - self.logger.debug(f"Clitic set: {', '.join(sorted(self.clitic_set))}") - if self.multilingual_ipa: - self.logger.debug(f"Strip diacritics: {', '.join(sorted(self.strip_diacritics))}") - self.logger.debug(f"Digraphs: {', '.join(sorted(self.digraphs))}") - - def set_word_set(self, word_set: List[str]) -> None: - """ - Limit output to a subset of overall words - - Parameters - ---------- - word_set: List[str] - Word set to limit generated files to - """ - word_set = {y for x in word_set for y in self._lookup(x)} - 
word_set.add(self.sil_code) - word_set.add(self.oov_code) - self.word_set = word_set | self.clitic_set - self.generate_mappings() - - @property - def actual_words(self) -> Dict[str, DictionaryEntryType]: - """ - Mapping of words to integer IDs without Kaldi-internal words - """ - return { - k: v - for k, v in self.words.items() - if k not in [self.sil_code, self.oov_code, ""] and len(v) - } - - def split_clitics(self, item: str) -> List[str]: - """ - Split a word into subwords based on clitic and compound markers - - Parameters - ---------- - item: str - Word to split up - - Returns - ------- - List[str] - List of subwords - """ - if item in self.words: - return [item] - if any(x in item for x in self.compound_markers): - s = re.split(rf"[{self.compound_markers}]", item) - if any(x in item for x in self.clitic_markers): - new_s = [] - for seg in s: - if any(x in seg for x in self.clitic_markers): - new_s.extend(self.split_clitics(seg)) - else: - new_s.append(seg) - s = new_s - return s - if any( - x in item and not item.endswith(x) and not item.startswith(x) - for x in self.clitic_markers - ): - initial, final = re.split(rf"[{self.clitic_markers}]", item, maxsplit=1) - if any(x in final for x in self.clitic_markers): - final = self.split_clitics(final) - else: - final = [final] - for clitic in self.clitic_markers: - if initial + clitic in self.clitic_set: - return [initial + clitic] + final - elif clitic + final[0] in self.clitic_set: - final[0] = clitic + final[0] - return [initial] + final - return [item] - - def __len__(self) -> int: - """Return the number of pronunciations across all words""" - return sum(len(x) for x in self.words.values()) - - def exclude_for_alignment(self, word: str) -> bool: - """ - Check for whether to exclude a word from alignment lexicons (if there is a word set in the dictionary, - checks whether the given string is in the word set) - - Parameters - ---------- - word: str - Word to check - - Returns - ------- - bool - True if there is 
no word set on the dictionary, or if the word is in the given word set - """ - if self.word_set is None: - return False - if word not in self.word_set and word not in self.clitic_set: - return True - return False - - def generate_mappings(self) -> None: - """ - Generate phone and word mappings from text to integer IDs - """ - if self.phone_mapping: - return - self.phone_mapping = {} - i = 0 - self.phone_mapping[""] = i - if self.position_dependent_phones: - for p in self.positional_sil_phones: - i += 1 - self.phone_mapping[p] = i - for p in self.positional_nonsil_phones: - i += 1 - self.phone_mapping[p] = i - else: - for p in sorted(self.sil_phones): - i += 1 - self.phone_mapping[p] = i - for p in sorted(self.nonsil_phones): - i += 1 - self.phone_mapping[p] = i - - self.words_mapping = {} - i = 0 - self.words_mapping[""] = i - for w in sorted(self.words.keys()): - if self.exclude_for_alignment(w): - continue - i += 1 - self.words_mapping[w] = i - - self.words_mapping["#0"] = i + 1 - self.words_mapping[""] = i + 2 - self.words_mapping[""] = i + 3 - self.oovs_found = Counter() - self.add_disambiguation() - - def add_disambiguation(self) -> None: - """ - Calculate disambiguation symbols for each pronunciation - """ - subsequences = set() - pronunciation_counts = defaultdict(int) - - for w, prons in self.words.items(): - if self.exclude_for_alignment(w): - continue - for p in prons: - pronunciation_counts[p["pronunciation"]] += 1 - pron = p["pronunciation"][:-1] - while pron: - subsequences.add(tuple(p)) - pron = pron[:-1] - last_used = defaultdict(int) - for w, prons in sorted(self.words.items()): - if self.exclude_for_alignment(w): - continue - for p in prons: - if ( - pronunciation_counts[p["pronunciation"]] == 1 - and not p["pronunciation"] in subsequences - ): - disambig = None - else: - pron = p["pronunciation"] - last_used[pron] += 1 - disambig = last_used[pron] - p["disambiguation"] = disambig - if last_used: - self.max_disambiguation_symbol = 
max(last_used.values()) - else: - self.max_disambiguation_symbol = 0 - self.disambiguation_symbols = set() - i = max(self.phone_mapping.values()) - for x in range(self.max_disambiguation_symbol + 2): - p = f"#{x}" - self.disambiguation_symbols.add(p) - i += 1 - self.phone_mapping[p] = i - - def create_utterance_fst(self, text: List[str], frequent_words: List[Tuple[str, int]]) -> str: - """ - Create an FST for an utterance with frequent words as a unigram language model - - Parameters - ---------- - text: List[str] - Text of the utterance - frequent_words: List[Tuple[str, int]] - Frequent words to incorporate into the FST - Returns - ------- - str - FST created from the utterance text and frequent words - """ - num_words = len(text) - word_probs = Counter(text) - word_probs = {k: v / num_words for k, v in word_probs.items()} - word_probs.update(frequent_words) - fst_text = "" - for k, v in word_probs.items(): - cost = -1 * math.log(v) - w = self.to_int(k)[0] - fst_text += f"0 0 {w} {w} {cost}\n" - fst_text += f"0 {-1 * math.log(1 / num_words)}\n" - return fst_text - - def to_int(self, item: str) -> List[int]: - """ - Convert a given word into integer IDs - - Parameters - ---------- - item: str - Word to look up - - Returns - ------- - List[int] - List of integer IDs corresponding to each subword - """ - if item == "": - return [] - sanitized = self._lookup(item) - text_int = [] - for item in sanitized: - if not item: - continue - if item not in self.words_mapping: - self.oovs_found.update([item]) - text_int.append(self.oov_int) - else: - text_int.append(self.words_mapping[item]) - return text_int - - def save_oovs_found(self, directory: str) -> None: - """ - Save all out of vocabulary items to a file in the specified directory - - Parameters - ---------- - directory : str - Path to directory to save ``oovs_found.txt`` - """ - with open(os.path.join(directory, "oovs_found.txt"), "w", encoding="utf8") as f, open( - os.path.join(directory, "oov_counts.txt"), "w", 
encoding="utf8" - ) as cf: - for oov in sorted(self.oovs_found.keys(), key=lambda x: (-self.oovs_found[x], x)): - f.write(oov + "\n") - cf.write(f"{oov}\t{self.oovs_found[oov]}\n") - - def _lookup(self, item: str) -> List[str]: - """ - Look up a word and return the list of sub words if necessary taking into account clitic and compound markers - - Parameters - ---------- - item: str - Word to look up - - Returns - ------- - List[str] - List of subwords that are in the dictionary - """ - if item in self.words: - return [item] - sanitized = sanitize(item, self.punctuation, self.clitic_markers) - if sanitized in self.words: - return [sanitized] - split = self.split_clitics(sanitized) - oov_count = sum(1 for x in split if x not in self.words) - if oov_count < len( - split - ): # Only returned split item if it gains us any transcribed speech - return split - return [sanitized] - - def check_word(self, item: str) -> bool: - """ - Check whether a word is in the dictionary, takes into account sanitization and - clitic and compound markers - - Parameters - ---------- - item: str - Word to check - - Returns - ------- - bool - True if the look up would not result in an OOV item - """ - if item == "": - return False - if item in self.words: - return True - sanitized = sanitize(item, self.punctuation, self.clitic_markers) - if sanitized in self.words: - return True - - sanitized = self.split_clitics(sanitized) - if all(s in self.words for s in sanitized): - return True - return False - - @property - def reversed_word_mapping(self) -> ReversedMappingType: - """ - A mapping of integer ids to words - """ - mapping = {} - for k, v in self.words_mapping.items(): - mapping[v] = k - return mapping - - @property - def reversed_phone_mapping(self) -> ReversedMappingType: - """ - A mapping of integer ids to phones - """ - mapping = {} - for k, v in self.phone_mapping.items(): - mapping[v] = k - return mapping - - @property - def oov_int(self) -> int: - """ - The integer id for out of 
vocabulary items - """ - return self.words_mapping[self.oov_code] - - @property - def positional_sil_phones(self) -> List[str]: - """ - List of silence phones with positions - """ - sil_phones = [] - for p in sorted(self.sil_phones): - sil_phones.append(p) - for pos in self.positions: - sil_phones.append(p + pos) - return sil_phones - - @property - def positional_nonsil_phones(self) -> List[str]: - """ - List of non-silence phones with positions - """ - nonsil_phones = [] - for p in sorted(self.nonsil_phones): - for pos in self.positions: - nonsil_phones.append(p + pos) - return nonsil_phones - - @property - def optional_silence_csl(self) -> str: - """ - Phone id of the optional silence phone - """ - return str(self.phone_mapping[self.optional_silence]) - - @property - def silence_csl(self) -> str: - """ - A colon-separated list (as a string) of silence phone ids - """ - if self.position_dependent_phones: - return ":".join(map(str, (self.phone_mapping[x] for x in self.positional_sil_phones))) - else: - return ":".join(map(str, (self.phone_mapping[x] for x in self.sil_phones))) - - @property - def phones_dir(self) -> str: - """ - Directory to store information Kaldi needs about phones - """ - return os.path.join(self.output_directory, "phones") - - @property - def phones(self) -> set: - """ - The set of all phones (silence and non-silence) - """ - return self.sil_phones | self.nonsil_phones - - @property - def words_symbol_path(self) -> str: - """ - Path of word to int mapping file for the dictionary - """ - return os.path.join(self.output_directory, "words.txt") - - @property - def disambig_path(self) -> str: - """ - Path of disambiguated lexicon fst (L.fst) - """ - return os.path.join(self.output_directory, "L_disambig.fst") - - def write(self, write_disambiguation: Optional[bool] = False) -> None: - """ - Write the files necessary for Kaldi - - Parameters - ---------- - write_disambiguation: bool, optional - Flag for including disambiguation information - """ - 
self.logger.info("Creating dictionary information...") - os.makedirs(self.phones_dir, exist_ok=True) - self.generate_mappings() - self._write_graphemes() - self._write_phone_map_file() - self._write_phone_sets() - self._write_phone_symbol_table() - self._write_disambig() - self._write_topo() - self._write_word_boundaries() - self._write_extra_questions() - self._write_word_file() - self._write_align_lexicon() - self._write_fst_text(write_disambiguation=write_disambiguation) - self._write_fst_binary(write_disambiguation=write_disambiguation) - self.cleanup() - - def cleanup(self) -> None: - """ - Clean up temporary files in the output directory - """ - if not self.debug: - if os.path.exists(os.path.join(self.output_directory, "temp.fst")): - os.remove(os.path.join(self.output_directory, "temp.fst")) - if os.path.exists(os.path.join(self.output_directory, "lexicon.text.fst")): - os.remove(os.path.join(self.output_directory, "lexicon.text.fst")) - - def _write_graphemes(self) -> None: - """ - Write graphemes to temporary directory - """ - outfile = os.path.join(self.output_directory, "graphemes.txt") - if os.path.exists(outfile): - return - with open(outfile, "w", encoding="utf8") as f: - for char in sorted(self.graphemes): - f.write(char + "\n") - - def export_lexicon( - self, - path: str, - write_disambiguation: Optional[bool] = False, - probability: Optional[bool] = False, - ) -> None: - """ - Export pronunciation dictionary to a text file - - Parameters - ---------- - path: str - Path to save dictionary - write_disambiguation: bool, optional - Flag for whether to include disambiguation information - probability: bool, optional - Flag for whether to include probabilities - """ - with open(path, "w", encoding="utf8") as f: - for w in sorted(self.words.keys()): - for p in sorted( - self.words[w], - key=lambda x: (x["pronunciation"], x["probability"], x["disambiguation"]), - ): - phones = " ".join(p["pronunciation"]) - if write_disambiguation and p["disambiguation"] 
is not None: - phones += f" #{p['disambiguation']}" - if probability: - f.write(f"{w}\t{p['probability']}\t{phones}\n") - else: - f.write(f"{w}\t{phones}\n") - - def _write_phone_map_file(self) -> None: - """ - Write the phone map to the temporary directory - """ - outfile = os.path.join(self.output_directory, "phone_map.txt") - if os.path.exists(outfile): - return - with open(outfile, "w", encoding="utf8") as f: - for sp in self.sil_phones: - if self.position_dependent_phones: - new_phones = [sp + x for x in ["", ""] + self.positions] - else: - new_phones = [sp] - f.write(" ".join(new_phones) + "\n") - for nsp in self.nonsil_phones: - if self.position_dependent_phones: - new_phones = [nsp + x for x in [""] + self.positions] - else: - new_phones = [nsp] - f.write(" ".join(new_phones) + "\n") - - def _write_phone_symbol_table(self) -> None: - """ - Write the phone mapping to the temporary directory - """ - outfile = os.path.join(self.output_directory, "phones.txt") - if os.path.exists(outfile): - return - with open(outfile, "w", encoding="utf8") as f: - for p, i in sorted(self.phone_mapping.items(), key=lambda x: x[1]): - f.write(f"{p} {i}\n") - - def _write_word_boundaries(self) -> None: - """ - Write the word boundaries file to the temporary directory - """ - boundary_path = os.path.join(self.output_directory, "phones", "word_boundary.txt") - boundary_int_path = os.path.join(self.output_directory, "phones", "word_boundary.int") - if os.path.exists(boundary_path) and os.path.exists(boundary_int_path): - return - with open(boundary_path, "w", encoding="utf8") as f, open( - boundary_int_path, "w", encoding="utf8" - ) as intf: - if self.position_dependent_phones: - for p in sorted(self.phone_mapping.keys(), key=lambda x: self.phone_mapping[x]): - if p == "" or p.startswith("#"): - continue - cat = "nonword" - if p.endswith("_B"): - cat = "begin" - elif p.endswith("_S"): - cat = "singleton" - elif p.endswith("_I"): - cat = "internal" - elif p.endswith("_E"): - cat = 
"end" - f.write(" ".join([p, cat]) + "\n") - intf.write(" ".join([str(self.phone_mapping[p]), cat]) + "\n") - - def _write_word_file(self) -> None: - """ - Write the word mapping to the temporary directory - """ - words_path = os.path.join(self.output_directory, "words.txt") - if os.path.exists(words_path): - return - if sys.platform == "win32": - newline = "" - else: - newline = None - with open(words_path, "w", encoding="utf8", newline=newline) as f: - for w, i in sorted(self.words_mapping.items(), key=lambda x: x[1]): - f.write(f"{w} {i}\n") - - def _write_align_lexicon(self) -> None: - """ - Write the alignment lexicon text file to the temporary directory - """ - path = os.path.join(self.phones_dir, "align_lexicon.int") - if os.path.exists(path): - return - - with open(path, "w", encoding="utf8") as f: - for w, i in self.words_mapping.items(): - if self.exclude_for_alignment(w): - continue - if w not in self.words: # special characters - continue - for pron in sorted( - self.words[w], - key=lambda x: (x["pronunciation"], x["probability"], x["disambiguation"]), - ): - - phones = list(pron["pronunciation"]) - if self.position_dependent_phones: - if len(phones) == 1: - phones[0] += "_S" - else: - for j in range(len(phones)): - if j == 0: - phones[j] += "_B" - elif j == len(phones) - 1: - phones[j] += "_E" - else: - phones[j] += "_I" - p = " ".join(str(self.phone_mapping[x]) for x in phones) - f.write(f"{i} {i} {p}\n".format(i=i, p=p)) - - def _write_topo(self) -> None: - """ - Write the topo file to the temporary directory - """ - filepath = os.path.join(self.output_directory, "topo") - if os.path.exists(filepath): - return - sil_transp = 1 / (self.num_sil_states - 1) - initial_transition = [ - self.topo_transition_template.format(x, sil_transp) - for x in range(self.num_sil_states - 1) - ] - middle_transition = [ - self.topo_transition_template.format(x, sil_transp) - for x in range(1, self.num_sil_states) - ] - final_transition = [ - 
self.topo_transition_template.format(self.num_sil_states - 1, 0.75), - self.topo_transition_template.format(self.num_sil_states, 0.25), - ] - with open(filepath, "w") as f: - f.write("\n") - f.write("\n") - f.write("\n") - if self.position_dependent_phones: - phones = self.positional_nonsil_phones - else: - phones = sorted(self.nonsil_phones) - f.write(f"{' '.join(str(self.phone_mapping[x]) for x in phones)}\n") - f.write("\n") - states = [ - self.topo_template.format(cur_state=x, next_state=x + 1) - for x in range(self.num_nonsil_states) - ] - f.write("\n".join(states)) - f.write(f"\n {self.num_nonsil_states} \n") - f.write("\n") - - f.write("\n") - f.write("\n") - if self.position_dependent_phones: - phones = self.positional_sil_phones - else: - phones = self.sil_phones - f.write(f"{' '.join(str(self.phone_mapping[x]) for x in phones)}\n") - f.write("\n") - states = [] - for i in range(self.num_sil_states): - if i == 0: - transition = " ".join(initial_transition) - elif i == self.num_sil_states - 1: - transition = " ".join(final_transition) - else: - transition = " ".join(middle_transition) - states.append(self.topo_sil_template.format(cur_state=i, transitions=transition)) - f.write("\n".join(states)) - f.write(f"\n {self.num_sil_states} \n") - f.write("\n") - f.write("\n") - - def _write_phone_sets(self) -> None: - """ - Write phone symbol sets to the temporary directory - """ - sharesplit = ["shared", "split"] - if not self.shared_silence_phones: - sil_sharesplit = ["not-shared", "not-split"] - else: - sil_sharesplit = sharesplit - - sets_file = os.path.join(self.output_directory, "phones", "sets.txt") - roots_file = os.path.join(self.output_directory, "phones", "roots.txt") - - sets_int_file = os.path.join(self.output_directory, "phones", "sets.int") - roots_int_file = os.path.join(self.output_directory, "phones", "roots.int") - if ( - os.path.exists(sets_file) - and os.path.exists(roots_file) - and os.path.exists(sets_int_file) - and 
os.path.exists(roots_int_file) - ): - return - - with open(sets_file, "w", encoding="utf8") as setf, open( - roots_file, "w", encoding="utf8" - ) as rootf, open(sets_int_file, "w", encoding="utf8") as setintf, open( - roots_int_file, "w", encoding="utf8" - ) as rootintf: - - # process silence phones - for i, sp in enumerate(self.sil_phones): - if self.position_dependent_phones: - mapped = [sp + x for x in [""] + self.positions] - else: - mapped = [sp] - setf.write(" ".join(mapped) + "\n") - setintf.write(" ".join(map(str, (self.phone_mapping[x] for x in mapped))) + "\n") - if i == 0: - line = sil_sharesplit + mapped - lineint = sil_sharesplit + [self.phone_mapping[x] for x in mapped] - else: - line = sharesplit + mapped - lineint = sharesplit + [self.phone_mapping[x] for x in mapped] - rootf.write(" ".join(line) + "\n") - rootintf.write(" ".join(map(str, lineint)) + "\n") - - # process nonsilence phones - for nsp in sorted(self.nonsil_phones): - if self.position_dependent_phones: - mapped = [nsp + x for x in self.positions] - else: - mapped = [nsp] - setf.write(" ".join(mapped) + "\n") - setintf.write(" ".join(map(str, (self.phone_mapping[x] for x in mapped))) + "\n") - line = sharesplit + mapped - lineint = sharesplit + [self.phone_mapping[x] for x in mapped] - rootf.write(" ".join(line) + "\n") - rootintf.write(" ".join(map(str, lineint)) + "\n") - - def _write_extra_questions(self) -> None: - """ - Write extra questions symbols to the temporary directory - """ - phone_extra = os.path.join(self.phones_dir, "extra_questions.txt") - phone_extra_int = os.path.join(self.phones_dir, "extra_questions.int") - if os.path.exists(phone_extra) and os.path.exists(phone_extra_int): - return - with open(phone_extra, "w", encoding="utf8") as outf, open( - phone_extra_int, "w", encoding="utf8" - ) as intf: - if self.position_dependent_phones: - sils = sorted(self.positional_sil_phones) - else: - sils = sorted(self.sil_phones) - outf.write(" ".join(sils) + "\n") - intf.write(" 
".join(map(str, (self.phone_mapping[x] for x in sils))) + "\n") - - if self.position_dependent_phones: - nonsils = sorted(self.positional_nonsil_phones) - else: - nonsils = sorted(self.nonsil_phones) - outf.write(" ".join(nonsils) + "\n") - intf.write(" ".join(map(str, (self.phone_mapping[x] for x in nonsils))) + "\n") - if self.position_dependent_phones: - for p in self.positions: - line = [x + p for x in sorted(self.nonsil_phones)] - outf.write(" ".join(line) + "\n") - intf.write(" ".join(map(str, (self.phone_mapping[x] for x in line))) + "\n") - for p in [""] + self.positions: - line = [x + p for x in sorted(self.sil_phones)] - outf.write(" ".join(line) + "\n") - intf.write(" ".join(map(str, (self.phone_mapping[x] for x in line))) + "\n") - - def _write_disambig(self) -> None: - """ - Write disambiguation symbols to the temporary directory - """ - disambig = os.path.join(self.phones_dir, "disambiguation_symbols.txt") - disambig_int = os.path.join(self.phones_dir, "disambiguation_symbols.int") - if os.path.exists(disambig) and os.path.exists(disambig_int): - return - with open(disambig, "w", encoding="utf8") as outf, open( - disambig_int, "w", encoding="utf8" - ) as intf: - for d in sorted(self.disambiguation_symbols, key=lambda x: self.phone_mapping[x]): - outf.write(f"{d}\n") - intf.write(f"{self.phone_mapping[d]}\n") - - def _write_fst_binary(self, write_disambiguation: Optional[bool] = False) -> None: - """ - Write the binary fst file to the temporary directory - - Parameters - ---------- - write_disambiguation: bool, optional - Flag for including disambiguation symbols - """ - if write_disambiguation: - lexicon_fst_path = os.path.join(self.output_directory, "lexicon_disambig.text.fst") - output_fst = os.path.join(self.output_directory, "L_disambig.fst") - else: - lexicon_fst_path = os.path.join(self.output_directory, "lexicon.text.fst") - output_fst = os.path.join(self.output_directory, "L.fst") - if os.path.exists(output_fst): - return - - phones_file_path 
= os.path.join(self.output_directory, "phones.txt") - words_file_path = os.path.join(self.output_directory, "words.txt") - - log_path = os.path.join(self.output_directory, "fst.log") - temp_fst_path = os.path.join(self.output_directory, "temp.fst") - with open(log_path, "w") as log_file: - compile_proc = subprocess.Popen( - [ - thirdparty_binary("fstcompile"), - f"--isymbols={phones_file_path}", - f"--osymbols={words_file_path}", - "--keep_isymbols=false", - "--keep_osymbols=false", - lexicon_fst_path, - temp_fst_path, - ], - stderr=log_file, - ) - compile_proc.communicate() - if write_disambiguation: - temp2_fst_path = os.path.join(self.output_directory, "temp2.fst") - phone_disambig_path = os.path.join(self.output_directory, "phone_disambig.txt") - word_disambig_path = os.path.join(self.output_directory, "word_disambig.txt") - with open(phone_disambig_path, "w") as f: - f.write(str(self.phone_mapping["#0"])) - with open(word_disambig_path, "w") as f: - f.write(str(self.words_mapping["#0"])) - selfloop_proc = subprocess.Popen( - [ - thirdparty_binary("fstaddselfloops"), - phone_disambig_path, - word_disambig_path, - temp_fst_path, - temp2_fst_path, - ], - stderr=log_file, - ) - selfloop_proc.communicate() - arc_sort_proc = subprocess.Popen( - [ - thirdparty_binary("fstarcsort"), - "--sort_type=olabel", - temp2_fst_path, - output_fst, - ], - stderr=log_file, - ) - else: - arc_sort_proc = subprocess.Popen( - [ - thirdparty_binary("fstarcsort"), - "--sort_type=olabel", - temp_fst_path, - output_fst, - ], - stderr=log_file, - ) - arc_sort_proc.communicate() - - def _write_fst_text(self, write_disambiguation: Optional[bool] = False) -> None: - """ - Write the text fst file to the temporary directory - - Parameters - ---------- - write_disambiguation: bool, optional - Flag for including disambiguation symbols - """ - if write_disambiguation: - lexicon_fst_path = os.path.join(self.output_directory, "lexicon_disambig.text.fst") - sildisambig = 
f"#{self.max_disambiguation_symbol + 1}" - else: - lexicon_fst_path = os.path.join(self.output_directory, "lexicon.text.fst") - if os.path.exists(lexicon_fst_path): - return - if self.sil_prob != 0: - silphone = self.optional_silence - nonoptsil = self.nonoptional_silence - - silcost = -1 * math.log(self.sil_prob) - nosilcost = -1 * math.log(1.0 - self.sil_prob) - startstate = 0 - loopstate = 1 - silstate = 2 - else: - loopstate = 0 - nextstate = 1 - - with open(lexicon_fst_path, "w", encoding="utf8") as outf: - if self.sil_prob != 0: - outf.write( - "\t".join(map(str, [startstate, loopstate, "", "", nosilcost])) - + "\n" - ) # no silence - - outf.write( - "\t".join(map(str, [startstate, loopstate, nonoptsil, "", silcost])) - + "\n" - ) # silence - outf.write( - "\t".join(map(str, [silstate, loopstate, silphone, ""])) + "\n" - ) # no cost - nextstate = 3 - if write_disambiguation: - disambigstate = 3 - nextstate = 4 - outf.write( - "\t".join( - map(str, [startstate, disambigstate, silphone, "", silcost]) - ) - + "\n" - ) # silence. - outf.write( - "\t".join(map(str, [silstate, disambigstate, silphone, "", silcost])) - + "\n" - ) # no cost. - outf.write( - "\t".join(map(str, [disambigstate, loopstate, sildisambig, ""])) - + "\n" - ) # silence disambiguation symbol. 
- - for w in sorted(self.words.keys()): - if self.exclude_for_alignment(w): - continue - for pron in sorted( - self.words[w], - key=lambda x: (x["pronunciation"], x["probability"], x["disambiguation"]), - ): - phones = list(pron["pronunciation"]) - prob = pron["probability"] - disambig_symbol = pron["disambiguation"] - if self.position_dependent_phones: - if len(phones) == 1: - phones[0] += "_S" - else: - for i in range(len(phones)): - if i == 0: - phones[i] += "_B" - elif i == len(phones) - 1: - phones[i] += "_E" - else: - phones[i] += "_I" - if not self.pronunciation_probabilities: - pron_cost = 0 - else: - if prob is None: - prob = 1.0 - elif not prob: - prob = 0.001 # Dithering to ensure low probability entries - pron_cost = -1 * math.log(prob) - - pron_cost_string = "" - if pron_cost != 0: - pron_cost_string = f"\t{pron_cost}" - - s = loopstate - word_or_eps = w - local_nosilcost = nosilcost + pron_cost - local_silcost = silcost + pron_cost - while len(phones) > 0: - p = phones.pop(0) - if len(phones) > 0 or ( - write_disambiguation and disambig_symbol is not None - ): - ns = nextstate - nextstate += 1 - outf.write( - "\t".join(map(str, [s, ns, p, word_or_eps])) - + pron_cost_string - + "\n" - ) - word_or_eps = "" - pron_cost_string = "" - s = ns - elif self.sil_prob == 0: - ns = loopstate - outf.write( - "\t".join(map(str, [s, ns, p, word_or_eps])) - + pron_cost_string - + "\n" - ) - word_or_eps = "" - pron_cost_string = "" - s = ns - else: - outf.write( - "\t".join( - map(str, [s, loopstate, p, word_or_eps, local_nosilcost]) - ) - + "\n" - ) - outf.write( - "\t".join(map(str, [s, silstate, p, word_or_eps, local_silcost])) - + "\n" - ) - if write_disambiguation and disambig_symbol is not None: - outf.write( - "\t".join( - map( - str, - [ - s, - loopstate, - f"#{disambig_symbol}", - word_or_eps, - local_nosilcost, - ], - ) - ) - + "\n" - ) - outf.write( - "\t".join( - map( - str, - [ - s, - silstate, - f"#{disambig_symbol}", - word_or_eps, - local_silcost, - 
], - ) - ) - + "\n" - ) - - outf.write(f"{loopstate}\t0\n") - - -class MultispeakerDictionary(Dictionary): - """ - Class containing information about a pronunciation dictionary with different dictionaries per speaker - - Parameters - ---------- - input_path : str - Path to an input pronunciation dictionary - output_directory : str - Path to a directory to store files for Kaldi - oov_code : str, optional - What to label words not in the dictionary, defaults to ``''`` - position_dependent_phones : bool, optional - Specifies whether phones should be represented as dependent on their - position in the word (beginning, middle or end), defaults to True - num_sil_states : int, optional - Number of states to use for silence phones, defaults to 5 - num_nonsil_states : int, optional - Number of states to use for non-silence phones, defaults to 3 - shared_silence_phones : bool, optional - Specify whether to share states across all silence phones, defaults - to True - sil_prob : float, optional - Probability of optional silences following words, defaults to 0.5 - word_set : Collection[str], optional - Word set to limit output files - debug: bool, optional - Flag for whether to perform debug steps and prevent intermediate cleanup - logger: :class:`~logging.Logger`, optional - Logger to output information to - punctuation: str, optional - Punctuation to use when parsing text - clitic_markers: str, optional - Clitic markers to use when parsing text - compound_markers: str, optional - Compound markers to use when parsing text - multilingual_ipa: bool, optional - Flag for multilingual IPA mode, defaults to False - strip_diacritics: List[str], optional - Diacritics to strip in multilingual IPA mode - digraphs: List[str], optional - Digraphs to split up in multilingual IPA mode - """ - - has_multiple = True - - def __init__( - self, - input_path: str, - output_directory: str, - oov_code: Optional[str] = "", - position_dependent_phones: Optional[bool] = True, - num_sil_states: 
Optional[int] = 5, - num_nonsil_states: Optional[int] = 3, - shared_silence_phones: Optional[bool] = True, - sil_prob: Optional[float] = 0.5, - word_set: Optional[List[str]] = None, - debug: Optional[bool] = False, - logger: Optional[Logger] = None, - punctuation: PunctuationType = None, - clitic_markers: PunctuationType = None, - compound_markers: PunctuationType = None, - multilingual_ipa: Optional[bool] = False, - strip_diacritics: IpaType = None, - digraphs: IpaType = None, - ): - self.multilingual_ipa = multilingual_ipa - self.strip_diacritics = DEFAULT_STRIP_DIACRITICS - self.digraphs = DEFAULT_DIGRAPHS - if strip_diacritics is not None: - self.strip_diacritics = strip_diacritics - if digraphs is not None: - self.digraphs = digraphs - self.punctuation = DEFAULT_PUNCTUATION - self.clitic_markers = DEFAULT_CLITIC_MARKERS - self.compound_markers = DEFAULT_COMPOUND_MARKERS - if punctuation is not None: - self.punctuation = punctuation - if clitic_markers is not None: - self.clitic_markers = clitic_markers - if compound_markers is not None: - self.compound_markers = compound_markers - self.input_path = input_path - self.debug = debug - self.output_directory = os.path.join(output_directory, "dictionary") - os.makedirs(self.output_directory, exist_ok=True) - self.log_file = os.path.join(self.output_directory, "dictionary.log") - if logger is None: - self.logger = logging.getLogger("dictionary_setup") - self.logger.setLevel(logging.INFO) - handler = logging.FileHandler(self.log_file, "w", "utf-8") - handler.setFormatter = logging.Formatter("%(name)s %(message)s") - self.logger.addHandler(handler) - else: - self.logger = logger - self.num_sil_states = num_sil_states - self.num_nonsil_states = num_nonsil_states - self.shared_silence_phones = shared_silence_phones - self.sil_prob = sil_prob - self.oov_code = oov_code - self.sil_code = "!sil" - self.oovs_found = Counter() - self.position_dependent_phones = position_dependent_phones - self.max_disambiguation_symbol = 0 - 
self.disambiguation_symbols = set() - self.optional_silence = "sp" - self.nonoptional_silence = "sil" - - if word_set is not None: - word_set = {sanitize(x, self.punctuation, self.clitic_markers) for x in word_set} - word_set.add("!sil") - word_set.add(self.oov_code) - self.word_set = word_set - - if not os.path.exists(input_path): - raise (DictionaryPathError(input_path)) - if not os.path.isfile(input_path): - raise (DictionaryFileError(input_path)) - - self.speaker_mapping = {} - self.dictionary_mapping = {} - self.logger.info("Parsing multispeaker dictionary file") - available_langs = get_available_dictionaries() - with open(input_path, "r", encoding="utf8") as f: - data = yaml.safe_load(f) - for speaker, path in data.items(): - if path in available_langs: - path = get_dictionary_path(path) - dictionary_name = os.path.splitext(os.path.basename(path))[0] - self.speaker_mapping[speaker] = dictionary_name - if dictionary_name not in self.dictionary_mapping: - self.dictionary_mapping[dictionary_name] = Dictionary( - path, - output_directory, - oov_code=self.oov_code, - position_dependent_phones=self.position_dependent_phones, - word_set=self.word_set, - num_sil_states=self.num_sil_states, - num_nonsil_states=self.num_nonsil_states, - shared_silence_phones=self.shared_silence_phones, - sil_prob=self.sil_prob, - debug=self.debug, - logger=self.logger, - punctuation=self.punctuation, - clitic_markers=self.clitic_markers, - compound_markers=self.compound_markers, - multilingual_ipa=self.multilingual_ipa, - strip_diacritics=self.strip_diacritics, - digraphs=self.digraphs, - ) - - self.nonsil_phones = set() - self.sil_phones = {"sp", "spn", "sil"} - self.words = set() - self.clitic_set = set() - for d in self.dictionary_mapping.values(): - self.nonsil_phones.update(d.nonsil_phones) - self.sil_phones.update(d.sil_phones) - self.words.update(d.words) - self.clitic_set.update(d.clitic_set) - self.words_mapping = {} - self.phone_mapping = {} - - @property - def silences(self) 
-> set: - """ - Set of silence phones - """ - return {self.optional_silence, self.nonoptional_silence} - - def get_dictionary_name(self, speaker: Union[str, Speaker]) -> str: - """ - Get the dictionary name for a given speaker - - Parameters - ---------- - speaker: Union[Speaker, str] - Speaker to look up - - Returns - ------- - str - Dictionary name for the speaker - """ - if not isinstance(speaker, str): - speaker = speaker.name - if speaker not in self.speaker_mapping: - return self.speaker_mapping["default"] - return self.speaker_mapping[speaker] - - def get_dictionary(self, speaker: Union[Speaker, str]) -> Dictionary: - """ - Get a dictionary for a given speaker - - Parameters - ---------- - speaker: Union[Speaker, str] - Speaker to look up - - Returns - ------- - Dictionary - Dictionary for the speaker - """ - return self.dictionary_mapping[self.get_dictionary_name(speaker)] - - def generate_mappings(self) -> None: - """ - Generate phone and word mappings from text to integer IDs - """ - self.phone_mapping = {} - i = 0 - self.phone_mapping[""] = i - if self.position_dependent_phones: - for p in self.positional_sil_phones: - i += 1 - self.phone_mapping[p] = i - for p in self.positional_nonsil_phones: - i += 1 - self.phone_mapping[p] = i - else: - for p in sorted(self.sil_phones): - i += 1 - self.phone_mapping[p] = i - for p in sorted(self.nonsil_phones): - i += 1 - self.phone_mapping[p] = i - - self.words_mapping = {} - i = 0 - self.words_mapping[""] = i - for w in sorted(self.words): - if self.exclude_for_alignment(w): - continue - i += 1 - self.words_mapping[w] = i - - self.words_mapping["#0"] = i + 1 - self.words_mapping[""] = i + 2 - self.words_mapping[""] = i + 3 - self.words.update(["", "#0", "", ""]) - self.oovs_found = Counter() - self.max_disambiguation_symbol = 0 - for d in self.dictionary_mapping.values(): - d.generate_mappings() - if d.max_disambiguation_symbol > self.max_disambiguation_symbol: - self.max_disambiguation_symbol = 
d.max_disambiguation_symbol - i = max(self.phone_mapping.values()) - self.disambiguation_symbols = set() - for x in range(self.max_disambiguation_symbol + 2): - p = f"#{x}" - self.disambiguation_symbols.add(p) - i += 1 - self.phone_mapping[p] = i - - def write(self, write_disambiguation: Optional[bool] = False) -> None: - """ - Write all child dictionaries to the temporary directory - - Parameters - ---------- - write_disambiguation: bool, optional - Flag to use disambiguation symbols in the output - """ - os.makedirs(self.phones_dir, exist_ok=True) - self.generate_mappings() - for d in self.dictionary_mapping.values(): - d.phone_mapping = self.phone_mapping - d.write(write_disambiguation) - - @property - def output_paths(self) -> Dict[str, str]: - """ - Mapping of output directory for child dictionaries - """ - return {d.name: d.output_directory for d in self.dictionary_mapping.values()} - - -if TYPE_CHECKING: - DictionaryType = Union[MultispeakerDictionary, Dictionary] diff --git a/montreal_forced_aligner/dictionary/__init__.py b/montreal_forced_aligner/dictionary/__init__.py new file mode 100644 index 00000000..5796eeff --- /dev/null +++ b/montreal_forced_aligner/dictionary/__init__.py @@ -0,0 +1,21 @@ +""" +Pronunciation dictionaries +========================== + +""" + +from .base_dictionary import PronunciationDictionary +from .data import DictionaryData +from .multispeaker import MultispeakerDictionary + +__all__ = [ + "base_dictionary", + "multispeaker", + "data", + "MultispeakerDictionary", + "PronunciationDictionary", + "DictionaryData", +] +MultispeakerDictionary.__module__ = "montreal_forced_aligner.dictionary" +PronunciationDictionary.__module__ = "montreal_forced_aligner.dictionary" +DictionaryData.__module__ = "montreal_forced_aligner.dictionary" diff --git a/montreal_forced_aligner/dictionary/base_dictionary.py b/montreal_forced_aligner/dictionary/base_dictionary.py new file mode 100644 index 00000000..f09f9912 --- /dev/null +++ 
b/montreal_forced_aligner/dictionary/base_dictionary.py @@ -0,0 +1,1118 @@ +"""Pronunciation dictionaries for use in alignment and transcription""" + +from __future__ import annotations + +import logging +import math +import os +import subprocess +import sys +from collections import Counter, defaultdict +from typing import TYPE_CHECKING, Any, Collection, Dict, List, Optional, Set, Tuple, Union + +if TYPE_CHECKING: + from ..abc import ReversedMappingType, DictionaryEntryType + +from ..abc import Dictionary +from ..config.dictionary_config import DictionaryConfig +from ..exceptions import DictionaryError, DictionaryFileError +from ..models import DictionaryModel +from ..utils import thirdparty_binary +from .data import DictionaryData + +__all__ = [ + "PronunciationDictionary", +] + + +class PronunciationDictionary(Dictionary): + """ + Class containing information about a pronunciation dictionary + + Parameters + ---------- + dictionary_model : :class:`~montreal_forced_aligner.models.DictionaryModel` + MFA Dictionary model + output_directory : str + Path to a directory to store files for Kaldi + config: DictionaryConfig + Configuration for generating lexicons + word_set : Collection[str], optional + Word set to limit output files + logger: :class:`~logging.Logger`, optional + Logger to output information to + """ + + topo_template = " {cur_state} {cur_state} {cur_state} 0.75 {next_state} 0.25 " + topo_sil_template = " {cur_state} {cur_state} {transitions} " + topo_transition_template = " {} {}" + positions: List[str] = ["_B", "_E", "_I", "_S"] + + def __init__( + self, + dictionary_model: Union[DictionaryModel, str], + output_directory: str, + config: Optional[DictionaryConfig] = None, + word_set: Optional[Collection[str]] = None, + logger: Optional[logging.Logger] = None, + ): + if isinstance(dictionary_model, str): + dictionary_model = DictionaryModel(dictionary_model) + if config is None: + config = DictionaryConfig() + super().__init__(dictionary_model, config) + 
self.output_directory = os.path.join(output_directory, self.name) + os.makedirs(self.output_directory, exist_ok=True) + self.log_file = os.path.join(self.output_directory, f"{self.name}.log") + if logger is None: + self.logger = logging.getLogger("dictionary_setup") + self.logger.setLevel(logging.INFO) + handler = logging.FileHandler(self.log_file, "w", "utf-8") + handler.setFormatter = logging.Formatter("%(name)s %(message)s") + self.logger.addHandler(handler) + else: + self.logger = logger + self.oovs_found = Counter() + + self.words = {} + self.graphemes = set() + self.all_words = defaultdict(list) + self.words[self.config.silence_word] = [ + {"pronunciation": (self.config.nonoptional_silence_phone,), "probability": 1} + ] + self.words[self.config.oov_word] = [ + {"pronunciation": (self.config.oov_phone,), "probability": 1} + ] + + progress = f'Parsing dictionary "{self.name}"' + if self.dictionary_model.pronunciation_probabilities: + progress += " with pronunciation probabilities" + else: + progress += " without pronunciation probabilities" + if self.dictionary_model.silence_probabilities: + progress += " with silence probabilities" + else: + progress += " without silence probabilities" + self.logger.info(progress) + with open(self.dictionary_model.path, "r", encoding="utf8") as inf: + for i, line in enumerate(inf): + line = line.strip() + if not line: + continue + line = line.split() + word = self.config.sanitize(line.pop(0).lower()) + if not line: + raise DictionaryError( + f"Line {i} of {self.dictionary_model.path} does not have a pronunciation." 
+ ) + if word in [self.config.silence_word, self.config.oov_word]: + continue + self.graphemes.update(word) + prob = 1 + if self.dictionary_model.pronunciation_probabilities: + prob = float(line.pop(0)) + if prob > 1 or prob < 0: + raise ValueError + if self.dictionary_model.silence_probabilities: + right_sil_prob = float(line.pop(0)) + left_sil_prob = float(line.pop(0)) + left_nonsil_prob = float(line.pop(0)) + else: + right_sil_prob = None + left_sil_prob = None + left_nonsil_prob = None + if self.config.multilingual_ipa: + pron = self.config.parse_ipa(line) + else: + pron = tuple(line) + pronunciation = { + "pronunciation": pron, + "probability": prob, + "disambiguation": None, + "right_sil_prob": right_sil_prob, + "left_sil_prob": left_sil_prob, + "left_nonsil_prob": left_nonsil_prob, + } + if self.config.multilingual_ipa: + pronunciation["original_pronunciation"] = tuple(line) + if not any(x in self.config.silence_phones for x in pron): + self.config.non_silence_phones.update(pron) + if word in self.words and pron in {x["pronunciation"] for x in self.words[word]}: + continue + if word not in self.words: + self.words[word] = [] + self.words[word].append(pronunciation) + # test whether a word is a clitic + is_clitic = False + for cm in self.config.clitic_markers: + if word.startswith(cm) or word.endswith(cm): + is_clitic = True + if is_clitic: + self.config.clitic_set.add(word) + self.words_mapping = {} + if word_set is not None: + word_set = {y for x in word_set for y in self._lookup(x)} + word_set.add(self.config.silence_word) + word_set.add(self.config.oov_word) + self.word_set = word_set + if self.word_set is not None: + self.word_set = self.word_set | self.config.clitic_set + if not self.graphemes: + raise DictionaryFileError( + f"No words were found in the dictionary path {self.dictionary_model.path}" + ) + + def __hash__(self) -> Any: + """Return the hash of a given dictionary""" + return hash(self.dictionary_model.path) + + @property + def 
output_paths(self) -> Dict[str, str]: + """ + Mapping of output directory for this dictionary + """ + return {self.name: self.output_directory} + + @property + def silences(self) -> Set[str]: + """ + Set of symbols that correspond to silence + """ + return self.config.silence_phones + + def data(self, word_set: Optional[Collection[str]] = None) -> DictionaryData: + """ + Generates a dictionary data for use in parsing utilities + + Parameters + ---------- + word_set: Collection[str], optional + Word set to limit data to + + Returns + ------- + DictionaryData + Data necessary for parsing text + """ + + def word_check(word): + """Check whether a word should be included in the output""" + if word in word_set: + return True + if word in self.config.clitic_set: + return True + if word in self.config.specials_set: + return True + return False + + if word_set: + words_mapping = {k: v for k, v in self.words_mapping.items() if word_check(k)} + reversed_word_mapping = { + k: v for k, v in self.reversed_word_mapping.items() if word_check(v) + } + words = {k: v for k, v in self.words.items() if word_check(k)} + else: + words_mapping = self.words_mapping + reversed_word_mapping = self.reversed_word_mapping + words = self.words + return DictionaryData( + self.config, + words_mapping, + reversed_word_mapping, + self.reversed_phone_mapping, + words, + ) + + def set_word_set(self, word_set: Collection[str]) -> None: + """ + Limit output to a subset of overall words + + Parameters + ---------- + word_set: Collection[str] + Word set to limit generated files to + """ + word_set = {y for x in word_set for y in self._lookup(x)} + word_set.add(self.config.silence_word) + word_set.add(self.config.oov_word) + self.word_set = word_set | self.config.clitic_set + self.generate_mappings() + + @property + def actual_words(self) -> Dict[str, "DictionaryEntryType"]: + """ + Mapping of words to integer IDs without Kaldi-internal words + """ + return { + k: v for k, v in self.words.items() if k not 
in self.config.specials_set and len(v) + } + + def split_clitics(self, item: str) -> List[str]: + """ + Split a word into subwords based on clitic and compound markers + + Parameters + ---------- + item: str + Word to split up + + Returns + ------- + List[str] + List of subwords + """ + return self.data().split_clitics(item) + + def __len__(self) -> int: + """Return the number of pronunciations across all words""" + return sum(len(x) for x in self.words.values()) + + def exclude_for_alignment(self, word: str) -> bool: + """ + Check for whether to exclude a word from alignment lexicons (if there is a word set in the dictionary, + checks whether the given string is in the word set) + + Parameters + ---------- + word: str + Word to check + + Returns + ------- + bool + True if there is no word set on the dictionary, or if the word is in the given word set + """ + if self.word_set is None: + return False + if word not in self.word_set and word not in self.config.clitic_set: + return True + return False + + @property + def phone_mapping(self) -> Dict[str, int]: + return self.config.phone_mapping + + def generate_mappings(self) -> None: + """ + Generate phone and word mappings from text to integer IDs + """ + self.words_mapping = {} + i = 0 + self.words_mapping[""] = i + for w in sorted(self.words.keys()): + if self.exclude_for_alignment(w): + continue + i += 1 + self.words_mapping[w] = i + + self.words_mapping["#0"] = i + 1 + self.words_mapping[""] = i + 2 + self.words_mapping[""] = i + 3 + self.oovs_found = Counter() + self.add_disambiguation() + + def add_disambiguation(self) -> None: + """ + Calculate disambiguation symbols for each pronunciation + """ + subsequences = set() + pronunciation_counts = defaultdict(int) + + for w, prons in self.words.items(): + if self.exclude_for_alignment(w): + continue + for p in prons: + pronunciation_counts[p["pronunciation"]] += 1 + pron = p["pronunciation"][:-1] + while pron: + subsequences.add(tuple(p)) + pron = pron[:-1] + 
last_used = defaultdict(int) + for w, prons in sorted(self.words.items()): + if self.exclude_for_alignment(w): + continue + for p in prons: + if ( + pronunciation_counts[p["pronunciation"]] == 1 + and not p["pronunciation"] in subsequences + ): + disambig = None + else: + pron = p["pronunciation"] + last_used[pron] += 1 + disambig = last_used[pron] + p["disambiguation"] = disambig + if last_used: + self.config.max_disambiguation_symbol = max( + self.config.max_disambiguation_symbol, max(last_used.values()) + ) + + def create_utterance_fst(self, text: List[str], frequent_words: List[Tuple[str, int]]) -> str: + """ + Create an FST for an utterance with frequent words as a unigram language model + + Parameters + ---------- + text: List[str] + Text of the utterance + frequent_words: List[Tuple[str, int]] + Frequent words to incorporate into the FST + Returns + ------- + str + FST created from the utterance text and frequent words + """ + num_words = len(text) + word_probs = Counter(text) + word_probs = {k: v / num_words for k, v in word_probs.items()} + word_probs.update(frequent_words) + fst_text = "" + for k, v in word_probs.items(): + cost = -1 * math.log(v) + w = self.to_int(k)[0] + fst_text += f"0 0 {w} {w} {cost}\n" + fst_text += f"0 {-1 * math.log(1 / num_words)}\n" + return fst_text + + def to_int(self, item: str) -> List[int]: + """ + Convert a given word into integer IDs + + Parameters + ---------- + item: str + Word to look up + + Returns + ------- + List[int] + List of integer IDs corresponding to each subword + """ + return self.data().to_int(item) + + def _lookup(self, item: str) -> List[str]: + """ + Look up a word and return the list of sub words if necessary taking into account clitic and compound markers + + Parameters + ---------- + item: str + Word to look up + + Returns + ------- + List[str] + List of subwords that are in the dictionary + """ + return self.data().lookup(item) + + def check_word(self, item: str) -> bool: + """ + Check whether a word 
is in the dictionary, takes into account sanitization and + clitic and compound markers + + Parameters + ---------- + item: str + Word to check + + Returns + ------- + bool + True if the look up would not result in an OOV item + """ + return self.data().check_word(item) + + @property + def reversed_word_mapping(self) -> ReversedMappingType: + """ + A mapping of integer ids to words + """ + mapping = {} + for k, v in self.words_mapping.items(): + mapping[v] = k + return mapping + + @property + def reversed_phone_mapping(self) -> ReversedMappingType: + """ + A mapping of integer ids to phones + """ + mapping = {} + for k, v in self.phone_mapping.items(): + mapping[v] = k + return mapping + + @property + def oov_int(self) -> int: + """ + The integer id for out of vocabulary items + """ + return self.words_mapping[self.config.oov_word] + + @property + def phones_dir(self) -> str: + """ + Directory to store information Kaldi needs about phones + """ + return os.path.join(self.output_directory, "phones") + + @property + def words_symbol_path(self) -> str: + """ + Path of word to int mapping file for the dictionary + """ + return os.path.join(self.output_directory, "words.txt") + + @property + def disambig_path(self) -> str: + """ + Path of disambiguated lexicon fst (L.fst) + """ + return os.path.join(self.output_directory, "L_disambig.fst") + + def write(self, write_disambiguation: Optional[bool] = False) -> None: + """ + Write the files necessary for Kaldi + + Parameters + ---------- + write_disambiguation: bool, optional + Flag for including disambiguation information + """ + self.logger.info("Creating dictionary information...") + os.makedirs(self.phones_dir, exist_ok=True) + self.generate_mappings() + self._write_graphemes() + self._write_phone_map_file() + self._write_phone_sets() + self._write_phone_symbol_table() + self._write_disambig() + self._write_topo() + self._write_word_boundaries() + self._write_extra_questions() + self._write_word_file() + 
self._write_align_lexicon() + if write_disambiguation: + self._write_fst_text_disambiguated() + else: + self._write_basic_fst_text() + self._write_fst_binary(write_disambiguation=write_disambiguation) + self.cleanup() + + def cleanup(self) -> None: + """ + Clean up temporary files in the output directory + """ + if not self.config.debug: + if os.path.exists(os.path.join(self.output_directory, "temp.fst")): + os.remove(os.path.join(self.output_directory, "temp.fst")) + if os.path.exists(os.path.join(self.output_directory, "lexicon.text.fst")): + os.remove(os.path.join(self.output_directory, "lexicon.text.fst")) + + def _write_graphemes(self) -> None: + """ + Write graphemes to temporary directory + """ + outfile = os.path.join(self.output_directory, "graphemes.txt") + if os.path.exists(outfile): + return + with open(outfile, "w", encoding="utf8") as f: + for char in sorted(self.graphemes): + f.write(char + "\n") + + def export_lexicon( + self, + path: str, + write_disambiguation: Optional[bool] = False, + probability: Optional[bool] = False, + ) -> None: + """ + Export pronunciation dictionary to a text file + + Parameters + ---------- + path: str + Path to save dictionary + write_disambiguation: bool, optional + Flag for whether to include disambiguation information + probability: bool, optional + Flag for whether to include probabilities + """ + with open(path, "w", encoding="utf8") as f: + for w in sorted(self.words.keys()): + for p in sorted( + self.words[w], + key=lambda x: (x["pronunciation"], x["probability"], x["disambiguation"]), + ): + phones = " ".join(p["pronunciation"]) + if write_disambiguation and p["disambiguation"] is not None: + phones += f" #{p['disambiguation']}" + if probability: + f.write(f"{w}\t{p['probability']}\t{phones}\n") + else: + f.write(f"{w}\t{phones}\n") + + def _write_phone_map_file(self) -> None: + """ + Write the phone map to the temporary directory + """ + outfile = os.path.join(self.output_directory, "phone_map.txt") + if 
os.path.exists(outfile): + return + with open(outfile, "w", encoding="utf8") as f: + for sp in self.config.silence_phones: + if self.config.position_dependent_phones: + new_phones = [sp + x for x in ["", ""] + self.positions] + else: + new_phones = [sp] + f.write(" ".join(new_phones) + "\n") + for nsp in self.config.non_silence_phones: + if self.config.position_dependent_phones: + new_phones = [nsp + x for x in [""] + self.positions] + else: + new_phones = [nsp] + f.write(" ".join(new_phones) + "\n") + + def _write_phone_symbol_table(self) -> None: + """ + Write the phone mapping to the temporary directory + """ + outfile = os.path.join(self.output_directory, "phones.txt") + if os.path.exists(outfile): + return + with open(outfile, "w", encoding="utf8") as f: + for p, i in sorted(self.phone_mapping.items(), key=lambda x: x[1]): + f.write(f"{p} {i}\n") + + def _write_word_boundaries(self) -> None: + """ + Write the word boundaries file to the temporary directory + """ + boundary_path = os.path.join(self.output_directory, "phones", "word_boundary.txt") + boundary_int_path = os.path.join(self.output_directory, "phones", "word_boundary.int") + if os.path.exists(boundary_path) and os.path.exists(boundary_int_path): + return + with open(boundary_path, "w", encoding="utf8") as f, open( + boundary_int_path, "w", encoding="utf8" + ) as intf: + if self.config.position_dependent_phones: + for p in sorted(self.phone_mapping.keys(), key=lambda x: self.phone_mapping[x]): + if p == "" or p.startswith("#"): + continue + cat = "nonword" + if p.endswith("_B"): + cat = "begin" + elif p.endswith("_S"): + cat = "singleton" + elif p.endswith("_I"): + cat = "internal" + elif p.endswith("_E"): + cat = "end" + f.write(" ".join([p, cat]) + "\n") + intf.write(" ".join([str(self.phone_mapping[p]), cat]) + "\n") + + def _write_word_file(self) -> None: + """ + Write the word mapping to the temporary directory + """ + words_path = os.path.join(self.output_directory, "words.txt") + if 
os.path.exists(words_path): + return + if sys.platform == "win32": + newline = "" + else: + newline = None + with open(words_path, "w", encoding="utf8", newline=newline) as f: + for w, i in sorted(self.words_mapping.items(), key=lambda x: x[1]): + f.write(f"{w} {i}\n") + + def _write_align_lexicon(self) -> None: + """ + Write the alignment lexicon text file to the temporary directory + """ + path = os.path.join(self.phones_dir, "align_lexicon.int") + if os.path.exists(path): + return + + with open(path, "w", encoding="utf8") as f: + for w, i in self.words_mapping.items(): + if self.exclude_for_alignment(w): + continue + if w not in self.words: # special characters + continue + for pron in sorted( + self.words[w], + key=lambda x: (x["pronunciation"], x["probability"], x["disambiguation"]), + ): + + phones = list(pron["pronunciation"]) + if self.config.position_dependent_phones: + if len(phones) == 1: + phones[0] += "_S" + else: + for j in range(len(phones)): + if j == 0: + phones[j] += "_B" + elif j == len(phones) - 1: + phones[j] += "_E" + else: + phones[j] += "_I" + p = " ".join(str(self.phone_mapping[x]) for x in phones) + f.write(f"{i} {i} {p}\n".format(i=i, p=p)) + + def _write_topo(self) -> None: + """ + Write the topo file to the temporary directory + """ + filepath = os.path.join(self.output_directory, "topo") + if os.path.exists(filepath): + return + sil_transp = 1 / (self.config.num_silence_states - 1) + initial_transition = [ + self.topo_transition_template.format(x, sil_transp) + for x in range(self.config.num_silence_states - 1) + ] + middle_transition = [ + self.topo_transition_template.format(x, sil_transp) + for x in range(1, self.config.num_silence_states) + ] + final_transition = [ + self.topo_transition_template.format(self.config.num_silence_states - 1, 0.75), + self.topo_transition_template.format(self.config.num_silence_states, 0.25), + ] + with open(filepath, "w") as f: + f.write("\n") + f.write("\n") + f.write("\n") + phones = 
self.config.kaldi_non_silence_phones + f.write(f"{' '.join(str(self.phone_mapping[x]) for x in phones)}\n") + f.write("\n") + states = [ + self.topo_template.format(cur_state=x, next_state=x + 1) + for x in range(self.config.num_non_silence_states) + ] + f.write("\n".join(states)) + f.write(f"\n {self.config.num_non_silence_states} \n") + f.write("\n") + + f.write("\n") + f.write("\n") + + phones = self.config.kaldi_silence_phones + f.write(f"{' '.join(str(self.phone_mapping[x]) for x in phones)}\n") + f.write("\n") + states = [] + for i in range(self.config.num_silence_states): + if i == 0: + transition = " ".join(initial_transition) + elif i == self.config.num_silence_states - 1: + transition = " ".join(final_transition) + else: + transition = " ".join(middle_transition) + states.append(self.topo_sil_template.format(cur_state=i, transitions=transition)) + f.write("\n".join(states)) + f.write(f"\n {self.config.num_silence_states} \n") + f.write("\n") + f.write("\n") + + def _write_phone_sets(self) -> None: + """ + Write phone symbol sets to the temporary directory + """ + sharesplit = ["shared", "split"] + if not self.config.shared_silence_phones: + sil_sharesplit = ["not-shared", "not-split"] + else: + sil_sharesplit = sharesplit + + sets_file = os.path.join(self.output_directory, "phones", "sets.txt") + roots_file = os.path.join(self.output_directory, "phones", "roots.txt") + + sets_int_file = os.path.join(self.output_directory, "phones", "sets.int") + roots_int_file = os.path.join(self.output_directory, "phones", "roots.int") + if ( + os.path.exists(sets_file) + and os.path.exists(roots_file) + and os.path.exists(sets_int_file) + and os.path.exists(roots_int_file) + ): + return + + with open(sets_file, "w", encoding="utf8") as setf, open( + roots_file, "w", encoding="utf8" + ) as rootf, open(sets_int_file, "w", encoding="utf8") as setintf, open( + roots_int_file, "w", encoding="utf8" + ) as rootintf: + + # process silence phones + for i, sp in 
enumerate(self.config.silence_phones): + if self.config.position_dependent_phones: + mapped = [sp + x for x in [""] + self.positions] + else: + mapped = [sp] + setf.write(" ".join(mapped) + "\n") + setintf.write(" ".join(map(str, (self.phone_mapping[x] for x in mapped))) + "\n") + if i == 0: + line = sil_sharesplit + mapped + lineint = sil_sharesplit + [str(self.phone_mapping[x]) for x in mapped] + else: + line = sharesplit + mapped + lineint = sharesplit + [str(self.phone_mapping[x]) for x in mapped] + rootf.write(" ".join(line) + "\n") + rootintf.write(" ".join(lineint) + "\n") + + # process nonsilence phones + for nsp in sorted(self.config.non_silence_phones): + if self.config.position_dependent_phones: + mapped = [nsp + x for x in self.positions] + else: + mapped = [nsp] + setf.write(" ".join(mapped) + "\n") + setintf.write(" ".join(map(str, (self.phone_mapping[x] for x in mapped))) + "\n") + line = sharesplit + mapped + lineint = sharesplit + [str(self.phone_mapping[x]) for x in mapped] + rootf.write(" ".join(line) + "\n") + rootintf.write(" ".join(lineint) + "\n") + + def _write_extra_questions(self) -> None: + """ + Write extra questions symbols to the temporary directory + """ + phone_extra = os.path.join(self.phones_dir, "extra_questions.txt") + phone_extra_int = os.path.join(self.phones_dir, "extra_questions.int") + if os.path.exists(phone_extra) and os.path.exists(phone_extra_int): + return + with open(phone_extra, "w", encoding="utf8") as outf, open( + phone_extra_int, "w", encoding="utf8" + ) as intf: + silences = self.config.kaldi_silence_phones + outf.write(" ".join(silences) + "\n") + intf.write(" ".join(str(self.phone_mapping[x]) for x in silences) + "\n") + + non_silences = self.config.kaldi_non_silence_phones + outf.write(" ".join(non_silences) + "\n") + intf.write(" ".join(str(self.phone_mapping[x]) for x in non_silences) + "\n") + if self.config.position_dependent_phones: + for p in self.positions: + line = [x + p for x in 
sorted(self.config.non_silence_phones)] + outf.write(" ".join(line) + "\n") + intf.write(" ".join(str(self.phone_mapping[x]) for x in line) + "\n") + for p in [""] + self.positions: + line = [x + p for x in sorted(self.config.silence_phones)] + outf.write(" ".join(line) + "\n") + intf.write(" ".join(str(self.phone_mapping[x]) for x in line) + "\n") + + def _write_disambig(self) -> None: + """ + Write disambiguation symbols to the temporary directory + """ + disambig = os.path.join(self.phones_dir, "disambiguation_symbols.txt") + disambig_int = os.path.join(self.phones_dir, "disambiguation_symbols.int") + if os.path.exists(disambig) and os.path.exists(disambig_int): + return + with open(disambig, "w", encoding="utf8") as outf, open( + disambig_int, "w", encoding="utf8" + ) as intf: + for d in sorted( + self.config.disambiguation_symbols, key=lambda x: self.phone_mapping[x] + ): + outf.write(f"{d}\n") + intf.write(f"{self.phone_mapping[d]}\n") + + def _write_fst_binary(self, write_disambiguation: Optional[bool] = False) -> None: + """ + Write the binary fst file to the temporary directory + + Parameters + ---------- + write_disambiguation: bool, optional + Flag for including disambiguation symbols + """ + if write_disambiguation: + lexicon_fst_path = os.path.join(self.output_directory, "lexicon_disambig.text.fst") + output_fst = os.path.join(self.output_directory, "L_disambig.fst") + else: + lexicon_fst_path = os.path.join(self.output_directory, "lexicon.text.fst") + output_fst = os.path.join(self.output_directory, "L.fst") + if os.path.exists(output_fst): + return + + phones_file_path = os.path.join(self.output_directory, "phones.txt") + words_file_path = os.path.join(self.output_directory, "words.txt") + + log_path = os.path.join(self.output_directory, "fst.log") + temp_fst_path = os.path.join(self.output_directory, "temp.fst") + with open(log_path, "w") as log_file: + compile_proc = subprocess.Popen( + [ + thirdparty_binary("fstcompile"), + 
f"--isymbols={phones_file_path}", + f"--osymbols={words_file_path}", + "--keep_isymbols=false", + "--keep_osymbols=false", + lexicon_fst_path, + temp_fst_path, + ], + stderr=log_file, + ) + compile_proc.communicate() + if write_disambiguation: + temp2_fst_path = os.path.join(self.output_directory, "temp2.fst") + phone_disambig_path = os.path.join(self.output_directory, "phone_disambig.txt") + word_disambig_path = os.path.join(self.output_directory, "word_disambig.txt") + with open(phone_disambig_path, "w") as f: + f.write(str(self.phone_mapping["#0"])) + with open(word_disambig_path, "w") as f: + f.write(str(self.words_mapping["#0"])) + selfloop_proc = subprocess.Popen( + [ + thirdparty_binary("fstaddselfloops"), + phone_disambig_path, + word_disambig_path, + temp_fst_path, + temp2_fst_path, + ], + stderr=log_file, + ) + selfloop_proc.communicate() + arc_sort_proc = subprocess.Popen( + [ + thirdparty_binary("fstarcsort"), + "--sort_type=olabel", + temp2_fst_path, + output_fst, + ], + stderr=log_file, + ) + else: + arc_sort_proc = subprocess.Popen( + [ + thirdparty_binary("fstarcsort"), + "--sort_type=olabel", + temp_fst_path, + output_fst, + ], + stderr=log_file, + ) + arc_sort_proc.communicate() + + def _write_basic_fst_text(self) -> None: + """ + Write the L.fst text file to the temporary directory + """ + sil_disambiguation = None + nonoptional_silence = None + optional_silence_phone = None + lexicon_fst_path = os.path.join(self.output_directory, "lexicon.text.fst") + start_state = 0 + silence_state = 0 + silence_cost = 0 + no_silence_cost = 0 + loop_state = 0 + next_state = 1 + if self.config.silence_probability: + optional_silence_phone = self.config.optional_silence_phone + nonoptional_silence = self.config.nonoptional_silence_phone + + silence_cost = -1 * math.log(self.config.silence_probability) + no_silence_cost = -1 * math.log(1.0 - self.config.silence_probability) + loop_state = 1 + silence_state = 2 + + with open(lexicon_fst_path, "w", encoding="utf8") 
as outf: + if self.config.silence_probability: + outf.write( + "\t".join( + map(str, [start_state, loop_state, "", "", no_silence_cost]) + ) + + "\n" + ) # no silence + + outf.write( + "\t".join( + map( + str, + [start_state, loop_state, nonoptional_silence, "", silence_cost], + ) + ) + + "\n" + ) # silence + if sil_disambiguation is None: + outf.write( + "\t".join( + map(str, [silence_state, loop_state, optional_silence_phone, ""]) + ) + + "\n" + ) # no cost + next_state = 3 + else: + silence_disambiguation_state = next_state + next_state += 1 + outf.write( + "\t".join( + map( + str, + [ + silence_state, + silence_disambiguation_state, + optional_silence_phone, + "", + ], + ) + ) + + "\n" + ) + outf.write( + "\t".join( + map( + str, + [ + silence_disambiguation_state, + loop_state, + sil_disambiguation, + "", + ], + ) + ) + + "\n" + ) + + for w in sorted(self.words.keys()): + if self.exclude_for_alignment(w): + continue + for pron in sorted( + self.words[w], + key=lambda x: (x["pronunciation"], x["probability"], x["disambiguation"]), + ): + phones = list(pron["pronunciation"]) + prob = pron["probability"] + if self.config.position_dependent_phones: + if len(phones) == 1: + phones[0] += "_S" + else: + for i in range(len(phones)): + if i == 0: + phones[i] += "_B" + elif i == len(phones) - 1: + phones[i] += "_E" + else: + phones[i] += "_I" + if not prob: + prob = 0.001 # Dithering to ensure low probability entries + pron_cost = abs(math.log(prob)) + + current_state = loop_state + word_or_eps = w + local_no_silence_cost = no_silence_cost + pron_cost + local_silcost = no_silence_cost + pron_cost + for i, p in enumerate(phones): + if i < len(phones) - 1: + outf.write( + f"{current_state}\t{next_state}\t{p}\t{word_or_eps}\t{pron_cost}\n" + ) + word_or_eps = "" + pron_cost = 0 + current_state = next_state + next_state += 1 + else: # transition on last phone to loop state + if self.config.silence_probability: + outf.write( + 
f"{current_state}\t{loop_state}\t{p}\t{word_or_eps}\t{local_no_silence_cost}\n" + ) + outf.write( + f"{current_state}\t{silence_state}\t{p}\t{word_or_eps}\t{local_silcost}\n" + ) + else: + outf.write( + f"{current_state}\t{loop_state}\t{p}\t{word_or_eps}\t{pron_cost}\n" + ) + word_or_eps = "" + + outf.write(f"{loop_state}\t0\n") + + def _write_fst_text_disambiguated(self) -> None: + """ + Write the text L_disambig.fst file to the temporary directory + """ + lexicon_fst_path = os.path.join(self.output_directory, "lexicon_disambig.text.fst") + sil_disambiguation = f"#{self.config.max_disambiguation_symbol + 1}" + assert self.config.silence_probability + start_state = 0 + loop_state = 1 + silence_state = 2 + next_state = 3 + + silence_phone = self.config.nonoptional_silence_phone + + silence_cost = -1 * math.log(self.config.silence_probability) + no_silence_cost = -1 * math.log(1 - self.config.silence_probability) + + with open(lexicon_fst_path, "w", encoding="utf8") as outf: + outf.write( + f"{start_state}\t{loop_state}\t\t\t{no_silence_cost}\n" + ) # no silence + outf.write( + f"{start_state}\t{silence_state}\t\t\t{silence_cost}\n" + ) # silence + silence_disambiguation_state = next_state + next_state += 1 + + outf.write( + f"{silence_state}\t{silence_disambiguation_state}\t{silence_phone}\t\t0.0\n" + ) # silence disambig + outf.write( + f"{silence_disambiguation_state}\t{loop_state}\t{sil_disambiguation}\t\t0.0\n" + ) # silence disambig + + for w in sorted(self.words.keys()): + if self.exclude_for_alignment(w): + continue + for pron in sorted( + self.words[w], + key=lambda x: (x["pronunciation"], x["probability"], x["disambiguation"]), + ): + phones = list(pron["pronunciation"]) + prob = pron["probability"] + disambig_symbol = pron["disambiguation"] + if self.config.position_dependent_phones: + if len(phones) == 1: + phones[0] += "_S" + else: + for i in range(len(phones)): + if i == 0: + phones[i] += "_B" + elif i == len(phones) - 1: + phones[i] += "_E" + else: + 
phones[i] += "_I" + if not prob: + prob = 0.001 # Dithering to ensure low probability entries + pron_cost = abs(math.log(prob)) + if disambig_symbol: + phones += [f"#{disambig_symbol}"] + + current_state = loop_state + for i in range(0, len(phones) - 1): + p = phones[i] + outf.write( + f"{current_state}\t{next_state}\t{p}\t{w if i == 0 else ''}\t{pron_cost if i == 0 else 0.0}\n" + ) + current_state = next_state + next_state += 1 + + i = len(phones) - 1 + + local_no_silence_cost = no_silence_cost + pron_cost + local_silcost = silence_cost + pron_cost + if i <= 0: + local_silcost = silence_cost + local_no_silence_cost = no_silence_cost + outf.write( + f"{current_state}\t{loop_state}\t{phones[i] if i >= 0 else ''}\t{w if i <= 0 else ''}\t{local_no_silence_cost}\n" + ) + outf.write( + f"{current_state}\t{silence_state}\t{phones[i] if i >= 0 else ''}\t{w if i <= 0 else ''}\t{local_silcost}\n" + ) + + outf.write(f"{loop_state}\t0.0\n") diff --git a/montreal_forced_aligner/dictionary/data.py b/montreal_forced_aligner/dictionary/data.py new file mode 100644 index 00000000..ebb1d503 --- /dev/null +++ b/montreal_forced_aligner/dictionary/data.py @@ -0,0 +1,261 @@ +"""Pronunciation dictionaries for use in alignment and transcription""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from typing import TYPE_CHECKING, List, Optional + +from ..data import CtmInterval + +if TYPE_CHECKING: + IpaType = Optional[List[str]] + PunctuationType = Optional[str] + from ..abc import DictionaryEntryType, MappingType, ReversedMappingType, WordsType + from ..config.dictionary_config import DictionaryConfig + from ..data import CtmType + +__all__ = [ + "DictionaryData", +] + + +@dataclass +class DictionaryData: + """ + Information required for parsing Kaldi-internal ids to text + """ + + dictionary_config: DictionaryConfig + words_mapping: MappingType + reversed_words_mapping: ReversedMappingType + reversed_phone_mapping: ReversedMappingType + words: 
WordsType + + @property + def oov_int(self): + return self.words_mapping[self.dictionary_config.oov_word] + + def split_clitics( + self, + item: str, + ) -> List[str]: + """ + Split a word into subwords based on dictionary information + + Parameters + ---------- + item: str + Word to split + + Returns + ------- + List[str] + List of subwords + """ + if item in self.words: + return [item] + if any(x in item for x in self.dictionary_config.compound_markers): + s = re.split(rf"[{''.join(self.dictionary_config.compound_markers)}]", item) + if any(x in item for x in self.dictionary_config.clitic_markers): + new_s = [] + for seg in s: + if any(x in seg for x in self.dictionary_config.clitic_markers): + new_s.extend(self.split_clitics(seg)) + else: + new_s.append(seg) + s = new_s + return s + if any( + x in item and not item.endswith(x) and not item.startswith(x) + for x in self.dictionary_config.clitic_markers + ): + initial, final = re.split( + rf"[{''.join(self.dictionary_config.clitic_markers)}]", item, maxsplit=1 + ) + if any(x in final for x in self.dictionary_config.clitic_markers): + final = self.split_clitics(final) + else: + final = [final] + for clitic in self.dictionary_config.clitic_markers: + if initial + clitic in self.dictionary_config.clitic_set: + return [initial + clitic] + final + elif clitic + final[0] in self.dictionary_config.clitic_set: + final[0] = clitic + final[0] + return [initial] + final + return [item] + + def lookup( + self, + item: str, + ) -> List[str]: + """ + Look up a word and return the list of sub words if necessary + taking into account clitic and compound markers + + Parameters + ---------- + item: str + Word to look up + + Returns + ------- + List[str] + List of subwords that are in the dictionary + """ + + if item in self.words: + return [item] + sanitized = self.dictionary_config.sanitize(item) + if sanitized in self.words: + return [sanitized] + split = self.split_clitics(sanitized) + oov_count = sum(1 for x in split if x not 
in self.words) + + if oov_count < len( + split + ): # Only returned split item if it gains us any transcribed speech + return split + return [sanitized] + + def to_int( + self, + item: str, + ) -> List[int]: + """ + Convert a given word into integer IDs + + Parameters + ---------- + item: str + Word to look up + + Returns + ------- + List[int] + List of integer IDs corresponding to each subword + """ + if item == "": + return [] + sanitized = self.lookup(item) + text_int = [] + for item in sanitized: + if not item: + continue + if item not in self.words_mapping: + text_int.append(self.oov_int) + else: + text_int.append(self.words_mapping[item]) + return text_int + + def check_word(self, item: str) -> bool: + """ + Check whether a word is in the dictionary, takes into account sanitization and + clitic and compound markers + + Parameters + ---------- + item: str + Word to check + + Returns + ------- + bool + True if the look up would not result in an OOV item + """ + if item == "": + return False + if item in self.words: + return True + sanitized = self.dictionary_config.sanitize(item) + if sanitized in self.words: + return True + + sanitized = self.split_clitics(sanitized) + if all(s in self.words for s in sanitized): + return True + return False + + def map_to_original_pronunciation( + self, phones: CtmType, subpronunciations: List[DictionaryEntryType] + ) -> CtmType: + """ + Convert phone transcriptions from multilingual IPA mode to their original IPA transcription + + Parameters + ---------- + phones: List[CtmInterval] + List of aligned phones + subpronunciations: List[DictionaryEntryType] + Pronunciations of each sub word to reconstruct the transcriptions + + Returns + ------- + List[CtmInterval] + Intervals with their original IPA pronunciation rather than the internal simplified form + """ + transcription = tuple(x.label for x in phones) + new_phones = [] + mapping_ind = 0 + transcription_ind = 0 + for pronunciations in subpronunciations: + pron = None + if 
mapping_ind >= len(phones): + break + for p in pronunciations: + if ( + "original_pronunciation" in p + and transcription == p["pronunciation"] == p["original_pronunciation"] + ) or (transcription == p["pronunciation"] and "original_pronunciation" not in p): + new_phones.extend(phones) + mapping_ind += len(phones) + break + if ( + p["pronunciation"] + == transcription[ + transcription_ind : transcription_ind + len(p["pronunciation"]) + ] + and pron is None + ): + pron = p + if mapping_ind >= len(phones): + break + if not pron: + new_phones.extend(phones) + mapping_ind += len(phones) + break + to_extend = phones[transcription_ind : transcription_ind + len(pron["pronunciation"])] + transcription_ind += len(pron["pronunciation"]) + p = pron + if ( + "original_pronunciation" not in p + or p["pronunciation"] == p["original_pronunciation"] + ): + new_phones.extend(to_extend) + mapping_ind += len(to_extend) + break + for pi in p["original_pronunciation"]: + if pi == phones[mapping_ind].label: + new_phones.append(phones[mapping_ind]) + else: + modded_phone = pi + new_p = phones[mapping_ind].label + for diacritic in self.dictionary_config.strip_diacritics: + modded_phone = modded_phone.replace(diacritic, "") + if modded_phone == new_p: + phones[mapping_ind].label = pi + new_phones.append(phones[mapping_ind]) + elif mapping_ind != len(phones) - 1: + new_p = phones[mapping_ind].label + phones[mapping_ind + 1].label + if modded_phone == new_p: + new_phones.append( + CtmInterval( + phones[mapping_ind].begin, + phones[mapping_ind + 1].end, + new_p, + phones[mapping_ind].utterance, + ) + ) + mapping_ind += 1 + mapping_ind += 1 + return new_phones diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py new file mode 100644 index 00000000..fcf2a327 --- /dev/null +++ b/montreal_forced_aligner/dictionary/multispeaker.py @@ -0,0 +1,190 @@ +"""Pronunciation dictionaries for use in alignment and transcription""" + +from 
__future__ import annotations + +import logging +import os +from collections import Counter +from typing import TYPE_CHECKING, Collection, Dict, Optional, Union + +from ..abc import Dictionary +from ..config.dictionary_config import DictionaryConfig +from ..models import DictionaryModel +from .base_dictionary import PronunciationDictionary + +if TYPE_CHECKING: + + from ..corpus.classes import Speaker + + +__all__ = [ + "MultispeakerDictionary", +] + + +class MultispeakerDictionary(Dictionary): + """ + Class containing information about a pronunciation dictionary with different dictionaries per speaker + + Parameters + ---------- + dictionary_model : DictionaryModel + Multispeaker dictionary + output_directory : str + Path to a directory to store files for Kaldi + config: DictionaryConfig, optional + Configuration for generating lexicons + word_set : Collection[str], optional + Word set to limit output files + logger: :class:`~logging.Logger`, optional + Logger to output information to + """ + + def __init__( + self, + dictionary_model: Union[DictionaryModel, str], + output_directory: str, + config: Optional[DictionaryConfig] = None, + word_set: Optional[Collection[str]] = None, + logger: Optional[logging.Logger] = None, + ): + if isinstance(dictionary_model, str): + dictionary_model = DictionaryModel(dictionary_model) + if config is None: + config = DictionaryConfig() + super().__init__(dictionary_model, config) + self.output_directory = os.path.join(output_directory, "dictionary") + os.makedirs(self.output_directory, exist_ok=True) + self.log_file = os.path.join(self.output_directory, "dictionary.log") + if logger is None: + self.logger = logging.getLogger("dictionary_setup") + self.logger.setLevel(logging.INFO) + handler = logging.FileHandler(self.log_file, "w", "utf-8") + handler.setFormatter = logging.Formatter("%(name)s %(message)s") + self.logger.addHandler(handler) + else: + self.logger = logger + + self.speaker_mapping = {} + self.dictionary_mapping = {} + 
+ for speaker, dictionary in self.dictionary_model.load_dictionary_paths().items(): + self.speaker_mapping[speaker] = dictionary.name + if dictionary.name not in self.dictionary_mapping: + self.dictionary_mapping[dictionary.name] = PronunciationDictionary( + dictionary, + self.output_directory, + config, + word_set=word_set, + logger=self.logger, + ) + + @property + def phones_dir(self): + return self.get_dictionary("default").phones_dir + + @property + def topo_path(self): + return os.path.join(self.get_dictionary("default").output_directory, "topo") + + @property + def oovs_found(self) -> Counter[str, int]: + oovs = Counter() + for dictionary in self.dictionary_mapping.values(): + oovs.update(dictionary.oovs_found) + return oovs + + def save_oovs_found(self, directory: str) -> None: + """ + Save all out of vocabulary items to a file in the specified directory + + Parameters + ---------- + directory : str + Path to directory to save ``oovs_found.txt`` + """ + with open(os.path.join(directory, "oovs_found.txt"), "w", encoding="utf8") as f, open( + os.path.join(directory, "oov_counts.txt"), "w", encoding="utf8" + ) as cf: + for oov in sorted(self.oovs_found.keys(), key=lambda x: (-self.oovs_found[x], x)): + f.write(oov + "\n") + cf.write(f"{oov}\t{self.oovs_found[oov]}\n") + + @property + def silences(self) -> set: + """ + Set of silence phones + """ + return self.config.silence_phones + + @property + def default_dictionary(self) -> PronunciationDictionary: + """Default PronunciationDictionary""" + return self.get_dictionary("default") + + def get_dictionary_name(self, speaker: Union[str, Speaker]) -> str: + """ + Get the dictionary name for a given speaker + + Parameters + ---------- + speaker: Union[Speaker, str] + Speaker to look up + + Returns + ------- + str + PronunciationDictionary name for the speaker + """ + if not isinstance(speaker, str): + speaker = speaker.name + if speaker not in self.speaker_mapping: + return self.speaker_mapping["default"] + return 
self.speaker_mapping[speaker] + + def get_dictionary(self, speaker: Union[Speaker, str]) -> PronunciationDictionary: + """ + Get a dictionary for a given speaker + + Parameters + ---------- + speaker: Union[Speaker, str] + Speaker to look up + + Returns + ------- + :class:`~montreal_forced_aligner.dictionary.PronunciationDictionary` + PronunciationDictionary for the speaker + """ + return self.dictionary_mapping[self.get_dictionary_name(speaker)] + + def write(self, write_disambiguation: Optional[bool] = False) -> None: + """ + Write all child dictionaries to the temporary directory + + Parameters + ---------- + write_disambiguation: bool, optional + Flag to use disambiguation symbols in the output + """ + for d in self.dictionary_mapping.values(): + d.write(write_disambiguation) + + def set_word_set(self, word_set: Collection[str]) -> None: + """ + Limit output to a subset of overall words + + Parameters + ---------- + word_set: Collection[str] + Word set to limit generated files to + """ + for d in self.dictionary_mapping.values(): + d.set_word_set(word_set) + + @property + def output_paths(self) -> Dict[str, str]: + """ + Mapping of output directory for child dictionaries + """ + return {d.name: d.output_directory for d in self.dictionary_mapping.values()} diff --git a/montreal_forced_aligner/exceptions.py b/montreal_forced_aligner/exceptions.py index daab2ee0..e8f6627f 100644 --- a/montreal_forced_aligner/exceptions.py +++ b/montreal_forced_aligner/exceptions.py @@ -1,4 +1,8 @@ -"""Excepts for Montreal Forced Aligner""" +""" +Exception classes +================= + +""" from __future__ import annotations from typing import TYPE_CHECKING, Collection, Dict, List, Optional, Tuple @@ -8,7 +12,7 @@ from .helper import comma_join if TYPE_CHECKING: - from .dictionary import DictionaryType + from .dictionary import PronunciationDictionary from .models import G2PModel @@ -196,7 +200,7 @@ def __init__(self, path: str): class DictionaryError(MFAError): """ - Exception 
class for errors in creating Dictionary objects + Exception class for errors in creating dictionary objects """ pass @@ -214,7 +218,7 @@ def __init__(self): class DictionaryPathError(DictionaryError): """ - Exception class for errors in locating paths for Dictionary objects + Exception class for errors in locating paths for dictionary objects Parameters ---------- @@ -231,7 +235,7 @@ def __init__(self, input_path: str): class DictionaryFileError(DictionaryError): """ - Exception class for file type being wrong for Dictionary objects + Exception class for file type being wrong for DictionaryModel objects Parameters ---------- @@ -396,13 +400,13 @@ class PronunciationOrthographyMismatchError(AlignerError): Parameters ---------- - g2p_model: G2PModel + g2p_model: :class:`~montreal_forced_aligner.models.G2PModel` Specified G2P model - dictionary: Dictionary + dictionary: :class:`~montreal_forced_aligner.dictionary.PronunciationDictionary` Specified dictionary """ - def __init__(self, g2p_model: G2PModel, dictionary: DictionaryType): + def __init__(self, g2p_model: G2PModel, dictionary: PronunciationDictionary): super().__init__() missing_graphs = dictionary.graphemes - set(g2p_model.meta["graphemes"]) missing_graphs = [f"{self.error_text(x)}" for x in sorted(missing_graphs)] diff --git a/montreal_forced_aligner/g2p/__init__.py b/montreal_forced_aligner/g2p/__init__.py index bc5c78c9..c22599b6 100644 --- a/montreal_forced_aligner/g2p/__init__.py +++ b/montreal_forced_aligner/g2p/__init__.py @@ -1 +1,14 @@ -"""G2P module for MFA""" +""" +Grapheme to phoneme (G2P) +========================= + + +""" + +from .generator import PyniniDictionaryGenerator +from .trainer import PyniniTrainer + +__all__ = ["generator", "trainer", "PyniniTrainer", "PyniniDictionaryGenerator"] + +PyniniTrainer.__module__ = "montreal_forced_aligner.g2p" +PyniniDictionaryGenerator.__module__ = "montreal_forced_aligner.g2p" diff --git a/montreal_forced_aligner/g2p/trainer.py 
b/montreal_forced_aligner/g2p/trainer.py index 28027542..e38dd0d0 100644 --- a/montreal_forced_aligner/g2p/trainer.py +++ b/montreal_forced_aligner/g2p/trainer.py @@ -44,8 +44,8 @@ def convert(x): G2P_DISABLED = True if TYPE_CHECKING: + from ..abc import Dictionary, DictionaryEntryType from ..config.train_g2p_config import TrainG2PConfig - from ..dictionary import Dictionary, DictionaryEntryType Labels = List[Any] @@ -490,8 +490,8 @@ class PyniniTrainer: Parameters ---------- - dictionary: Dictionary - Dictionary to train from` + dictionary: :class:`~montreal_forced_aligner.dictionary.PronunciationDictionary` + PronunciationDictionary to train from` model_path: str Output model path train_config: TrainG2PConfig @@ -651,7 +651,7 @@ def train(self, word_dict: Optional[Dict[str, DictionaryEntryType]] = None) -> N Parameters ---------- word_dict: Dict[str, DictionaryEntryType] - Dictionary of words to pronunciations, optional, defaults to the dictionary's + PronunciationDictionary of words to pronunciations, optional, defaults to the dictionary's set of words """ input_path = os.path.join(self.temp_directory, "input.txt") diff --git a/montreal_forced_aligner/helper.py b/montreal_forced_aligner/helper.py index cfcf2dad..f2563242 100644 --- a/montreal_forced_aligner/helper.py +++ b/montreal_forced_aligner/helper.py @@ -1,4 +1,8 @@ -"""Helper functions for MFA""" +""" +Helper functions +================ + +""" from __future__ import annotations import sys @@ -9,10 +13,8 @@ from colorama import Fore, Style if TYPE_CHECKING: - from .config import ConfigDict - from .corpus import CorpusMappingType, ScpType + from .abc import CorpusMappingType, Labels, MetaDict, ScpType -Labels = List[Any] __all__ = [ "TerminalPrinter", @@ -108,13 +110,13 @@ def print_block(self, block: dict, starting_level: int = 1) -> None: self.print_block(v, starting_level=starting_level + 1) print() - def print_config(self, configuration: ConfigDict) -> None: + def print_config(self, configuration: 
MetaDict) -> None: """ Pretty print a configuration Parameters ---------- - configuration: ConfigDict + configuration: :class:`~montreal_forced_aligner.abc.MetaDict` Configuration to print """ for k, v in configuration.items(): @@ -323,7 +325,7 @@ def load_scp(path: str, data_type: Optional[Type] = str) -> CorpusMappingType: Returns ------- dict - Dictionary where the keys are the first couple and the values are all + PronunciationDictionary where the keys are the first couple and the values are all other columns in the script file """ diff --git a/montreal_forced_aligner/lm/__init__.py b/montreal_forced_aligner/lm/__init__.py index b093be9f..290851df 100644 --- a/montreal_forced_aligner/lm/__init__.py +++ b/montreal_forced_aligner/lm/__init__.py @@ -1 +1,12 @@ -"""Classes for training language models""" +""" +Language modeling +================= + + +""" + +from .trainer import LmTrainer + +__all__ = ["trainer", "LmTrainer"] + +LmTrainer.__module__ = "montreal_forced_aligner.lm" diff --git a/montreal_forced_aligner/lm/trainer.py b/montreal_forced_aligner/lm/trainer.py index 0b0c5e84..daf64f22 100644 --- a/montreal_forced_aligner/lm/trainer.py +++ b/montreal_forced_aligner/lm/trainer.py @@ -12,8 +12,8 @@ from ..models import LanguageModel if TYPE_CHECKING: + from ..abc import Dictionary from ..config.train_lm_config import TrainLMConfig - from ..dictionary import DictionaryType __all__ = ["LmTrainer"] @@ -25,13 +25,13 @@ class LmTrainer: Parameters ---------- - source: class:`~montreal_forced_aligner.corpus.base.Corpus` or str + source: class:`~montreal_forced_aligner.corpus.Corpus` or str Either a alignable corpus or a path to an ARPA format language model config : class:`~montreal_forced_aligner.config.TrainLMConfig` Config class for training language model output_model_path : str Path to output trained model - dictionary : class:`~montreal_forced_aligner.dictionary.Dictionary`, optional + dictionary : 
class:`~montreal_forced_aligner.dictionary.PronunciationDictionary`, optional Optional dictionary to calculate unknown words temp_directory : str, optional Specifies the temporary directory root to save files need for Kaldi. @@ -51,7 +51,7 @@ def __init__( source: Union[Corpus, str], config: TrainLMConfig, output_model_path: str, - dictionary: Optional[DictionaryType] = None, + dictionary: Optional[Dictionary] = None, temp_directory: Optional[str] = None, supplemental_model_path: Optional[str] = None, supplemental_model_weight: int = 1, @@ -180,9 +180,7 @@ def train(self) -> None: training_path = os.path.join(self.temp_directory, "training.txt") with open(training_path, "w", encoding="utf8") as f: - for text in self.source.normalized_text_iter( - self.dictionary, self.config.count_threshold - ): + for text in self.source.normalized_text_iter(self.config.count_threshold): f.write(f"{text}\n") if self.dictionary is not None: diff --git a/montreal_forced_aligner/models.py b/montreal_forced_aligner/models.py index fd632bd5..7d1f0437 100644 --- a/montreal_forced_aligner/models.py +++ b/montreal_forced_aligner/models.py @@ -1,12 +1,17 @@ -"""Class definitions for Montreal Forced Aligner models""" +""" +Model classes +============= + +""" from __future__ import annotations import os from shutil import copy, copyfile, make_archive, move, rmtree, unpack_archive -from typing import TYPE_CHECKING, Any, Collection, Dict, Optional, Union +from typing import TYPE_CHECKING, Collection, Dict, Optional, Tuple, Union import yaml +from .abc import Dictionary, MetaDict, MfaModel, Trainer from .exceptions import ( LanguageModelNotFoundError, ModelLoadError, @@ -17,15 +22,10 @@ if TYPE_CHECKING: from logging import Logger - from .aligner.adapting import AdaptingAligner from .config import FeatureConfig + from .config.dictionary_config import DictionaryConfig from .config.train_config import TrainingConfig - from .dictionary import Dictionary - from .lm.trainer import LmTrainer - from 
.trainers import BaseTrainer - - TrainerType = Union[BaseTrainer, LmTrainer, AdaptingAligner] - MetaDict = Dict[str, Any] + from .dictionary import PronunciationDictionary # default format for output @@ -35,14 +35,14 @@ "Archive", "LanguageModel", "AcousticModel", - "IvectorExtractor", + "IvectorExtractorModel", "DictionaryModel", "G2PModel", "MODEL_TYPES", ] -class Archive: +class Archive(MfaModel): """ Class representing data in a directory or archive file (zip, tar, tar.gz/tgz) @@ -85,7 +85,7 @@ def __init__(self, source: str, root_directory: Optional[str] = None): def get_subclass_object( self, - ) -> Union[AcousticModel, G2PModel, LanguageModel, IvectorExtractor]: + ) -> Union[AcousticModel, G2PModel, LanguageModel, IvectorExtractorModel]: """ Instantiate subclass models based on files contained in the archive @@ -102,7 +102,7 @@ def get_subclass_object( if f.endswith(".arpa"): return LanguageModel(self.dirname, self.root_directory) if f == "final.ie": - return IvectorExtractor(self.dirname, self.root_directory) + return IvectorExtractorModel(self.dirname, self.root_directory) raise ModelLoadError(self.source) @classmethod @@ -168,13 +168,13 @@ def meta(self) -> dict: self._meta = yaml.safe_load(f) return self._meta - def add_meta_file(self, trainer: TrainerType) -> None: + def add_meta_file(self, trainer: Trainer) -> None: """ Add a metadata file from a given trainer to the model Parameters ---------- - trainer: TrainerType + trainer: Trainer The trainer to construct the metadata from """ with open(os.path.join(self.dirname, "meta.yaml"), "w", encoding="utf8") as f: @@ -255,17 +255,17 @@ class AcousticModel(Archive): files = ["final.mdl", "final.alimdl", "final.occs", "lda.mat", "tree"] extensions = [".zip", ".am"] - def add_meta_file(self, aligner: TrainerType) -> None: + def add_meta_file(self, trainer: Trainer) -> None: """ Add metadata file from a model trainer Parameters ---------- - aligner: TrainerType + trainer: 
:class:`~montreal_forced_aligner.abc.Trainer` Trainer to supply metadata information about the acoustic model """ with open(os.path.join(self.dirname, "meta.yaml"), "w", encoding="utf8") as f: - yaml.dump(aligner.meta, f) + yaml.dump(trainer.meta, f) @property def feature_config(self) -> FeatureConfig: @@ -278,7 +278,7 @@ def feature_config(self) -> FeatureConfig: fc.update(self.meta["features"]) return fc - def adaptation_config(self) -> TrainingConfig: + def adaptation_config(self) -> Tuple[TrainingConfig, DictionaryConfig]: """ Generate an adaptation configuration @@ -290,10 +290,10 @@ def adaptation_config(self) -> TrainingConfig: from .config.train_config import load_no_sat_adapt, load_sat_adapt if self.meta["features"]["fmllr"]: - train, align = load_sat_adapt() + train, align, dictionary = load_sat_adapt() else: - train, align = load_no_sat_adapt() - return train + train, align, dictionary = load_no_sat_adapt() + return train, dictionary @property def meta(self) -> MetaDict: @@ -430,8 +430,8 @@ def validate(self, dictionary: Union[Dictionary, G2PModel]) -> None: Parameters ---------- - dictionary: Union[Dictionary, G2PModel] - Dictionary or G2P model to compare phone sets with + dictionary: Union[DictionaryConfig, G2PModel] + PronunciationDictionary or G2P model to compare phone sets with Raises ------ @@ -441,12 +441,12 @@ def validate(self, dictionary: Union[Dictionary, G2PModel]) -> None: if isinstance(dictionary, G2PModel): missing_phones = dictionary.meta["phones"] - set(self.meta["phones"]) else: - missing_phones = dictionary.nonsil_phones - set(self.meta["phones"]) + missing_phones = dictionary.config.non_silence_phones - set(self.meta["phones"]) if missing_phones: raise (PronunciationAcousticMismatchError(missing_phones)) -class IvectorExtractor(Archive): +class IvectorExtractorModel(Archive): """ Model class for IvectorExtractor """ @@ -503,14 +503,16 @@ def feature_config(self) -> FeatureConfig: class G2PModel(Archive): extensions = [".zip", 
".g2p"] - def add_meta_file(self, dictionary: Dictionary, architecture: Optional[str] = None) -> None: + def add_meta_file( + self, dictionary: PronunciationDictionary, architecture: Optional[str] = None + ) -> None: """ Construct meta data information for the G2P model from the dictionary it was trained from Parameters ---------- - dictionary: Dictionary - Dictionary that was the training data for the G2P model + dictionary: PronunciationDictionary + PronunciationDictionary that was the training data for the G2P model architecture: str, optional Architecture of the G2P model, defaults to "pynini" """ @@ -520,7 +522,7 @@ def add_meta_file(self, dictionary: Dictionary, architecture: Optional[str] = No architecture = "pynini" with open(os.path.join(self.dirname, "meta.yaml"), "w", encoding="utf8") as f: meta = { - "phones": sorted(dictionary.nonsil_phones), + "phones": sorted(dictionary.config.non_silence_phones), "graphemes": sorted(dictionary.graphemes), "architecture": architecture, "version": get_mfa_version(), @@ -693,12 +695,133 @@ def add_arpa_file(self, arpa_path: str) -> None: copyfile(arpa_path, os.path.join(self.dirname, name)) -class DictionaryModel(Archive): +class DictionaryModel(MfaModel): """ Class for representing MFA pronunciation dictionaries """ - extensions = [".dict", f".{FORMAT}", ".txt", ".yaml"] + extensions = [".dict", ".txt", ".yaml", ".yml"] + + def __init__(self, path: str): + self.path = path + count = 0 + self.pronunciation_probabilities = True + self.silence_probabilities = True + with open(self.path, "r", encoding="utf8") as f: + for line in f: + line = line.strip() + if not line: + continue + line = line.split() + _ = line.pop(0) # word + next_item = line.pop(0) + if self.pronunciation_probabilities: + try: + prob = float(next_item) + if prob > 1 or prob < 0: + raise ValueError + except ValueError: + self.pronunciation_probabilities = False + try: + next_item = line.pop(0) + except IndexError: + self.silence_probabilities = False + 
if self.silence_probabilities: + try: + prob = float(next_item) + if prob > 1 or prob < 0: + raise ValueError + except ValueError: + self.silence_probabilities = False + count += 1 + if count > 10: + break + + @property + def meta(self) -> MetaDict: + return { + "pronunciation_probabilities": self.pronunciation_probabilities, + "silence_probabilities": self.silence_probabilities, + } + + def add_meta_file(self, trainer: Trainer) -> None: + raise NotImplementedError + + def pretty_print(self): + """ + Pretty print the dictionary's meta data using TerminalPrinter + """ + printer = TerminalPrinter() + configuration_data = {"Dictionary": {"name": (self.name, "green"), "data": self.meta}} + printer.print_config(configuration_data) + + @classmethod + def valid_extension(cls, filename: str) -> bool: + """ + Check whether a file has a valid extension for the given model archive + + Parameters + ---------- + filename: str + File name to check + + Returns + ------- + bool + True if the extension matches the models allowed extensions + """ + if os.path.splitext(filename)[1] in cls.extensions: + return True + return False + + @classmethod + def generate_path(cls, root: str, name: str, enforce_existence: bool = True) -> Optional[str]: + """ + Generate a path for a given model from the root directory and the name of the model + + Parameters + ---------- + root: str + Root directory for the full path + name: str + Name of the model + enforce_existence: bool + Flag to return None if the path doesn't exist, defaults to True + + Returns + ------- + str + Full path in the root directory for the model + """ + for ext in cls.extensions: + path = os.path.join(root, name + ext) + if os.path.exists(path) or not enforce_existence: + return path + return None + + @property + def is_multiple(self): + return os.path.splitext(self.path)[1] in [".yaml", ".yml"] + + @property + def name(self): + return os.path.splitext(os.path.basename(self.path))[0] + + def load_dictionary_paths(self) -> 
Dict[str, DictionaryModel]: + from .utils import get_available_dictionaries, get_dictionary_path + + mapping = {} + if self.is_multiple: + available_langs = get_available_dictionaries() + with open(self.path, "r", encoding="utf8") as f: + data = yaml.safe_load(f) + for speaker, path in data.items(): + if path in available_langs: + path = get_dictionary_path(path) + mapping[speaker] = DictionaryModel(path) + else: + mapping["default"] = self + return mapping MODEL_TYPES = { @@ -706,5 +829,5 @@ class DictionaryModel(Archive): "g2p": G2PModel, "dictionary": DictionaryModel, "language_model": LanguageModel, - "ivector": IvectorExtractor, + "ivector": IvectorExtractorModel, } diff --git a/montreal_forced_aligner/multiprocessing/__init__.py b/montreal_forced_aligner/multiprocessing/__init__.py index a5297b8f..a6ef7a77 100644 --- a/montreal_forced_aligner/multiprocessing/__init__.py +++ b/montreal_forced_aligner/multiprocessing/__init__.py @@ -1,6 +1,8 @@ -"""Multiprocessing functions and classes for Montreal Forced Aligner""" -from .alignment import acc_stats # noqa -from .alignment import align # noqa +""" +Multiprocessing functions +========================= + +""" from .alignment import calc_fmllr # noqa from .alignment import calc_lda_mllt # noqa from .alignment import compile_information # noqa @@ -12,8 +14,22 @@ from .alignment import lda_acc_stats # noqa from .alignment import mono_align_equal # noqa from .alignment import train_map # noqa -from .alignment import tree_stats # noqa; noqa -from .helper import Counter, Stopped, run_mp, run_non_mp # noqa +from .alignment import tree_stats # noqa +from .alignment import ( # noqa + CleanupWordCtmProcessWorker, + CombineProcessWorker, + ExportPreparationProcessWorker, + ExportTextGridProcessWorker, + NoCleanupWordCtmProcessWorker, + PhoneCtmProcessWorker, + acc_stats, + acc_stats_func, + align, + align_func, +) +from .classes import Job # noqa +from .corpus import CorpusProcessWorker # noqa +from .helper import Counter, 
ProcessWorker, Stopped, run_mp, run_non_mp # noqa from .ivector import acc_global_stats # noqa from .ivector import acc_ivector_stats # noqa from .ivector import extract_ivectors # noqa @@ -22,3 +38,27 @@ from .ivector import segment_vad # noqa from .pronunciations import generate_pronunciations # noqa from .transcription import transcribe, transcribe_fmllr # noqa + +__all__ = [ + "alignment", + "classes", + "corpus", + "features", + "helper", + "ivector", + "pronunciations", + "transcription", +] + +Job.__module__ = "montreal_forced_aligner.multiprocessing" +CleanupWordCtmProcessWorker.__module__ = "montreal_forced_aligner.multiprocessing" +CombineProcessWorker.__module__ = "montreal_forced_aligner.multiprocessing" +PhoneCtmProcessWorker.__module__ = "montreal_forced_aligner.multiprocessing" +ExportPreparationProcessWorker.__module__ = "montreal_forced_aligner.multiprocessing" +ExportTextGridProcessWorker.__module__ = "montreal_forced_aligner.multiprocessing" +NoCleanupWordCtmProcessWorker.__module__ = "montreal_forced_aligner.multiprocessing" + +CorpusProcessWorker.__module__ = "montreal_forced_aligner.multiprocessing" +Counter.__module__ = "montreal_forced_aligner.multiprocessing" +Stopped.__module__ = "montreal_forced_aligner.multiprocessing" +ProcessWorker.__module__ = "montreal_forced_aligner.multiprocessing" diff --git a/montreal_forced_aligner/multiprocessing/alignment.py b/montreal_forced_aligner/multiprocessing/alignment.py index e5cad626..57ad5c92 100644 --- a/montreal_forced_aligner/multiprocessing/alignment.py +++ b/montreal_forced_aligner/multiprocessing/alignment.py @@ -1,4 +1,8 @@ -"""Multiprocessing files for alignment functions in MFA""" +""" +Aligment functions +------------------ + +""" from __future__ import annotations import multiprocessing as mp @@ -28,9 +32,10 @@ from .helper import run_mp, run_non_mp if TYPE_CHECKING: + from ..abc import Aligner, CtmErrorDict, MetaDict, Trainer from ..aligner.adapting import AdaptingAligner from 
..aligner.base import BaseAligner - from ..config.align_config import AlignConfig, ConfigDict + from ..config.align_config import AlignConfig from ..corpus.classes import ( CleanupWordCtmArguments, CombineCtmArguments, @@ -40,17 +45,11 @@ PhoneCtmArguments, Utterance, ) - from ..textgrid import CtmInterval - from ..trainers import BaseTrainer, LdaTrainer, MonophoneTrainer + from ..data import CtmType + from ..trainers import BaseTrainer, LdaTrainer, MonophoneTrainer, SatTrainer ConfigType = Union[BaseTrainer, AlignConfig] - IterationType = Union[str, int] - - AlignerType = Union[BaseTrainer, BaseAligner] - CtmType = List[CtmInterval] - -CtmErrorDict = Dict[Tuple[str, int], str] queue_polling_timeout = 1 @@ -78,6 +77,22 @@ "lda_acc_stats", "train_map", "parse_iteration_alignments", + "convert_alignments_func", + "align_func", + "ali_to_ctm_func", + "compute_alignment_improvement_func", + "mono_align_equal_func", + "calc_fmllr_func", + "calc_lda_mllt_func", + "lda_acc_stats_func", + "tree_stats_func", + "map_acc_stats_func", + "acc_stats_two_feats_func", + "compile_information_func", + "compile_train_graphs_func", + "compile_utterance_train_graphs_func", + "test_utterances_func", + "acc_stats_func", ] @@ -124,20 +139,19 @@ def acc_stats_func( acc_proc.communicate() -def acc_stats(aligner: AlignerType): +def acc_stats(aligner: Trainer): """ - Multiprocessing function that computes stats for GMM training - - See http://kaldi-asr.org/doc/gmm-acc-stats-ali_8cc.html for more details - on the Kaldi binary this runs. 
- - Also see https://github.com/kaldi-asr/kaldi/blob/master/egs/wsj/s5/steps/train_mono.sh - for the bash script this function was extracted from + Multiprocessing function that accumulates stats for GMM training Parameters ---------- - aligner : :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner : Trainer Trainer + + Notes + ----- + See :kaldi_src:`gmmbin/gmm-acc-stats-ali` for more details on the Kaldi + binary, and :kaldi_steps:`train_mono` for an example Kaldi script """ arguments = [j.acc_stats_arguments(aligner) for j in aligner.corpus.jobs] @@ -232,13 +246,13 @@ def compile_train_graphs_func( model_path: str Path to the acoustic model file text_int_paths: Dict[str, str] - Dictionary of text int files per dictionary name + PronunciationDictionary of text int files per dictionary name disambig_paths: Dict[str, str] - Dictionary of disambiguation symbol int files per dictionary name + PronunciationDictionary of disambiguation symbol int files per dictionary name lexicon_fst_paths: Dict[str, str] - Dictionary of L.fst files per dictionary name + PronunciationDictionary of L.fst files per dictionary name fst_scp_paths: Dict[str, str] - Dictionary of utterance FST scp files per dictionary name + PronunciationDictionary of utterance FST scp files per dictionary name """ with open(log_path, "w", encoding="utf8") as log_file: for dict_name in dictionaries: @@ -262,7 +276,7 @@ def compile_train_graphs_func( proc.communicate() -def compile_train_graphs(aligner: AlignerType) -> None: +def compile_train_graphs(aligner: Union[BaseAligner, BaseTrainer]) -> None: """ Multiprocessing function that compiles training graphs for utterances @@ -274,7 +288,7 @@ def compile_train_graphs(aligner: AlignerType) -> None: Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner """ 
aligner.logger.debug("Compiling training graphs...") @@ -308,13 +322,13 @@ def mono_align_equal_func( dictionaries: List[str] List of dictionary names feature_strings: Dict[str, str] - Dictionary of feature strings per dictionary name + PronunciationDictionary of feature strings per dictionary name fst_scp_paths: Dict[str, str] - Dictionary of utterance FST scp files per dictionary name + PronunciationDictionary of utterance FST scp files per dictionary name ali_ark_paths: Dict[str, str] - Dictionary of alignment archives per dictionary name + PronunciationDictionary of alignment archives per dictionary name acc_paths: Dict[str, str] - Dictionary of accumulated stats files per dictionary name + PronunciationDictionary of accumulated stats files per dictionary name model_path: str Path to the acoustic model file """ @@ -362,7 +376,7 @@ def mono_align_equal(aligner: MonophoneTrainer): Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.MonophoneTrainer` + aligner: :class:`~montreal_forced_aligner.trainers.MonophoneTrainer` Monophone trainer """ @@ -413,7 +427,7 @@ def align_func( ali_paths: Dict[str, str], score_paths: Dict[str, str], loglike_paths: Dict[str, str], - align_options: ConfigDict, + align_options: MetaDict, ): """ Multiprocessing function for alignment @@ -425,18 +439,18 @@ def align_func( dictionaries: List[str] List of dictionary names fst_scp_paths: Dict[str, str] - Dictionary of FST scp file paths per dictionary name + PronunciationDictionary of FST scp file paths per dictionary name feature_strings: Dict[str, str] - Dictionary of feature strings per dictionary name + PronunciationDictionary of feature strings per dictionary name model_path: str Path to the acoustic model file ali_paths: Dict[str, str] - Dictionary of alignment archives per dictionary name + PronunciationDictionary of alignment archives per dictionary name score_paths: Dict[str, str] - Dictionary of scores files per dictionary name + PronunciationDictionary of 
scores files per dictionary name loglike_paths: Dict[str, str] - Dictionary of log likelihood files per dictionary name - align_options: ConfigDict + PronunciationDictionary of log likelihood files per dictionary name + align_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for alignment """ with open(log_path, "w", encoding="utf8") as log_file: @@ -481,7 +495,7 @@ def align_func( align_proc.communicate() -def align(aligner: AlignerType) -> None: +def align(aligner: Union[BaseAligner, BaseTrainer]) -> None: """ Multiprocessing function that aligns based on the current model @@ -494,7 +508,7 @@ def align(aligner: AlignerType) -> None: Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner """ begin = time.time() @@ -566,14 +580,14 @@ def compile_information_func(align_log_path: str) -> Dict[str, Union[List[str], return data -def compile_information(aligner: AlignerType) -> Tuple[Dict[str, str], float]: +def compile_information(aligner: Union[BaseAligner, BaseTrainer]) -> Tuple[Dict[str, str], float]: """ Compiles information about alignment, namely what the overall log-likelihood was and how many files were unaligned Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns @@ -647,11 +661,11 @@ def compute_alignment_improvement_func( model_path: str Path to the acoustic model file text_int_paths: Dict[str, str] - Dictionary of text int files per dictionary name + PronunciationDictionary of text int files per dictionary name word_boundary_paths: Dict[str, str] - Dictionary of word boundary files per dictionary name + PronunciationDictionary of word boundary files per dictionary name ali_paths: Dict[str, str] - Dictionary of alignment archives per dictionary name + PronunciationDictionary of alignment archives per 
dictionary name frame_shift: int Frame shift of feature generation, in ms reversed_phone_mappings: Dict[str, Dict[int, str]] @@ -659,7 +673,7 @@ def compute_alignment_improvement_func( positions: Dict[str, List[str]] Positions per dictionary name phone_ctm_paths: Dict[str, str] - Dictionary of phone ctm files per dictionary name + PronunciationDictionary of phone ctm files per dictionary name """ try: @@ -754,16 +768,16 @@ def compute_alignment_improvement_func( def parse_iteration_alignments( - aligner: AlignerType, iteration: Optional[IterationType] = None + aligner: Trainer, iteration: Optional[int] = None ) -> Dict[str, List[Tuple[float, float, str]]]: """ Function to parse phone CTMs in a given iteration Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Trainer Aligner - iteration: IterationType + iteration: int Iteration to compute over Returns ------- @@ -853,14 +867,14 @@ def compare_alignments( return utterances_aligned_diff, mean_difference -def compute_alignment_improvement(aligner: AlignerType) -> None: +def compute_alignment_improvement(aligner: Union[BaseAligner, BaseTrainer]) -> None: """ Computes aligner improvements in terms of number of aligned files and phone boundaries for debugging purposes Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: :class:`~montreal_forced_aligner.trainers.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` Aligner """ jobs = [x.alignment_improvement_arguments(aligner) for x in aligner.corpus.jobs] @@ -924,17 +938,17 @@ def ali_to_ctm_func( dictionaries: List[str] List of dictionary names ali_paths: Dict[str, str] - Dictionary of alignment archives per dictionary name + PronunciationDictionary of alignment archives per dictionary name text_int_paths: Dict[str, str] - Dictionary of text int files 
per dictionary name + PronunciationDictionary of text int files per dictionary name word_boundary_int_paths: Dict[str, str] - Dictionary of word boundary int files per dictionary name + PronunciationDictionary of word boundary int files per dictionary name frame_shift: float Frame shift of feature generation in seconds model_path: str Path to the acoustic model file ctm_paths: Dict[str, str] - Dictionary of CTM files per dictionary name + PronunciationDictionary of CTM files per dictionary name word_mode: bool Flag for whether to parse words or phones """ @@ -1016,10 +1030,10 @@ class NoCleanupWordCtmProcessWorker(mp.Process): Job name to_process_queue: :class:`~multiprocessing.Queue` Return queue of jobs for later workers to process - stopped: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + stopped: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Stop check for processing error_catching: CtmErrorDict - Dictionary for storing errors encountered + PronunciationDictionary for storing errors encountered arguments: :class:`~montreal_forced_aligner.multiprocessing.classes.NoCleanupWordCtmArguments` Arguments to pass to the CTM processing function """ @@ -1043,7 +1057,7 @@ def __init__( # Corpus information self.utterances = arguments.utterances - # Dictionary information + # PronunciationDictionary information self.dictionary_data = arguments.dictionary_data def run(self) -> None: @@ -1114,10 +1128,10 @@ class CleanupWordCtmProcessWorker(mp.Process): Job name to_process_queue: :class:`~multiprocessing.Queue` Return queue of jobs for later workers to process - stopped: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + stopped: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Stop check for processing error_catching: CtmErrorDict - Dictionary for storing errors encountered + PronunciationDictionary for storing errors encountered arguments: 
:class:`~montreal_forced_aligner.multiprocessing.classes.CleanupWordCtmArguments` Arguments to pass to the CTM processing function """ @@ -1141,7 +1155,7 @@ def __init__( # Corpus information self.utterances = arguments.utterances - # Dictionary information + # PronunciationDictionary information self.dictionary_data = arguments.dictionary_data def run(self) -> None: @@ -1214,10 +1228,10 @@ class PhoneCtmProcessWorker(mp.Process): Job name to_process_queue: :class:`~multiprocessing.Queue` Return queue of jobs for later workers to process - stopped: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + stopped: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Stop check for processing error_catching: CtmErrorDict - Dictionary for storing errors encountered + PronunciationDictionary for storing errors encountered arguments: :class:`~montreal_forced_aligner.multiprocessing.classes.PhoneCtmArguments` Arguments to pass to the CTM processing function """ @@ -1316,12 +1330,12 @@ class CombineProcessWorker(mp.Process): Input queue of phone and word ctms to combine to_export_queue: :class:`~multiprocessing.Queue` Export queue of combined CTMs - stopped: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + stopped: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Stop check for processing - finished_combining: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + finished_combining: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Signal that this worker has finished combining all CTMs error_catching: CtmErrorDict - Dictionary for storing errors encountered + PronunciationDictionary for storing errors encountered arguments: :class:`~montreal_forced_aligner.multiprocessing.classes.CombineCtmArguments` Arguments to pass to the CTM combining function """ @@ -1407,12 +1421,12 @@ class ExportTextGridProcessWorker(mp.Process): ---------- for_write_queue: :class:`~multiprocessing.Queue` Input queue of 
files to export - stopped: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + stopped: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Stop check for processing - finished_processing: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + finished_processing: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Input signal that all jobs have been added and no more new ones will come in textgrid_errors: CtmErrorDict - Dictionary for storing errors encountered + PronunciationDictionary for storing errors encountered arguments: :class:`~montreal_forced_aligner.multiprocessing.classes.ExportTextGridArguments` Arguments to pass to the TextGrid export function """ @@ -1474,9 +1488,9 @@ class ExportPreparationProcessWorker(mp.Process): Input queue of combined CTMs for_write_queue: :class:`~multiprocessing.Queue` Export queue of files to export - stopped: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + stopped: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Stop check for processing - finished_combining: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + finished_combining: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Input signal that all CTMs have been combined files: Dict[str, File] Files in corpus @@ -1531,13 +1545,13 @@ def run(self) -> None: raise -def ctms_to_textgrids_mp(aligner: AlignerType): +def ctms_to_textgrids_mp(aligner: Aligner): """ Multiprocessing function for exporting alignment CTM information as TextGrids Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner """ export_begin = time.time() @@ -1665,7 +1679,7 @@ def ctms_to_textgrids_mp(aligner: AlignerType): output_textgrid_writing_errors(aligner.textgrid_output, textgrid_errors) -def convert_ali_to_textgrids(aligner: AlignerType) -> None: +def 
convert_ali_to_textgrids(aligner: Aligner) -> None: """ Multiprocessing function that aligns based on the current model @@ -1684,7 +1698,7 @@ def convert_ali_to_textgrids(aligner: AlignerType) -> None: Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: :class:`~montreal_forced_aligner.abc.Aligner` Aligner """ log_directory = aligner.working_log_directory @@ -1729,11 +1743,11 @@ def tree_stats_func( model_path: str Path to the acoustic model file feature_strings: Dict[str, str] - Dictionary of feature strings per dictionary name + PronunciationDictionary of feature strings per dictionary name ali_paths: Dict[str, str] - Dictionary of alignment archives per dictionary name + PronunciationDictionary of alignment archives per dictionary name treeacc_paths: Dict[str, str] - Dictionary of accumulated tree stats files per dictionary name + PronunciationDictionary of accumulated tree stats files per dictionary name """ with open(log_path, "w", encoding="utf8") as log_file: for dict_name in dictionaries: @@ -1753,7 +1767,7 @@ def tree_stats_func( ) -def tree_stats(aligner: AlignerType) -> None: +def tree_stats(trainer: Trainer) -> None: """ Multiprocessing function that computes stats for decision tree training @@ -1762,31 +1776,31 @@ def tree_stats(aligner: AlignerType) -> None: Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` - Aligner + trainer: :class:`~montreal_forced_aligner.abc.Trainer` + Trainer """ - jobs = [j.tree_stats_arguments(aligner) for j in aligner.corpus.jobs] + jobs = [j.tree_stats_arguments(trainer) for j in trainer.corpus.jobs] - if aligner.use_mp: - run_mp(tree_stats_func, jobs, aligner.working_log_directory) + if trainer.use_mp: + run_mp(tree_stats_func, jobs, trainer.working_log_directory) else: - run_non_mp(tree_stats_func, jobs, 
aligner.working_log_directory) + run_non_mp(tree_stats_func, jobs, trainer.working_log_directory) tree_accs = [] for x in jobs: tree_accs.extend(x.treeacc_paths.values()) - log_path = os.path.join(aligner.working_log_directory, "sum_tree_acc.log") + log_path = os.path.join(trainer.working_log_directory, "sum_tree_acc.log") with open(log_path, "w", encoding="utf8") as log_file: subprocess.call( [ thirdparty_binary("sum-tree-stats"), - os.path.join(aligner.working_directory, "treeacc"), + os.path.join(trainer.working_directory, "treeacc"), ] + tree_accs, stderr=log_file, ) - if not aligner.debug: + if not trainer.debug: for f in tree_accs: os.remove(f) @@ -1816,9 +1830,9 @@ def convert_alignments_func( align_model_path: str Path to the alignment acoustic model file ali_paths: Dict[str, str] - Dictionary of alignment archives per dictionary name + PronunciationDictionary of alignment archives per dictionary name new_ali_paths: Dict[str, str] - Dictionary of new alignment archives per dictionary name + PronunciationDictionary of new alignment archives per dictionary name """ with open(log_path, "w", encoding="utf8") as log_file: for dict_name in dictionaries: @@ -1837,7 +1851,7 @@ def convert_alignments_func( ) -def convert_alignments(aligner: AlignerType) -> None: +def convert_alignments(trainer: Trainer) -> None: """ Multiprocessing function that converts alignments from previous training @@ -1846,15 +1860,15 @@ def convert_alignments(aligner: AlignerType) -> None: Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` - Aligner + trainer: :class:`~montreal_forced_aligner.abc.Trainer` + Trainer """ - jobs = [x.convert_alignment_arguments(aligner) for x in aligner.corpus.jobs] - if aligner.use_mp: - run_mp(convert_alignments_func, jobs, aligner.working_log_directory) + jobs = [x.convert_alignment_arguments(trainer) for x in trainer.corpus.jobs] + if trainer.use_mp: + 
run_mp(convert_alignments_func, jobs, trainer.working_log_directory) else: - run_non_mp(convert_alignments_func, jobs, aligner.working_log_directory) + run_non_mp(convert_alignments_func, jobs, trainer.working_log_directory) def calc_fmllr_func( @@ -1866,7 +1880,7 @@ def calc_fmllr_func( model_path: str, spk2utt_paths: Dict[str, str], trans_paths: Dict[str, str], - fmllr_options: ConfigDict, + fmllr_options: MetaDict, ) -> None: """ Multiprocessing function for calculating fMLLR transforms @@ -1878,18 +1892,18 @@ def calc_fmllr_func( dictionaries: List[str] List of dictionary names feature_strings: Dict[str, str] - Dictionary of feature strings per dictionary name + PronunciationDictionary of feature strings per dictionary name ali_paths: Dict[str, str] - Dictionary of alignment archives per dictionary name + PronunciationDictionary of alignment archives per dictionary name ali_model_path: str Path to the alignment acoustic model file model_path: str Path to the acoustic model file spk2utt_paths: Dict[str, str] - Dictionary of spk2utt scps per dictionary name + PronunciationDictionary of spk2utt scps per dictionary name trans_paths: Dict[str, str] - Dictionary of fMLLR transform archives per dictionary name - fmllr_options: ConfigDict + PronunciationDictionary of fMLLR transform archives per dictionary name + fmllr_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for fMLLR estimation """ with open(log_path, "w", encoding="utf8") as log_file: @@ -2007,7 +2021,7 @@ def calc_fmllr_func( est_proc.communicate() -def calc_fmllr(aligner: AlignerType) -> None: +def calc_fmllr(aligner: Aligner) -> None: """ Multiprocessing function that computes speaker adaptation (fMLLR) @@ -2027,7 +2041,7 @@ def calc_fmllr(aligner: AlignerType) -> None: Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: :class:`~montreal_forced_aligner.abc.Aligner` Aligner """ begin = 
time.time() @@ -2062,15 +2076,15 @@ def acc_stats_two_feats_func( dictionaries: List[str] List of dictionary names ali_paths: Dict[str, str] - Dictionary of alignment archives per dictionary name + PronunciationDictionary of alignment archives per dictionary name acc_paths: Dict[str, str] - Dictionary of accumulated stats files per dictionary name + PronunciationDictionary of accumulated stats files per dictionary name model_path: str Path to the acoustic model file feature_strings: Dict[str, str] - Dictionary of feature strings per dictionary name + PronunciationDictionary of feature strings per dictionary name si_feature_strings: Dict[str, str] - Dictionary of speaker-independent feature strings per dictionary name + PronunciationDictionary of speaker-independent feature strings per dictionary name """ with open(log_path, "w", encoding="utf8") as log_file: for dict_name in dictionaries: @@ -2100,14 +2114,14 @@ def acc_stats_two_feats_func( acc_proc.communicate() -def create_align_model(aligner: AlignerType) -> None: +def create_align_model(aligner: SatTrainer) -> None: """ Create alignment model for speaker-adapted training that will use speaker-independent features in later aligning Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: :class:`~montreal_forced_aligner.trainers.SatTrainer` Aligner """ aligner.logger.info("Creating alignment model for speaker-independent features...") @@ -2161,7 +2175,7 @@ def lda_acc_stats_func( feature_strings: Dict[str, str], ali_paths: Dict[str, str], model_path: str, - lda_options: ConfigDict, + lda_options: MetaDict, acc_paths: Dict[str, str], ) -> None: """ @@ -2174,12 +2188,12 @@ def lda_acc_stats_func( dictionaries: List[str] List of dictionary names feature_strings: Dict[str, str] - Dictionary of feature strings per dictionary name + PronunciationDictionary of feature strings per dictionary name ali_paths: Dict[str, str] 
Dictionary of alignment archives per dictionary name model_path: str Path to the acoustic model file - lda_options: ConfigDict + lda_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for LDA acc_paths: Dict[str, str] Dictionary of accumulated stats files per dictionary name @@ -2244,7 +2258,7 @@ def lda_acc_stats(aligner: LdaTrainer) -> None: Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.LdaTrainer` + aligner: :class:`~montreal_forced_aligner.trainers.LdaTrainer` Trainer """ arguments = [x.lda_acc_stats_arguments(aligner) for x in aligner.corpus.jobs] @@ -2279,7 +2293,7 @@ def calc_lda_mllt_func( feature_strings: Dict[str, str], ali_paths: Dict[str, str], model_path: str, - lda_options: ConfigDict, + lda_options: MetaDict, macc_paths: Dict[str, str], ) -> None: """ @@ -2297,7 +2311,7 @@ def calc_lda_mllt_func( Dictionary of alignment archives per dictionary name model_path: str Path to the acoustic model file - lda_options: ConfigDict + lda_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for LDA macc_paths: Dict[str, str] Dictionary of accumulated stats files per dictionary name @@ -2366,7 +2380,7 @@ def calc_lda_mllt(aligner: LdaTrainer) -> None: Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.LdaTrainer` + aligner: :class:`~montreal_forced_aligner.trainers.LdaTrainer` Trainer """ jobs = [x.calc_lda_mllt_arguments(aligner) for x in aligner.corpus.jobs] @@ -2438,13 +2452,13 @@ def map_acc_stats_func( dictionaries: List[str] List of dictionary names feature_strings: Dict[str, str] - Dictionary of feature strings per dictionary name + PronunciationDictionary of feature strings per dictionary name model_path: str Path to the acoustic model file ali_paths: Dict[str, str] - Dictionary of alignment archives per dictionary name + PronunciationDictionary of alignment archives per dictionary name acc_paths: Dict[str, str] - Dictionary of accumulated stats files per dictionary name + 
PronunciationDictionary of accumulated stats files per dictionary name """ with open(log_path, "w", encoding="utf8") as log_file: for dict_name in dictionaries: @@ -2604,17 +2618,17 @@ def test_utterances_func( dictionaries: List[str] List of dictionaries feature_strings: Dict[str, str] - Dictionary of feature strings per dictionary name + PronunciationDictionary of feature strings per dictionary name words_paths: Dict[str, str] - Dictionary of word mapping files per dictionary name + PronunciationDictionary of word mapping files per dictionary name graphs_paths: Dict[str, str] - Dictionary of utterance FST graph archives per dictionary name + PronunciationDictionary of utterance FST graph archives per dictionary name text_int_paths: Dict[str, str] - Dictionary of text.int files per dictionary name + PronunciationDictionary of text.int files per dictionary name edits_paths: Dict[str, str] - Dictionary of paths to save transcription differences per dictionary name + PronunciationDictionary of paths to save transcription differences per dictionary name out_int_paths: Dict[str, str] - Dictionary of output .int files per dictionary name + PronunciationDictionary of output .int files per dictionary name model_path: str Acoustic model path """ @@ -2681,13 +2695,13 @@ def compile_utterance_train_graphs_func( dictionaries: List[str] List of dictionaries disambig_int_paths: Dict[str, str] - Dictionary of disambiguation symbol int files per dictionary name + PronunciationDictionary of disambiguation symbol int files per dictionary name disambig_L_fst_paths: Dict[str, str] - Dictionary of disambiguation lexicon FSTs per dictionary name + PronunciationDictionary of disambiguation lexicon FSTs per dictionary name fst_paths: Dict[str, str] - Dictionary of pregenerated utterance FST scp files per dictionary name + PronunciationDictionary of pregenerated utterance FST scp files per dictionary name graphs_paths: Dict[str, str] - Dictionary of utterance FST graph archives per 
dictionary name + PronunciationDictionary of utterance FST graph archives per dictionary name model_path: str Acoustic model path tree_path: str diff --git a/montreal_forced_aligner/multiprocessing/classes.py b/montreal_forced_aligner/multiprocessing/classes.py index 5e16c9d4..edf89703 100644 --- a/montreal_forced_aligner/multiprocessing/classes.py +++ b/montreal_forced_aligner/multiprocessing/classes.py @@ -1,38 +1,37 @@ -"""Class definitions for multiprocessing Jobs""" +""" +Multiprocessing classes +----------------------- + +""" from __future__ import annotations import os -from typing import TYPE_CHECKING, Collection, Dict, List, NamedTuple, Set, Tuple, Union +from typing import TYPE_CHECKING, Collection, Dict, List, NamedTuple, Optional, Set, Tuple + +if TYPE_CHECKING: + from ..corpus.classes import File, Speaker, Utterance -from ..corpus.classes import File, Speaker, Utterance +from ..abc import IvectorExtractor, MetaDict, MfaWorker from ..helper import output_mapping, save_scp if TYPE_CHECKING: + from ..abc import Aligner, MappingType, ReversedMappingType, WordsType from ..aligner.adapting import AdaptingAligner from ..aligner.base import BaseAligner from ..config import FeatureConfig - from ..config.align_config import AlignConfig, ConfigDict from ..corpus import Corpus - from ..dictionary import ( - Dictionary, - DictionaryData, - MappingType, - ReversedMappingType, - WordsType, - ) + from ..dictionary import DictionaryData from ..segmenter import Segmenter - from ..speaker_classifier import SpeakerClassifier - from ..trainers import BaseTrainer, IvectorExtractorTrainer, LdaTrainer, SatTrainer + from ..trainers import ( + BaseTrainer, + IvectorExtractorTrainer, + LdaTrainer, + MonophoneTrainer, + SatTrainer, + ) from ..transcriber import Transcriber from ..validator import CorpusValidator - ConfigType = Union[BaseTrainer, AlignConfig] - FmllrConfigType = Union[SatTrainer, AlignConfig] - LdaConfigType = Union[LdaTrainer, AlignConfig] - - IterationType = 
Union[str, int] - - AlignerType = Union[BaseTrainer, BaseAligner] __all__ = [ "Job", @@ -65,7 +64,6 @@ "LdaAccStatsArguments", "MapAccStatsArguments", "GaussToPostArguments", - "ClassifySpeakersArguments", "InitialFmllrArguments", "ExtractIvectorsArguments", "ExportTextGridArguments", @@ -86,7 +84,7 @@ class VadArguments(NamedTuple): dictionaries: List[str] feats_scp_paths: Dict[str, str] vad_scp_paths: Dict[str, str] - vad_options: ConfigDict + vad_options: MetaDict class MfccArguments(NamedTuple): @@ -100,7 +98,7 @@ class MfccArguments(NamedTuple): lengths_paths: Dict[str, str] segment_paths: Dict[str, str] wav_paths: Dict[str, str] - mfcc_options: ConfigDict + mfcc_options: MetaDict class CompileTrainGraphsArguments(NamedTuple): @@ -150,7 +148,7 @@ class AlignArguments(NamedTuple): ali_paths: Dict[str, str] score_paths: Dict[str, str] loglike_paths: Dict[str, str] - align_options: ConfigDict + align_options: MetaDict class CompileInformationArguments(NamedTuple): @@ -269,7 +267,7 @@ class CalcFmllrArguments(NamedTuple): model_path: str spk2utt_paths: Dict[str, str] trans_paths: Dict[str, str] - fmllr_options: ConfigDict + fmllr_options: MetaDict class AccStatsTwoFeatsArguments(NamedTuple): @@ -292,7 +290,7 @@ class LdaAccStatsArguments(NamedTuple): feature_strings: Dict[str, str] ali_paths: Dict[str, str] model_path: str - lda_options: ConfigDict + lda_options: MetaDict acc_paths: Dict[str, str] @@ -304,7 +302,7 @@ class CalcLdaMlltArguments(NamedTuple): feature_strings: Dict[str, str] ali_paths: Dict[str, str] model_path: str - lda_options: ConfigDict + lda_options: MetaDict macc_paths: Dict[str, str] @@ -325,7 +323,7 @@ class GmmGselectArguments(NamedTuple): log_path: str dictionaries: List[str] feature_strings: Dict[str, str] - ivector_options: ConfigDict + ivector_options: MetaDict dubm_model: str gselect_paths: Dict[str, str] @@ -336,7 +334,7 @@ class AccGlobalStatsArguments(NamedTuple): log_path: str dictionaries: List[str] feature_strings: Dict[str, str] 
- ivector_options: ConfigDict + ivector_options: MetaDict gselect_paths: Dict[str, str] acc_paths: Dict[str, str] dubm_path: str @@ -348,7 +346,7 @@ class GaussToPostArguments(NamedTuple): log_path: str dictionaries: List[str] feature_strings: Dict[str, str] - ivector_options: ConfigDict + ivector_options: MetaDict post_paths: Dict[str, str] dubm_path: str @@ -359,7 +357,7 @@ class AccIvectorStatsArguments(NamedTuple): log_path: str dictionaries: List[str] feature_strings: Dict[str, str] - ivector_options: ConfigDict + ivector_options: MetaDict ie_path: str post_paths: Dict[str, str] acc_init_paths: Dict[str, str] @@ -371,7 +369,7 @@ class ExtractIvectorsArguments(NamedTuple): log_path: str dictionaries: List[str] feature_strings: Dict[str, str] - ivector_options: ConfigDict + ivector_options: MetaDict ali_paths: Dict[str, str] ie_path: str ivector_paths: Dict[str, str] @@ -408,21 +406,11 @@ class TestUtterancesArguments(NamedTuple): class SegmentVadArguments(NamedTuple): - """Arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.segment_vad_func`""" + """Arguments for :func:`~montreal_forced_aligner.multiprocessing.ivector.segment_vad_func`""" dictionaries: List[str] vad_paths: Dict[str, str] - segmentation_options: ConfigDict - - -class ClassifySpeakersArguments(NamedTuple): - """Arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.classify_speakers_func`""" - - log_path: str - dictionaries: List[str] - model_path: str - labels_path: str - ivector_paths: Dict[str, str] + segmentation_options: MetaDict class GeneratePronunciationsArguments(NamedTuple): @@ -451,7 +439,7 @@ class CreateHclgArguments(NamedTuple): model_path: str disambig_L_path: str disambig_int_path: str - hclg_options: ConfigDict + hclg_options: MetaDict words_mapping: MappingType @property @@ -465,7 +453,7 @@ class DecodeArguments(NamedTuple): log_path: str dictionaries: List[str] feature_strings: Dict[str, str] - decode_options: ConfigDict + decode_options: 
MetaDict model_path: str lat_paths: Dict[str, str] words_paths: Dict[str, str] @@ -477,7 +465,7 @@ class ScoreArguments(NamedTuple): log_path: str dictionaries: List[str] - score_options: ConfigDict + score_options: MetaDict lat_paths: Dict[str, str] rescored_lat_paths: Dict[str, str] carpa_rescored_lat_paths: Dict[str, str] @@ -490,7 +478,7 @@ class LmRescoreArguments(NamedTuple): log_path: str dictionaries: List[str] - lm_rescore_options: ConfigDict + lm_rescore_options: MetaDict lat_paths: Dict[str, str] rescored_lat_paths: Dict[str, str] old_g_paths: Dict[str, str] @@ -515,7 +503,7 @@ class InitialFmllrArguments(NamedTuple): dictionaries: List[str] feature_strings: Dict[str, str] model_path: str - fmllr_options: ConfigDict + fmllr_options: MetaDict pre_trans_paths: Dict[str, str] lat_paths: Dict[str, str] spk2utt_paths: Dict[str, str] @@ -528,7 +516,7 @@ class LatGenFmllrArguments(NamedTuple): dictionaries: List[str] feature_strings: Dict[str, str] model_path: str - decode_options: ConfigDict + decode_options: MetaDict words_paths: Dict[str, str] hclg_paths: Dict[str, str] tmp_lat_paths: Dict[str, str] @@ -541,7 +529,7 @@ class FinalFmllrArguments(NamedTuple): dictionaries: List[str] feature_strings: Dict[str, str] model_path: str - fmllr_options: ConfigDict + fmllr_options: MetaDict trans_paths: Dict[str, str] spk2utt_paths: Dict[str, str] tmp_lat_paths: Dict[str, str] @@ -554,7 +542,7 @@ class FmllrRescoreArguments(NamedTuple): dictionaries: List[str] feature_strings: Dict[str, str] model_path: str - fmllr_options: ConfigDict + fmllr_options: MetaDict tmp_lat_paths: Dict[str, str] final_lat_paths: Dict[str, str] @@ -573,16 +561,16 @@ class Job: Attributes ---------- - speakers: List[:class:`~montreal_forced_aligner.corpus.classes.Speaker`] + speakers: List[:class:`~montreal_forced_aligner.corpus.Speaker`] List of speakers associated with this job - dictionaries: Set[:class:`~montreal_forced_aligner.dictionary.Dictionary`] + dictionaries: 
Set[:class:`~montreal_forced_aligner.dictionary.PronunciationDictionary`] Set of dictionaries that the job's speakers use - subset_utts: Set[:class:`~montreal_forced_aligner.corpus.classes.Utterance`] + subset_utts: Set[:class:`~montreal_forced_aligner.corpus.Utterance`] When trainers are just using a subset of the corpus, the subset of utterances on each job will be set and used to filter the job's utterances - subset_speakers: Set[:class:`~montreal_forced_aligner.corpus.classes.Speaker`] + subset_speakers: Set[:class:`~montreal_forced_aligner.corpus.Speaker`] When subset_utts is set, this property will be calculated as the subset of speakers that the utterances correspond to - subset_dictionaries: Set[:class:`~montreal_forced_aligner.dictionary.Dictionary`] + subset_dictionaries: Set[:class:`~montreal_forced_aligner.dictionary.PronunciationDictionary`] Subset of dictionaries that the subset of speakers use """ @@ -590,11 +578,11 @@ class Job: def __init__(self, name: int): self.name = name self.speakers: List[Speaker] = [] - self.dictionaries: Set[Dictionary] = set() + self.dictionaries = set() - self.subset_utts: Set[Utterance] = set() - self.subset_speakers: Set[Speaker] = set() - self.subset_dictionaries: Set[Dictionary] = set() + self.subset_utts = set() + self.subset_speakers = set() + self.subset_dictionaries = set() def add_speaker(self, speaker: Speaker) -> None: """ @@ -602,19 +590,19 @@ def add_speaker(self, speaker: Speaker) -> None: Parameters ---------- - speaker: :class:`~montreal_forced_aligner.corpus.classes.Speaker` + speaker: :class:`~montreal_forced_aligner.corpus.Speaker` Speaker to add """ self.speakers.append(speaker) self.dictionaries.add(speaker.dictionary) - def set_subset(self, subset_utts: Collection[Utterance]) -> None: + def set_subset(self, subset_utts: Optional[Collection[Utterance]]) -> None: """ Set the current subset for the trainer Parameters ---------- - subset_utts: 
List[:class:`~montreal_forced_aligner.corpus.classes.Utterance`] + subset_utts: Collection[:class:`~montreal_forced_aligner.corpus.Utterance`], optional Subset of utterances for this job to use """ if subset_utts is None: @@ -934,7 +922,7 @@ def set_feature_config(self, feature_config: FeatureConfig) -> None: Parameters ---------- - feature_config: :class:`~montreal_forced_aligner.config.features.FeatureConfig` + feature_config: :class:`~montreal_forced_aligner.config.FeatureConfig` Feature configuration """ self.feature_config = feature_config @@ -947,7 +935,7 @@ def construct_base_feature_string(self, corpus: Corpus, all_feats: bool = False) Parameters ---------- - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to use as the source all_feats: bool Flag for whether all features across all jobs should be taken into account @@ -979,7 +967,7 @@ def construct_base_feature_string(self, corpus: Corpus, all_feats: bool = False) def construct_feature_proc_strings( self, - aligner: Union[AlignerType, SpeakerClassifier, Transcriber], + aligner: MfaWorker, speaker_independent: bool = False, ) -> Dict[str, str]: """ @@ -988,7 +976,7 @@ def construct_feature_proc_strings( Parameters ---------- - aligner: Union[AlignerType, :class:`~montreal_forced_aligner.speaker_classifier.SpeakerClassifier`, :class:`~montreal_forced_aligner.transcriber.Transcriber`] + aligner: :class:`~montreal_forced_aligner.abc.MfaWorker` Aligner, Transcriber or other main utility class that uses the features speaker_independent: bool Flag for whether features should be speaker-independent regardless of the presence of fMLLR transforms @@ -1066,7 +1054,7 @@ def compile_utterance_train_graphs_arguments( Returns ------- - CompileUtteranceTrainGraphsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.CompileUtteranceTrainGraphsArguments` Arguments for processing """ dictionary_paths = 
validator.dictionary.output_paths @@ -1103,7 +1091,7 @@ def test_utterances_arguments(self, validator: CorpusValidator) -> TestUtterance Returns ------- - TestUtterancesArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.TestUtterancesArguments` Arguments for processing """ dictionary_paths = validator.dictionary.output_paths @@ -1123,19 +1111,19 @@ def test_utterances_arguments(self, validator: CorpusValidator) -> TestUtterance ) def extract_ivector_arguments( - self, ivector_extractor: SpeakerClassifier + self, ivector_extractor: IvectorExtractor ) -> ExtractIvectorsArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.ivector.extract_ivectors_func` Parameters ---------- - ivector_extractor: :class:`~montreal_forced_aligner.speaker_classifier.SpeakerClassifier` - Speaker classifier + ivector_extractor: :class:`~montreal_forced_aligner.abc.IvectorExtractor` + Ivector extractor Returns ------- - ExtractIvectorsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.ExtractIvectorsArguments` Arguments for processing """ return ExtractIvectorsArguments( @@ -1164,7 +1152,7 @@ def create_hclgs_arguments(self, transcriber: Transcriber) -> Dict[str, CreateHc Returns ------- - Dict[str, CreateHclgArguments] + Dict[str, :class:`~montreal_forced_aligner.multiprocessing.classes.CreateHclgArguments`] Per dictionary arguments for HCLG """ args = {} @@ -1199,7 +1187,7 @@ def decode_arguments(self, transcriber: Transcriber) -> DecodeArguments: Returns ------- - DecodeArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.DecodeArguments` Arguments for processing """ return DecodeArguments( @@ -1224,7 +1212,7 @@ def score_arguments(self, transcriber: Transcriber) -> ScoreArguments: Returns ------- - ScoreArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.ScoreArguments` Arguments for processing """ return ScoreArguments( @@ -1251,7 +1239,7 @@ def lm_rescore_arguments(self, 
transcriber: Transcriber) -> LmRescoreArguments: Returns ------- - LmRescoreArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.LmRescoreArguments` Arguments for processing """ return LmRescoreArguments( @@ -1277,7 +1265,7 @@ def carpa_lm_rescore_arguments(self, transcriber: Transcriber) -> CarpaLmRescore Returns ------- - CarpaLmRescoreArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.CarpaLmRescoreArguments` Arguments for processing """ return CarpaLmRescoreArguments( @@ -1302,7 +1290,7 @@ def initial_fmllr_arguments(self, transcriber: Transcriber) -> InitialFmllrArgum Returns ------- - InitialFmllrArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.InitialFmllrArguments` Arguments for processing """ return InitialFmllrArguments( @@ -1327,7 +1315,7 @@ def lat_gen_fmllr_arguments(self, transcriber: Transcriber) -> LatGenFmllrArgume Returns ------- - LatGenFmllrArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.LatGenFmllrArguments` Arguments for processing """ return LatGenFmllrArguments( @@ -1352,7 +1340,7 @@ def final_fmllr_arguments(self, transcriber: Transcriber) -> FinalFmllrArguments Returns ------- - FinalFmllrArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.FinalFmllrArguments` Arguments for processing """ return FinalFmllrArguments( @@ -1377,7 +1365,7 @@ def fmllr_rescore_arguments(self, transcriber: Transcriber) -> FmllrRescoreArgum Returns ------- - FmllrRescoreArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.FmllrRescoreArguments` Arguments for processing """ return FmllrRescoreArguments( @@ -1396,12 +1384,12 @@ def vad_arguments(self, corpus: Corpus) -> VadArguments: Parameters ---------- - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus Returns ------- - VadArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.VadArguments` Arguments for processing 
""" return VadArguments( @@ -1414,7 +1402,7 @@ def vad_arguments(self, corpus: Corpus) -> VadArguments: def segments_vad_arguments(self, segmenter: Segmenter) -> SegmentVadArguments: """ - Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.segment_vad_func` + Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.ivector.segment_vad_func` Parameters ---------- @@ -1423,7 +1411,7 @@ def segments_vad_arguments(self, segmenter: Segmenter) -> SegmentVadArguments: Returns ------- - SegmentVadArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.SegmentVadArguments` Arguments for processing """ return SegmentVadArguments( @@ -1438,12 +1426,12 @@ def mfcc_arguments(self, corpus: Corpus) -> MfccArguments: Parameters ---------- - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus Returns ------- - MfccArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.MfccArguments` Arguments for processing """ return MfccArguments( @@ -1456,18 +1444,18 @@ def mfcc_arguments(self, corpus: Corpus) -> MfccArguments: self.feature_config.mfcc_options, ) - def acc_stats_arguments(self, aligner: AlignerType) -> AccStatsArguments: + def acc_stats_arguments(self, aligner: BaseTrainer) -> AccStatsArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.acc_stats_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: :class:`~montreal_forced_aligner.trainers.BaseTrainer` Aligner Returns ------- - AccStatsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.AccStatsArguments` Arguments for processing """ return AccStatsArguments( @@ -1483,18 +1471,18 @@ def acc_stats_arguments(self, aligner: AlignerType) -> AccStatsArguments: aligner.current_model_path, ) - def 
mono_align_equal_arguments(self, aligner: AlignerType) -> MonoAlignEqualArguments: + def mono_align_equal_arguments(self, aligner: MonophoneTrainer) -> MonoAlignEqualArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.mono_align_equal_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: :class:`~montreal_forced_aligner.trainers.MonophoneTrainer` Aligner Returns ------- - MonoAlignEqualArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.MonoAlignEqualArguments` Arguments for processing """ return MonoAlignEqualArguments( @@ -1507,18 +1495,18 @@ def mono_align_equal_arguments(self, aligner: AlignerType) -> MonoAlignEqualArgu aligner.current_model_path, ) - def align_arguments(self, aligner: AlignerType) -> AlignArguments: + def align_arguments(self, aligner: Aligner) -> AlignArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.align_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - AlignArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.AlignArguments` Arguments for processing """ if aligner.iteration is not None: @@ -1539,18 +1527,18 @@ def align_arguments(self, aligner: AlignerType) -> AlignArguments: aligner.align_options, ) - def compile_information_arguments(self, aligner: AlignerType) -> CompileInformationArguments: + def compile_information_arguments(self, aligner: BaseTrainer) -> CompileInformationArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.compile_information_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: 
:class:`~montreal_forced_aligner.trainers.BaseTrainer` Aligner Returns ------- - CompileInformationArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.CompileInformationArguments` Arguments for processing """ if aligner.iteration is not None: @@ -1772,19 +1760,19 @@ def multilingual_ipa(self) -> Dict[str, bool]: return data def generate_pronunciations_arguments( - self, aligner: AlignerType + self, aligner: Aligner ) -> GeneratePronunciationsArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.pronunciations.generate_pronunciations_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - GeneratePronunciationsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.GeneratePronunciationsArguments` Arguments for processing """ return GeneratePronunciationsArguments( @@ -1795,24 +1783,24 @@ def generate_pronunciations_arguments( self.construct_path_dictionary(aligner.data_directory, "text", "int.scp"), self.word_boundary_int_files(), self.construct_path_dictionary(aligner.working_directory, "ali", "ark"), - aligner.current_model_path, + aligner.model_path, self.construct_path_dictionary(aligner.working_directory, "prons", "scp"), ) def alignment_improvement_arguments( - self, aligner: AlignerType + self, aligner: BaseTrainer ) -> AlignmentImprovementArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.compute_alignment_improvement_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: :class:`~montreal_forced_aligner.trainers.BaseTrainer` Aligner Returns ------- - AlignmentImprovementArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.AlignmentImprovementArguments` Arguments for processing """ return 
AlignmentImprovementArguments( @@ -1830,18 +1818,18 @@ def alignment_improvement_arguments( ), ) - def ali_to_word_ctm_arguments(self, aligner: AlignerType) -> AliToCtmArguments: + def ali_to_word_ctm_arguments(self, aligner: BaseAligner) -> AliToCtmArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.ali_to_ctm_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - AliToCtmArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.AliToCtmArguments` Arguments for processing """ return AliToCtmArguments( @@ -1856,18 +1844,18 @@ def ali_to_word_ctm_arguments(self, aligner: AlignerType) -> AliToCtmArguments: True, ) - def ali_to_phone_ctm_arguments(self, aligner: AlignerType) -> AliToCtmArguments: + def ali_to_phone_ctm_arguments(self, aligner: Aligner) -> AliToCtmArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.ali_to_ctm_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - AliToCtmArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.AliToCtmArguments` Arguments for processing """ return AliToCtmArguments( @@ -1888,7 +1876,7 @@ def job_utts(self) -> Dict[str, Dict[str, Utterance]]: Returns ------- - Dict[str, Dict[str, Utterance]] + Dict[str, Dict[str, :class:`~montreal_forced_aligner.corpus.Utterance`]] Mapping of dictionary name to Utterance mappings """ data = {} @@ -1907,7 +1895,7 @@ def job_files(self) -> Dict[str, File]: Returns ------- - Dict[str, File] + Dict[str, :class:`~montreal_forced_aligner.corpus.File`] Mapping of file name to File objects """ data = {} @@ -1922,18 +1910,18 @@ def job_files(self) -> Dict[str, File]: data[f.name] = f return data - def 
cleanup_word_ctm_arguments(self, aligner: AlignerType) -> CleanupWordCtmArguments: + def cleanup_word_ctm_arguments(self, aligner: Aligner) -> CleanupWordCtmArguments: """ - Generate Job arguments for :class:`~montreal_forced_aligner.multiprocessing.alignment.CleanupWordCtmProcessWorker` + Generate Job arguments for :class:`~montreal_forced_aligner.multiprocessing.CleanupWordCtmProcessWorker` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - CleanupWordCtmArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.CleanupWordCtmArguments` Arguments for processing """ return CleanupWordCtmArguments( @@ -1943,18 +1931,18 @@ def cleanup_word_ctm_arguments(self, aligner: AlignerType) -> CleanupWordCtmArgu self.dictionary_data(), ) - def no_cleanup_word_ctm_arguments(self, aligner: AlignerType) -> NoCleanupWordCtmArguments: + def no_cleanup_word_ctm_arguments(self, aligner: Aligner) -> NoCleanupWordCtmArguments: """ - Generate Job arguments for :class:`~montreal_forced_aligner.multiprocessing.alignment.NoCleanupWordCtmProcessWorker` + Generate Job arguments for :class:`~montreal_forced_aligner.multiprocessing.NoCleanupWordCtmProcessWorker` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - NoCleanupWordCtmArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.NoCleanupWordCtmArguments` Arguments for processing """ return NoCleanupWordCtmArguments( @@ -1964,18 +1952,18 @@ def no_cleanup_word_ctm_arguments(self, aligner: AlignerType) -> NoCleanupWordCt self.dictionary_data(), ) - def phone_ctm_arguments(self, aligner: AlignerType) -> PhoneCtmArguments: + def phone_ctm_arguments(self, aligner: Aligner) -> PhoneCtmArguments: """ - Generate Job arguments for 
:class:`~montreal_forced_aligner.multiprocessing.alignment.PhoneCtmProcessWorker` + Generate Job arguments for :class:`~montreal_forced_aligner.multiprocessing.PhoneCtmProcessWorker` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - PhoneCtmArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.PhoneCtmArguments` Arguments for processing """ return PhoneCtmArguments( @@ -2000,18 +1988,18 @@ def dictionary_data(self) -> Dict[str, DictionaryData]: data[dictionary.name] = dictionary.data() return data - def combine_ctm_arguments(self, aligner: AlignerType) -> CombineCtmArguments: + def combine_ctm_arguments(self, aligner: Aligner) -> CombineCtmArguments: """ - Generate Job arguments for :class:`~montreal_forced_aligner.multiprocessing.alignment.CombineProcessWorker` + Generate Job arguments for :class:`~montreal_forced_aligner.multiprocessing.CombineProcessWorker` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - CombineCtmArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.CombineCtmArguments` Arguments for processing """ return CombineCtmArguments( @@ -2021,18 +2009,18 @@ def combine_ctm_arguments(self, aligner: AlignerType) -> CombineCtmArguments: aligner.align_config.cleanup_textgrids, ) - def export_textgrid_arguments(self, aligner: AlignerType) -> ExportTextGridArguments: + def export_textgrid_arguments(self, aligner: Aligner) -> ExportTextGridArguments: """ - Generate Job arguments for :class:`~montreal_forced_aligner.multiprocessing.alignment.ExportTextGridProcessWorker` + Generate Job arguments for :class:`~montreal_forced_aligner.multiprocessing.ExportTextGridProcessWorker` Parameters ---------- - aligner: 
:class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - ExportTextGridArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.ExportTextGridArguments` Arguments for processing """ return ExportTextGridArguments( @@ -2042,42 +2030,42 @@ def export_textgrid_arguments(self, aligner: AlignerType) -> ExportTextGridArgum aligner.backup_output_directory, ) - def tree_stats_arguments(self, aligner: AlignerType) -> TreeStatsArguments: + def tree_stats_arguments(self, aligner: BaseTrainer) -> TreeStatsArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.tree_stats_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: :class:`~montreal_forced_aligner.trainers.BaseTrainer` Aligner Returns ------- - TreeStatsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.TreeStatsArguments` Arguments for processing """ return TreeStatsArguments( os.path.join(aligner.working_log_directory, f"acc_tree.{self.name}.log"), self.current_dictionary_names, - aligner.dictionary.silence_csl, + aligner.dictionary.config.silence_csl, aligner.previous_trainer.alignment_model_path, self.construct_feature_proc_strings(aligner), self.construct_path_dictionary(aligner.previous_trainer.align_directory, "ali", "ark"), self.construct_path_dictionary(aligner.working_directory, "tree", "acc"), ) - def convert_alignment_arguments(self, aligner: AlignerType) -> ConvertAlignmentsArguments: + def convert_alignment_arguments(self, aligner: BaseTrainer) -> ConvertAlignmentsArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.convert_alignments_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + 
aligner: :class:`~montreal_forced_aligner.trainers.BaseTrainer` Aligner Returns ------- - ConvertAlignmentsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.ConvertAlignmentsArguments` Arguments for processing """ return ConvertAlignmentsArguments( @@ -2092,18 +2080,18 @@ def convert_alignment_arguments(self, aligner: AlignerType) -> ConvertAlignments self.construct_path_dictionary(aligner.working_directory, "ali", "ark"), ) - def calc_fmllr_arguments(self, aligner: AlignerType) -> CalcFmllrArguments: + def calc_fmllr_arguments(self, aligner: Aligner) -> CalcFmllrArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.calc_fmllr_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - CalcFmllrArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.CalcFmllrArguments` Arguments for processing """ return CalcFmllrArguments( @@ -2112,24 +2100,24 @@ def calc_fmllr_arguments(self, aligner: AlignerType) -> CalcFmllrArguments: self.construct_feature_proc_strings(aligner), self.construct_path_dictionary(aligner.working_directory, "ali", "ark"), aligner.alignment_model_path, - aligner.current_model_path, + aligner.model_path, self.construct_path_dictionary(aligner.data_directory, "spk2utt", "scp"), self.construct_path_dictionary(aligner.working_directory, "trans", "ark"), aligner.fmllr_options, ) - def acc_stats_two_feats_arguments(self, aligner: AlignerType) -> AccStatsTwoFeatsArguments: + def acc_stats_two_feats_arguments(self, aligner: SatTrainer) -> AccStatsTwoFeatsArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.acc_stats_two_feats_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: 
:class:`~montreal_forced_aligner.trainers.SatTrainer` Aligner Returns ------- - AccStatsTwoFeatsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.AccStatsTwoFeatsArguments` Arguments for processing """ return AccStatsTwoFeatsArguments( @@ -2148,12 +2136,12 @@ def lda_acc_stats_arguments(self, aligner: LdaTrainer) -> LdaAccStatsArguments: Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.LdaTrainer` + aligner: :class:`~montreal_forced_aligner.trainers.LdaTrainer` Aligner Returns ------- - LdaAccStatsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.LdaAccStatsArguments` Arguments for processing """ return LdaAccStatsArguments( @@ -2174,12 +2162,12 @@ def calc_lda_mllt_arguments(self, aligner: LdaTrainer) -> CalcLdaMlltArguments: Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.LdaTrainer` + aligner: :class:`~montreal_forced_aligner.trainers.LdaTrainer` Aligner Returns ------- - CalcLdaMlltArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.CalcLdaMlltArguments` Arguments for processing """ return CalcLdaMlltArguments( @@ -2200,12 +2188,12 @@ def ivector_acc_stats_arguments( Parameters ---------- - trainer: :class:`~montreal_forced_aligner.trainer.IvectorExtractorTrainer` + trainer: :class:`~montreal_forced_aligner.trainers.IvectorExtractorTrainer` Aligner Returns ------- - AccIvectorStatsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.AccIvectorStatsArguments` Arguments for processing """ return AccIvectorStatsArguments( @@ -2229,7 +2217,7 @@ def map_acc_stats_arguments(self, aligner: AdaptingAligner) -> MapAccStatsArgume Returns ------- - MapAccStatsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.MapAccStatsArguments` Arguments for processing """ return MapAccStatsArguments( @@ -2247,12 +2235,12 @@ def gmm_gselect_arguments(self, aligner: IvectorExtractorTrainer) -> GmmGselectA Parameters ---------- - aligner: 
:class:`~montreal_forced_aligner.trainer.IvectorExtractorTrainer` + aligner: :class:`~montreal_forced_aligner.trainers.IvectorExtractorTrainer` Aligner Returns ------- - GmmGselectArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.GmmGselectArguments` Arguments for processing """ return GmmGselectArguments( @@ -2272,12 +2260,12 @@ def acc_global_stats_arguments( Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.IvectorExtractorTrainer` + aligner: :class:`~montreal_forced_aligners.trainers.IvectorExtractorTrainer` Aligner Returns ------- - AccGlobalStatsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.AccGlobalStatsArguments` Arguments for processing """ return AccGlobalStatsArguments( @@ -2301,12 +2289,12 @@ def gauss_to_post_arguments(self, aligner: IvectorExtractorTrainer) -> GaussToPo Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.IvectorExtractorTrainer` + aligner: :class:`~montreal_forced_aligner.trainers.IvectorExtractorTrainer` Aligner Returns ------- - GaussToPostArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.GaussToPostArguments` Arguments for processing """ return GaussToPostArguments( @@ -2318,18 +2306,18 @@ def gauss_to_post_arguments(self, aligner: IvectorExtractorTrainer) -> GaussToPo aligner.current_dubm_path, ) - def compile_train_graph_arguments(self, aligner: AlignerType) -> CompileTrainGraphsArguments: + def compile_train_graph_arguments(self, aligner: Aligner) -> CompileTrainGraphsArguments: """ Generate Job arguments for :func:`~montreal_forced_aligner.multiprocessing.alignment.compile_train_graphs_func` Parameters ---------- - aligner: :class:`~montreal_forced_aligner.trainer.BaseTrainer` or :class:`~montreal_forced_aligner.aligner.BaseAligner` + aligner: Aligner Aligner Returns ------- - CompileTrainGraphsArguments + :class:`~montreal_forced_aligner.multiprocessing.classes.CompileTrainGraphsArguments` Arguments for processing """ 
dictionary_paths = aligner.dictionary.output_paths @@ -2338,7 +2326,7 @@ def compile_train_graph_arguments(self, aligner: AlignerType) -> CompileTrainGra for k, v in dictionary_paths.items() } lexicon_fst_paths = {k: os.path.join(v, "L.fst") for k, v in dictionary_paths.items()} - model_path = aligner.current_model_path + model_path = aligner.model_path if not os.path.exists(model_path): model_path = aligner.alignment_model_path return CompileTrainGraphsArguments( @@ -2360,7 +2348,7 @@ def utt2fst_scp_data( Parameters ---------- - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to generate data for num_frequent_words: int Number of frequent words to include in the unigram language model @@ -2378,7 +2366,7 @@ def utt2fst_scp_data( new_text = [] dictionary = utterance.speaker.dictionary if dictionary.name not in most_frequent: - word_frequencies = corpus.get_word_frequency(dictionary) + word_frequencies = corpus.get_word_frequency() most_frequent[dictionary.name] = sorted( word_frequencies.items(), key=lambda x: -x[1] )[:num_frequent_words] @@ -2402,7 +2390,7 @@ def output_utt_fsts(self, corpus: Corpus, num_frequent_words: int = 10) -> None: Parameters ---------- - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to generate FSTs for num_frequent_words: int Number of frequent words diff --git a/montreal_forced_aligner/multiprocessing/corpus.py b/montreal_forced_aligner/multiprocessing/corpus.py index 182313db..b3907088 100644 --- a/montreal_forced_aligner/multiprocessing/corpus.py +++ b/montreal_forced_aligner/multiprocessing/corpus.py @@ -1,4 +1,9 @@ -"""Multiprocessing functions for loading corpora""" +""" +Corpus loading worker +--------------------- + + +""" from __future__ import annotations import multiprocessing as mp @@ -7,7 +12,6 @@ from queue import Empty from typing import TYPE_CHECKING, Dict, Union 
-from ..corpus.classes import parse_file from ..exceptions import TextGridParseError, TextParseError if TYPE_CHECKING: @@ -35,9 +39,9 @@ class CorpusProcessWorker(mp.Process): Dictionary to catch errors return_q: :class:`~multiprocessing.Queue` Return queue for processed Files - stopped: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + stopped: :func:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Stop check for whether corpus loading should exit - finished_adding: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + finished_adding: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Signal that the main thread has stopped adding new files to be processed """ @@ -60,6 +64,8 @@ def run(self) -> None: """ Run the corpus loading job """ + from ..corpus.classes import parse_file + while True: try: arguments = self.job_q.get(timeout=1) diff --git a/montreal_forced_aligner/multiprocessing/features.py b/montreal_forced_aligner/multiprocessing/features.py index 416cf6a8..a1f35209 100644 --- a/montreal_forced_aligner/multiprocessing/features.py +++ b/montreal_forced_aligner/multiprocessing/features.py @@ -1,4 +1,8 @@ -"""Multiprocessing functions for generating features""" +""" +Feature generation functions +---------------------------- + +""" from __future__ import annotations import os @@ -12,10 +16,10 @@ if TYPE_CHECKING: SpeakerCharacterType = Union[str, int] + from ..abc import MetaDict from ..corpus import Corpus - from ..corpus.classes import ConfigDict -__all__ = ["mfcc", "compute_vad", "calc_cmvn"] +__all__ = ["mfcc", "compute_vad", "calc_cmvn", "mfcc_func", "compute_vad_func"] def mfcc_func( @@ -25,7 +29,7 @@ def mfcc_func( lengths_paths: Dict[str, str], segment_paths: Dict[str, str], wav_paths: Dict[str, str], - mfcc_options: ConfigDict, + mfcc_options: MetaDict, ) -> None: """ Multiprocessing function for generating MFCC features @@ -44,7 +48,7 @@ def mfcc_func( Dictionary of segment scp files per dictionary name 
wav_paths: Dict[str, str] Dictionary of sound file scp files per dictionary name - mfcc_options: ConfigDict + mfcc_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for MFCC generation """ with open(log_path, "w") as log_file: @@ -116,7 +120,7 @@ def mfcc(corpus: Corpus) -> None: Parameters ---------- - corpus : :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to generate MFCC features for """ log_directory = os.path.join(corpus.split_directory, "log") @@ -135,7 +139,7 @@ def calc_cmvn(corpus: Corpus) -> None: Parameters ---------- - corpus : :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to run CMVN calculation """ spk2utt = os.path.join(corpus.output_directory, "spk2utt.scp") @@ -167,7 +171,7 @@ def compute_vad_func( dictionaries: List[str], feats_scp_paths: Dict[str, str], vad_scp_paths: Dict[str, str], - vad_options: ConfigDict, + vad_options: MetaDict, ) -> None: """ Multiprocessing function to compute voice activity detection @@ -179,10 +183,10 @@ def compute_vad_func( dictionaries: List[str] List of dictionary names feats_scp_paths: Dict[str, str] - Dictionary of feature scp files per dictionary name + PronunciationDictionary of feature scp files per dictionary name vad_scp_paths: Dict[str, str] - Dictionary of vad scp files per dictionary name - vad_options: ConfigDict + PronunciationDictionary of vad scp files per dictionary name + vad_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for VAD """ with open(log_path, "w") as log_file: @@ -209,7 +213,7 @@ def compute_vad(corpus: Corpus) -> None: Parameters ---------- - corpus : :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to compute VAD """ log_directory = os.path.join(corpus.split_directory, "log") diff --git a/montreal_forced_aligner/multiprocessing/helper.py 
b/montreal_forced_aligner/multiprocessing/helper.py index 4b800468..bfb32979 100644 --- a/montreal_forced_aligner/multiprocessing/helper.py +++ b/montreal_forced_aligner/multiprocessing/helper.py @@ -1,4 +1,8 @@ -"""Helper classes and functions for multiprocessing""" +""" +Multiprocessing helpers +----------------------- + +""" from __future__ import annotations import multiprocessing as mp @@ -19,9 +23,9 @@ class Counter(object): Attributes ---------- - val: multiprocessing.Value + val: :func:`~multiprocessing.Value` Integer to increment - lock: multiprocessing.Lock + lock: :class:`~multiprocessing.Lock` Lock for process safety """ @@ -46,9 +50,9 @@ class Stopped(object): Attributes ---------- - val: multiprocessing.Value + val: :func:`~multiprocessing.Value` 0 if not stopped, 1 if stopped - lock: multiprocessing.Lock + lock: :class:`~multiprocessing.Lock` Lock for process safety _source: multiprocessing.Value 1 if it was a Ctrl+C event that stopped it, 0 otherwise @@ -94,7 +98,7 @@ class ProcessWorker(mp.Process): Multiprocessing function to call on arguments from job_q return_dict: Dict Dictionary for collecting errors - stopped: :class:`~montreal_forced_aligner.multiprocess.helper.Stopped` + stopped: :class:`~montreal_forced_aligner.multiprocessing.helper.Stopped` Stop check return_info: Dict[int, Any], optional Optional dictionary to fill if the function should return information to main thread diff --git a/montreal_forced_aligner/multiprocessing/ivector.py b/montreal_forced_aligner/multiprocessing/ivector.py index 83b815de..7051fde6 100644 --- a/montreal_forced_aligner/multiprocessing/ivector.py +++ b/montreal_forced_aligner/multiprocessing/ivector.py @@ -1,19 +1,24 @@ -"""Multiprocessing functions for ivector extraction and training""" +""" +Ivector extractor functions +--------------------------- + + +""" from __future__ import annotations import os import subprocess from typing import TYPE_CHECKING, Dict, List, Union +from ..abc import MetaDict from 
..helper import load_scp from ..utils import thirdparty_binary from .helper import run_mp, run_non_mp if TYPE_CHECKING: - from ..config import ConfigDict + from ..abc import IvectorExtractor from ..corpus.classes import File, Speaker, Utterance # noqa from ..segmenter import SegmentationType, Segmenter - from ..speaker_classifier import SpeakerClassifier from ..trainers.ivector_extractor import IvectorExtractorTrainer @@ -26,6 +31,12 @@ "get_initial_segmentation", "merge_segments", "segment_vad", + "segment_vad_func", + "gmm_gselect_func", + "gauss_to_post_func", + "acc_global_stats_func", + "acc_ivector_stats_func", + "extract_ivectors_func", ] @@ -33,7 +44,7 @@ def gmm_gselect_func( log_path: str, dictionaries: List[str], feature_strings: Dict[str, str], - ivector_options: ConfigDict, + ivector_options: MetaDict, dubm_path: str, gselect_paths: Dict[str, str], ) -> None: @@ -48,7 +59,7 @@ def gmm_gselect_func( List of dictionary names feature_strings: Dict[str, str] Dictionary of feature strings per dictionary name - ivector_options: ConfigDict + ivector_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for ivector extractor training dubm_path: str Path to the DUBM file @@ -102,7 +113,7 @@ def gmm_gselect(trainer: IvectorExtractorTrainer) -> None: Parameters ---------- - trainer : :class:`~montreal_forced_aligner.trainers.ivector.IvectorExtractorTrainer` + trainer : :class:`~montreal_forced_aligner.trainers.IvectorExtractorTrainer` Ivector Extractor Trainer """ jobs = [x.gmm_gselect_arguments(trainer) for x in trainer.corpus.jobs] @@ -116,7 +127,7 @@ def acc_global_stats_func( log_path: str, dictionaries: List[str], feature_strings: Dict[str, str], - ivector_options: ConfigDict, + ivector_options: MetaDict, gselect_paths: Dict[str, str], acc_paths: Dict[str, str], dubm_path: str, @@ -132,7 +143,7 @@ def acc_global_stats_func( List of dictionary names feature_strings: Dict[str, str] Dictionary of feature strings per dictionary name - ivector_options: 
ConfigDict + ivector_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for ivector extractor training gselect_paths: Dict[str, str] Dictionary of gselect archives per dictionary name @@ -188,7 +199,7 @@ def acc_global_stats(trainer: IvectorExtractorTrainer) -> None: Parameters ---------- - trainer : :class:`~montreal_forced_aligner.trainers.ivector.IvectorExtractorTrainer` + trainer : :class:`~montreal_forced_aligner.trainers.IvectorExtractorTrainer` Ivector Extractor Trainer """ jobs = [x.acc_global_stats_arguments(trainer) for x in trainer.corpus.jobs] @@ -238,7 +249,7 @@ def gauss_to_post_func( log_path: str, dictionaries: List[str], feature_strings: Dict[str, str], - ivector_options: ConfigDict, + ivector_options: MetaDict, post_paths: Dict[str, str], dubm_path: str, ): @@ -253,7 +264,7 @@ def gauss_to_post_func( List of dictionary names feature_strings: Dict[str, str] Dictionary of feature strings per dictionary name - ivector_options: ConfigDict + ivector_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for ivector extractor training post_paths: Dict[str, str] Dictionary of posterior archives per dictionary name @@ -321,7 +332,7 @@ def gauss_to_post(trainer: IvectorExtractorTrainer) -> None: Parameters ---------- - trainer: :class:`~montreal_forced_aligner.trainers.ivector.IvectorExtractorTrainer` + trainer: :class:`~montreal_forced_aligner.trainers.IvectorExtractorTrainer` Ivector Extractor Trainer """ jobs = [x.gauss_to_post_arguments(trainer) for x in trainer.corpus.jobs] @@ -335,7 +346,7 @@ def acc_ivector_stats_func( log_path: str, dictionaries: List[str], feature_strings: Dict[str, str], - ivector_options: ConfigDict, + ivector_options: MetaDict, ie_path: str, post_paths: Dict[str, str], acc_init_paths: Dict[str, str], @@ -350,15 +361,15 @@ def acc_ivector_stats_func( dictionaries: List[str] List of dictionary names feature_strings: Dict[str, str] - Dictionary of feature strings per dictionary name - ivector_options: 
ConfigDict + Dictionary of feature strings per dictionary name + ivector_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for ivector extractor training ie_path: str Path to the ivector extractor file post_paths: Dict[str, str] - Dictionary of posterior archives per dictionary name + Dictionary of posterior archives per dictionary name acc_init_paths: Dict[str, str] - Dictionary of accumulated stats files per dictionary name + Dictionary of accumulated stats files per dictionary name """ with open(log_path, "w", encoding="utf8") as log_file: for dict_name in dictionaries: @@ -409,7 +420,7 @@ def acc_ivector_stats(trainer: IvectorExtractorTrainer) -> None: Parameters ---------- - trainer: :class:`~montreal_forced_aligner.trainers.ivector.IvectorExtractorTrainer` + trainer: :class:`~montreal_forced_aligner.trainers.IvectorExtractorTrainer` Ivector Extractor Trainer """ @@ -459,7 +470,7 @@ def extract_ivectors_func( log_path: str, dictionaries: List[str], feature_strings: Dict[str, str], - ivector_options: ConfigDict, + ivector_options: MetaDict, ali_paths: Dict[str, str], ie_path: str, ivector_paths: Dict[str, str], @@ -478,7 +489,7 @@ def extract_ivectors_func( List of dictionary names feature_strings: Dict[str, str] Dictionary of feature strings per dictionary name - ivector_options: ConfigDict + ivector_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for ivector extraction ali_paths: Dict[str, str] Dictionary of alignment archives per dictionary name @@ -576,7 +587,7 @@ def extract_ivectors_func( extract_proc.communicate() -def extract_ivectors(ivector_extractor: Union[SpeakerClassifier, IvectorExtractorTrainer]) -> None: +def extract_ivectors(ivector_extractor: IvectorExtractor) -> None: """ Multiprocessing function that extracts job_name-vectors.
@@ -593,7 +604,7 @@ def extract_ivectors(ivector_extractor: Union[SpeakerClassifier, IvectorExtracto Parameters ---------- - ivector_extractor: :class:`~montreal_forced_aligner.speaker_classifier.SpeakerClassifier` or :class:`~montreal_forced_aligner.trainers.ivector.IvectorExtractorTrainer` + ivector_extractor: IvectorExtractor Ivector extractor """ @@ -695,7 +706,7 @@ def merge_segments( def segment_vad_func( dictionaries: List[str], vad_paths: Dict[str, str], - segmentation_options: ConfigDict, + segmentation_options: MetaDict, ) -> Dict[str, Utterance]: """ Multiprocessing function to generate segments from VAD output @@ -708,7 +719,7 @@ def segment_vad_func( List of dictionary names vad_paths: Dict[str, str] Dictionary of VAD archives per dictionary name - segmentation_options: ConfigDict + segmentation_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for segmentation """ diff --git a/montreal_forced_aligner/multiprocessing/pronunciations.py b/montreal_forced_aligner/multiprocessing/pronunciations.py index 219d5781..f84fb334 100644 --- a/montreal_forced_aligner/multiprocessing/pronunciations.py +++ b/montreal_forced_aligner/multiprocessing/pronunciations.py @@ -1,19 +1,20 @@ -"""Multiprocessing functions for generating pronunciation probabilities""" +""" +Pronunciation probability functions +----------------------------------- + +""" from __future__ import annotations import os import subprocess from collections import Counter, defaultdict -from typing import TYPE_CHECKING, Dict, List, Tuple +from typing import Dict, List, Tuple +from ..abc import Aligner from ..utils import thirdparty_binary from .helper import run_mp, run_non_mp -if TYPE_CHECKING: - from .alignment import AlignerType - - -__all__ = ["generate_pronunciations"] +__all__ = ["generate_pronunciations", "generate_pronunciations_func"] def generate_pronunciations_func( @@ -89,7 +90,7 @@ def generate_pronunciations_func( def generate_pronunciations( - aligner: AlignerType, + aligner: 
Aligner, ) -> Tuple[Dict[str, defaultdict[Counter]], Dict[str, Dict[str, List[str, ...]]]]: """ Generates pronunciations based on alignments for a corpus and calculates pronunciation probabilities diff --git a/montreal_forced_aligner/multiprocessing/transcription.py b/montreal_forced_aligner/multiprocessing/transcription.py index b616560b..76754775 100644 --- a/montreal_forced_aligner/multiprocessing/transcription.py +++ b/montreal_forced_aligner/multiprocessing/transcription.py @@ -1,4 +1,8 @@ -"""Multiprocessing functions for transcription in MFA""" +""" +Transcription functions +----------------------- + +""" from __future__ import annotations import os @@ -8,12 +12,12 @@ import sys from typing import TYPE_CHECKING, Dict, List, Optional, TextIO, Union +from ..abc import MetaDict from ..exceptions import KaldiProcessingError from ..utils import thirdparty_binary from .helper import run_mp, run_non_mp if TYPE_CHECKING: - from ..config.transcribe_config import ConfigDict from ..dictionary import MappingType from ..transcriber import Transcriber @@ -29,6 +33,15 @@ "transcribe", "transcribe_fmllr", "score_transcriptions", + "fmllr_rescore_func", + "final_fmllr_est_func", + "initial_fmllr_func", + "lat_gen_fmllr_func", + "score_func", + "lm_rescore_func", + "carpa_lm_rescore_func", + "decode_func", + "create_hclg_func", ] @@ -151,6 +164,7 @@ def compose_hclg( log_file: TextIO, ) -> None: """ + Compose HCLG.fst for a dictionary Parameters ---------- @@ -166,10 +180,6 @@ def compose_hclg( Path to save HCLGa.fst file log_file: TextIO Log file handler to output logging info to - - Returns - ------- - """ model_path = os.path.join(model_directory, "final.mdl") tree_path = os.path.join(model_directory, "tree") @@ -355,7 +365,7 @@ def create_hclg_func( model_path: str, disambig_L_path: str, disambig_int_path: str, - hclg_options: ConfigDict, + hclg_options: MetaDict, words_mapping: MappingType, ): """ @@ -385,7 +395,7 @@ def create_hclg_func( Path to L_disambig.fst file
disambig_int_path: Path to dictionary's disambiguation symbols file - hclg_options: ConfigDict + hclg_options: :class:`~montreal_forced_aligner.abc.MetaDict` Configuration options for composing HCLG.fst words_mapping: Dict[str, int] Word labels to integer ID mapping @@ -479,7 +489,7 @@ def create_hclg_func( def create_hclgs(transcriber: Transcriber): """ - Create HCLG.fst files for every dictionary being used by a Transcriber + Create HCLG.fst files for every dictionary being used by a :class:`~montreal_forced_aligner.transcriber.Transcriber` Parameters ---------- @@ -511,7 +521,7 @@ def decode_func( log_path: str, dictionaries: List[str], feature_strings: Dict[str, str], - decode_options: ConfigDict, + decode_options: MetaDict, model_path: str, lat_paths: Dict[str, str], word_symbol_paths: Dict[str, str], @@ -528,7 +538,7 @@ def decode_func( List of dictionary names feature_strings: Dict[str, str] Dictionary of feature strings per dictionary name - decode_options: ConfigDict + decode_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for decoding model_path: str Path to acoustic model file @@ -582,7 +592,7 @@ def decode_func( def score_func( log_path: str, dictionaries: List[str], - score_options: ConfigDict, + score_options: MetaDict, lat_paths: Dict[str, str], rescored_lat_paths: Dict[str, str], carpa_rescored_lat_paths: Dict[str, str], @@ -598,7 +608,7 @@ def score_func( Path to save log output dictionaries: List[str] List of dictionary names - score_options: ConfigDict + score_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for scoring lat_paths: Dict[str, str] Dictionary of lattice archive paths per dictionary name @@ -664,7 +674,7 @@ def score_func( def lm_rescore_func( log_path: str, dictionaries: List[str], - lm_rescore_options: ConfigDict, + lm_rescore_options: MetaDict, lat_paths: Dict[str, str], rescored_lat_paths: Dict[str, str], old_g_paths: Dict[str, str], @@ -679,7 +689,7 @@ def lm_rescore_func( Path to save log output 
dictionaries: List[str] List of dictionary names - lm_rescore_options: ConfigDict + lm_rescore_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for rescoring lat_paths: Dict[str, str] Dictionary of lattice archive paths per dictionary name @@ -736,13 +746,13 @@ def carpa_lm_rescore_func( dictionaries: List[str] List of dictionary names lat_paths: Dict[str, str] - Dictionary of lattice archive paths per dictionary name + Dictionary of lattice archive paths per dictionary name rescored_lat_paths: Dict[str, str] - Dictionary of rescored lattice archive paths per dictionary name + Dictionary of rescored lattice archive paths per dictionary name old_g_paths: Dict[str, str] - Dictionary of medium G.fst paths per dictionary name + Dictionary of medium G.fst paths per dictionary name new_g_paths: Dict[str, str] - Dictionary of large G.carpa paths per dictionary name + Dictionary of large G.carpa paths per dictionary name """ with open(log_path, "w", encoding="utf8") as log_file: for dict_name in dictionaries: @@ -873,7 +883,7 @@ def initial_fmllr_func( dictionaries: List[str], feature_strings: Dict[str, str], model_path: str, - fmllr_options: ConfigDict, + fmllr_options: MetaDict, trans_paths: Dict[str, str], lat_paths: Dict[str, str], spk2utt_paths: Dict[str, str], @@ -891,7 +901,7 @@ def initial_fmllr_func( Dictionary of feature strings per dictionary name model_path: str Path to acoustic model file - fmllr_options: ConfigDict + fmllr_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for calculating fMLLR transforms trans_paths: Dict[str, str] Dictionary of transform archives per dictionary name @@ -968,7 +978,7 @@ def lat_gen_fmllr_func( dictionaries: List[str], feature_strings: Dict[str, str], model_path: str, - decode_options: ConfigDict, + decode_options: MetaDict, word_symbol_paths: Dict[str, str], hclg_paths: Dict[str, str], tmp_lat_paths: Dict[str, str], @@ -986,7 +996,8 @@ def
lat_gen_fmllr_func( Dictionary of feature strings per dictionary name model_path: str Path to acoustic model file - decode_options + decode_options: :class:`~montreal_forced_aligner.abc.MetaDict` + Options for decoding word_symbol_paths: Dict[str, str] Dictionary of word symbol paths per dictionary name hclg_paths: Dict[str, str] @@ -1027,7 +1038,7 @@ def final_fmllr_est_func( dictionaries: List[str], feature_strings: Dict[str, str], model_path: str, - fmllr_options: ConfigDict, + fmllr_options: MetaDict, trans_paths: Dict[str, str], spk2utt_paths: Dict[str, str], tmp_lat_paths: Dict[str, str], @@ -1045,7 +1056,7 @@ def final_fmllr_est_func( Dictionary of feature strings per dictionary name model_path: str Path to acoustic model file - fmllr_options: ConfigDict + fmllr_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for calculating fMLLR transforms trans_paths: Dict[str, str] Dictionary of transform archives per dictionary name @@ -1138,7 +1149,7 @@ def fmllr_rescore_func( dictionaries: List[str], feature_strings: Dict[str, str], model_path: str, - fmllr_options: ConfigDict, + fmllr_options: MetaDict, tmp_lat_paths: Dict[str, str], final_lat_paths: Dict[str, str], ) -> None: @@ -1155,7 +1166,7 @@ def fmllr_rescore_func( Dictionary of feature strings per dictionary name model_path: str Path to acoustic model file - fmllr_options: ConfigDict + fmllr_options: :class:`~montreal_forced_aligner.abc.MetaDict` Options for calculating fMLLR transforms tmp_lat_paths: Dict[str, str] Dictionary of temporary lattice archive paths per dictionary name diff --git a/montreal_forced_aligner/segmenter.py b/montreal_forced_aligner/segmenter.py index 7fbf12ec..66be4159 100644 --- a/montreal_forced_aligner/segmenter.py +++ b/montreal_forced_aligner/segmenter.py @@ -1,10 +1,19 @@ -"""Class definitions for Segmentation based on voice activity in MFA""" +""" +Segmenting files +================ + +.. 
autosummary:: + + :toctree: generated/ + +""" from __future__ import annotations import os import shutil from typing import TYPE_CHECKING, Dict, List, Optional +from .abc import MetaDict from .config import TEMP_DIR from .exceptions import KaldiProcessingError from .multiprocessing.ivector import segment_vad @@ -13,7 +22,7 @@ if TYPE_CHECKING: from logging import Logger - from .config import ConfigDict, SegmentationConfig + from .config import SegmentationConfig from .corpus import Corpus SegmentationType = List[Dict[str, float]] @@ -27,7 +36,7 @@ class Segmenter: Parameters ---------- - corpus : :class:`~montreal_forced_aligner.corpus.TranscribeCorpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus object for the dataset segmentation_config : :class:`~montreal_forced_aligner.config.SegmentationConfig` Configuration for alignment @@ -72,7 +81,7 @@ def segmenter_directory(self) -> str: return os.path.join(self.temp_directory, "segmentation") @property - def vad_options(self) -> ConfigDict: + def vad_options(self) -> MetaDict: """Options for performing VAD""" return { "energy_threshold": self.segmentation_config.energy_threshold, diff --git a/montreal_forced_aligner/speaker_classifier.py b/montreal_forced_aligner/speaker_classifier.py index 5bf03632..c2ba62e2 100644 --- a/montreal_forced_aligner/speaker_classifier.py +++ b/montreal_forced_aligner/speaker_classifier.py @@ -1,4 +1,9 @@ -"""Class definitions for Speaker classification in MFA""" +""" +Speaker classification +====================== + + +""" from __future__ import annotations import logging @@ -8,6 +13,7 @@ import numpy as np +from .abc import MetaDict from .config import TEMP_DIR from .corpus.classes import Speaker from .exceptions import KaldiProcessingError @@ -17,8 +23,8 @@ if TYPE_CHECKING: from .config import SpeakerClassificationConfig - from .corpus import TranscribeCorpus - from .models import IvectorExtractor, MetaDict + from .corpus import Corpus + from .models import 
IvectorExtractorModel __all__ = ["SpeakerClassifier"] @@ -29,11 +35,11 @@ class SpeakerClassifier: Parameters ---------- - corpus : :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus object for the dataset - ivector_extractor : :class:`~montreal_forced_aligner.models.IvectorExtractor` + ivector_extractor : :class:`~montreal_forced_aligner.models.IvectorExtractorModel` Configuration for alignment - classification_config : :class:`~montreal_forced_aligner.config.speaker_classification_config.SpeakerClassificationConfig` + classification_config : :class:`~montreal_forced_aligner.config.SpeakerClassificationConfig` Configuration for alignment compute_segments: bool, optional Flag for whether segments should be created @@ -54,8 +60,8 @@ class SpeakerClassifier: def __init__( self, - corpus: TranscribeCorpus, - ivector_extractor: IvectorExtractor, + corpus: Corpus, + ivector_extractor: IvectorExtractorModel, classification_config: SpeakerClassificationConfig, compute_segments: Optional[bool] = False, num_speakers: Optional[int] = None, @@ -161,6 +167,12 @@ def use_mp(self) -> bool: """Flag for whether to use multiprocessing""" return self.classification_config.use_mp + def extract_ivectors(self) -> None: + """ + Extract ivectors for the corpus + """ + extract_ivectors(self) + def setup(self) -> None: """ Sets up the corpus and speaker classifier @@ -182,7 +194,7 @@ def setup(self) -> None: self.ivector_extractor.export_model(self.classify_directory) try: self.corpus.initialize_corpus(None, self.feature_config) - extract_ivectors(self) + self.extract_ivectors() except Exception as e: with open(dirty_path, "w"): pass diff --git a/montreal_forced_aligner/textgrid.py b/montreal_forced_aligner/textgrid.py index dd9de563..506cf0c2 100644 --- a/montreal_forced_aligner/textgrid.py +++ b/montreal_forced_aligner/textgrid.py @@ -1,35 +1,27 @@ -"""Classes and functions for working with TextGrids in MFA""" +""" 
+Textgrid utilities +================== + +""" from __future__ import annotations import os -import re import sys import traceback -from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union +from typing import TYPE_CHECKING, Dict, List, Optional from praatio import textgrid as tgio -from praatio.utilities.textgrid_io import Interval + +from .abc import Aligner +from .data import CtmInterval if TYPE_CHECKING: - from .aligner.base import BaseAligner from .corpus.classes import DictionaryData, File, Speaker - from .dictionary import ( - DictionaryEntryType, - IpaType, - MappingType, - PunctuationType, - ReversedMappingType, - ) + from .dictionary import ReversedMappingType from .multiprocessing.alignment import CtmType - from .trainers.base import BaseTrainer - - AlignerType = Union[BaseTrainer, BaseAligner] __all__ = [ - "CtmInterval", "process_ctm_line", - "map_to_original_pronunciation", "parse_from_word", "parse_from_phone", "parse_from_word_no_cleanup", @@ -37,59 +29,10 @@ "export_textgrid", "ctm_to_textgrid", "output_textgrid_writing_errors", - "to_int", "ctms_to_textgrids_non_mp", - "split_clitics", ] -@dataclass -class CtmInterval: - """ - Data class for intervals derived from CTM files - - Attributes - ---------- - begin: float - Start time of interval - end: float - End time of interval - label: str - Text of interval - utterance: str - Utterance ID that the interval belongs to - """ - - begin: float - end: float - label: str - utterance: str - - def shift_times(self, offset: float): - """ - Shift times of the interval based on some offset (i.e., segments in Kaldi) - - Parameters - ---------- - offset: float - Offset to add to the interval's begin and end - - """ - self.begin += offset - self.end += offset - - def to_tg_interval(self) -> Interval: - """ - Converts the CTMInterval to PraatIO's Interval class - - Returns - ------- - Interval - Derived PraatIO Interval - """ - return Interval(self.begin, self.end, 
self.label) - - def process_ctm_line(line: str) -> CtmInterval: """ Helper function for parsing a line of CTM file to construct a CTMInterval @@ -101,7 +44,7 @@ def process_ctm_line(line: str) -> CtmInterval: Returns ------- - CtmInterval + :class:`~montreal_forced_aligner.data.CtmInterval` Extracted data from the line """ line = line.split(" ") @@ -113,143 +56,6 @@ def process_ctm_line(line: str) -> CtmInterval: return CtmInterval(begin, end, label, utt) -def split_clitics( - item: str, - words_mapping: MappingType, - clitic_set: Set[str], - clitic_markers: PunctuationType, - compound_markers: PunctuationType, -) -> List[str]: - """ - Split a word into subwords based on dictionary information - - Parameters - ---------- - item: str - Word to split - words_mapping: Dict[str, int] - A word mapping from a Dictionary object - clitic_set: Set[str] - Set of acceptable clitics from the dictionary - clitic_markers: str - Clitic markers - compound_markers: str - Compound markers - - Returns - ------- - List[str] - List of subwords - """ - if item in words_mapping: - return [item] - if any(x in item for x in compound_markers): - s = re.split(rf"[{compound_markers}]", item) - if any(x in item for x in clitic_markers): - new_s = [] - for seg in s: - if any(x in seg for x in clitic_markers): - new_s.extend( - split_clitics( - seg, words_mapping, clitic_set, clitic_markers, compound_markers - ) - ) - else: - new_s.append(seg) - s = new_s - return s - if any(x in item and not item.endswith(x) and not item.startswith(x) for x in clitic_markers): - initial, final = re.split(rf"[{clitic_markers}]", item, maxsplit=1) - if any(x in final for x in clitic_markers): - final = split_clitics( - final, words_mapping, clitic_set, clitic_markers, compound_markers - ) - else: - final = [final] - for clitic in clitic_markers: - if initial + clitic in clitic_set: - return [initial + clitic] + final - elif clitic + final[0] in clitic_set: - final[0] = clitic + final[0] - return [initial] + final 
- return [item] - - -def _lookup( - item: str, - words_mapping: MappingType, - punctuation: PunctuationType, - clitic_set: Set[str], - clitic_markers: PunctuationType, - compound_markers: PunctuationType, -) -> List[str]: - """ - Look up a word and return the list of sub words if necessary - taking into account clitic and compound markers - - Parameters - ---------- - item: str - Word to look up - - Returns - ------- - List[str] - List of subwords that are in the dictionary - """ - from montreal_forced_aligner.dictionary import sanitize - - if item in words_mapping: - return [item] - sanitized = sanitize(item, punctuation, clitic_markers) - if sanitized in words_mapping: - return [sanitized] - split = split_clitics(sanitized, words_mapping, clitic_set, clitic_markers, compound_markers) - oov_count = sum(1 for x in split if x not in words_mapping) - - if oov_count < len(split): # Only returned split item if it gains us any transcribed speech - return split - return [sanitized] - - -def to_int( - item: str, - words_mapping: MappingType, - punctuation: PunctuationType, - clitic_set: Set[str], - clitic_markers: PunctuationType, - compound_markers: PunctuationType, - oov_int: int, -) -> List[int]: - """ - Convert a given word into integer IDs - - Parameters - ---------- - item: str - Word to look up - - Returns - ------- - List[int] - List of integer IDs corresponding to each subword - """ - if item == "": - return [] - sanitized = _lookup( - item, words_mapping, punctuation, clitic_set, clitic_markers, compound_markers - ) - text_int = [] - for item in sanitized: - if not item: - continue - if item not in words_mapping: - text_int.append(oov_int) - else: - text_int.append(words_mapping[item]) - return text_int - - def parse_from_word( ctm_labels: List[CtmInterval], text: List[str], dictionary_data: DictionaryData ) -> List[CtmInterval]: @@ -258,7 +64,7 @@ def parse_from_word( Parameters ---------- - ctm_labels: List[CtmInterval] + ctm_labels: 
List[:class:`~montreal_forced_aligner.data.CtmInterval`] CTM intervals text: List[str] The original text that was to be aligned @@ -267,22 +73,14 @@ def parse_from_word( Returns ------- - List[CtmInterval] + List[:class:`~montreal_forced_aligner.data.CtmInterval`] Correct intervals with subwords merged back into their original text """ cur_ind = 0 actual_labels = [] utterance = None - words_mapping = dictionary_data.words_mapping - punctuation = dictionary_data.punctuation - clitic_set = dictionary_data.clitic_set - clitic_markers = dictionary_data.clitic_markers - compound_markers = dictionary_data.compound_markers - oov_int = dictionary_data.oov_int for word in text: - ints = to_int( - word, words_mapping, punctuation, clitic_set, clitic_markers, compound_markers, oov_int - ) + ints = dictionary_data.to_int(word) b = 1000000 e = -1 for i in ints: @@ -309,14 +107,14 @@ def parse_from_word_no_cleanup( Parameters ---------- - ctm_labels: List[CtmInterval] - List of CtmIntervals to convert + ctm_labels: List[:class:`~montreal_forced_aligner.data.CtmInterval`] + List of :class:`~montreal_forced_aligner.data.CtmInterval` to convert reversed_word_mapping: Dict[int, str] Look up for Kaldi word IDs to convert them back to text Returns ------- - List[CtmInterval] + List[:class:`~montreal_forced_aligner.data.CtmInterval`] Parsed intervals with text rather than integer IDs """ for ctm_interval in ctm_labels: @@ -335,8 +133,8 @@ def parse_from_phone( Parameters ---------- - ctm_labels: List[CtmInterval] - CtmIntervals to convert + ctm_labels: List[:class:`~montreal_forced_aligner.data.CtmInterval`] + List of :class:`~montreal_forced_aligner.data.CtmInterval` to convert reversed_phone_mapping: Dict[int, str] Mapping to convert phone IDs to phone labels positions: List[str] @@ -344,7 +142,7 @@ def parse_from_phone( Returns ------- - List[CtmInterval] + List[:class:`~montreal_forced_aligner.data.CtmInterval`] Parsed intervals with phone labels rather than IDs """ for 
ctm_interval in ctm_labels: @@ -356,96 +154,13 @@ def parse_from_phone( return ctm_labels -def map_to_original_pronunciation( - phones: CtmType, subpronunciations: List[DictionaryEntryType], strip_diacritics: IpaType -) -> CtmType: - """ - Convert phone transcriptions from multilingual IPA mode to their original IPA transcription - - Parameters - ---------- - phones: List[CtmInterval] - List of aligned phones - subpronunciations: List[DictionaryEntryType] - Pronunciations of each sub word to reconstruct the transcriptions - strip_diacritics: List[str] - List of diacritics that were stripped out of the original IPA transcription - - Returns - ------- - List[CtmInterval] - Intervals with their original IPA pronunciation rather than the internal simplified form - """ - transcription = tuple(x.label for x in phones) - new_phones = [] - mapping_ind = 0 - transcription_ind = 0 - for pronunciations in subpronunciations: - pron = None - if mapping_ind >= len(phones): - break - for p in pronunciations: - if ( - "original_pronunciation" in p - and transcription == p["pronunciation"] == p["original_pronunciation"] - ) or (transcription == p["pronunciation"] and "original_pronunciation" not in p): - new_phones.extend(phones) - mapping_ind += len(phones) - break - if ( - p["pronunciation"] - == transcription[transcription_ind : transcription_ind + len(p["pronunciation"])] - and pron is None - ): - pron = p - if mapping_ind >= len(phones): - break - if not pron: - new_phones.extend(phones) - mapping_ind += len(phones) - break - to_extend = phones[transcription_ind : transcription_ind + len(pron["pronunciation"])] - transcription_ind += len(pron["pronunciation"]) - p = pron - if "original_pronunciation" not in p or p["pronunciation"] == p["original_pronunciation"]: - - new_phones.extend(to_extend) - mapping_ind += len(to_extend) - break - for pi in p["original_pronunciation"]: - if pi == phones[mapping_ind].label: - new_phones.append(phones[mapping_ind]) - else: - modded_phone = 
pi - new_p = phones[mapping_ind].label - for diacritic in strip_diacritics: - modded_phone = modded_phone.replace(diacritic, "") - if modded_phone == new_p: - phones[mapping_ind].label = pi - new_phones.append(phones[mapping_ind]) - elif mapping_ind != len(phones) - 1: - new_p = phones[mapping_ind].label + phones[mapping_ind + 1].label - if modded_phone == new_p: - new_phones.append( - CtmInterval( - phones[mapping_ind].begin, - phones[mapping_ind + 1].end, - new_p, - phones[mapping_ind].utterance, - ) - ) - mapping_ind += 1 - mapping_ind += 1 - return new_phones - - -def ctms_to_textgrids_non_mp(aligner: AlignerType) -> None: +def ctms_to_textgrids_non_mp(aligner: Aligner) -> None: """ Parse CTM files to TextGrids without using multiprocessing Parameters ---------- - aligner: AlignerType + aligner: :class:`~montreal_forced_aligner.aligner.base.BaseAligner` Aligner that generated the CTM files """ @@ -593,7 +308,7 @@ def generate_tiers( Returns ------- Dict[Speaker, Dict[str, CtmType]] - Tier information per speaker, with CtmIntervals split by "phones" and "words" + Tier information per speaker, with :class:`~montreal_forced_aligner.data.CtmInterval` split by "phones" and "words" """ output = {} @@ -605,39 +320,32 @@ def generate_tiers( words = [] phones = [] - if dictionary_data.multilingual_ipa and cleanup_textgrids: + if dictionary_data.dictionary_config.multilingual_ipa and cleanup_textgrids: phone_ind = 0 for interval in u.word_labels: end = interval.end word = interval.label - subwords = _lookup( + subwords = dictionary_data.lookup( word, - dictionary_data.words_mapping, - dictionary_data.punctuation, - dictionary_data.clitic_set, - dictionary_data.clitic_markers, - dictionary_data.compound_markers, ) subwords = [ - x if x in dictionary_data.words_mapping else dictionary_data.oov_code + x + if x in dictionary_data.words_mapping + else dictionary_data.dictionary_config.oov_word for x in subwords ] subprons = [dictionary_data.words[x] for x in subwords] 
cur_phones = [] while u.phone_labels[phone_ind].end <= end: p = u.phone_labels[phone_ind] - if p.label in dictionary_data.silences: + if p.label in dictionary_data.dictionary_config.silence_phones: phone_ind += 1 continue cur_phones.append(p) phone_ind += 1 if phone_ind > len(u.phone_labels) - 1: break - phones.extend( - map_to_original_pronunciation( - cur_phones, subprons, dictionary_data.strip_diacritics - ) - ) + phones.extend(dictionary_data.map_to_original_pronunciation(cur_phones, subprons)) if not word: continue @@ -646,7 +354,10 @@ def generate_tiers( for interval in u.word_labels: words.append(interval) for interval in u.phone_labels: - if interval.label in dictionary_data.silences and cleanup_textgrids: + if ( + interval.label in dictionary_data.dictionary_config.silence_phones + and cleanup_textgrids + ): continue phones.append(interval) if speaker not in output: @@ -673,8 +384,8 @@ def export_textgrid( File object to export output_path: str Output path of the file - speaker_data: Dict[Speaker, Dict[str, CtmType]] - Per speaker, per word/phone CtmIntervals + speaker_data: Dict[Speaker, Dict[str, List[:class:`~montreal_forced_aligner.data.CtmInterval`]]] + Per speaker, per word/phone :class:`~montreal_forced_aligner.data.CtmInterval` frame_shift: int Frame shift of features, in ms first_file_write: bool, optional @@ -743,7 +454,7 @@ def export_textgrid( tg.save(output_path, includeBlankSpaces=True, format="long_textgrid", reportingMode="error") -def ctm_to_textgrid(file: File, aligner: AlignerType, first_file_write=True) -> None: +def ctm_to_textgrid(file: File, aligner: Aligner, first_file_write=True) -> None: """ Export a File to TextGrid @@ -751,7 +462,7 @@ def ctm_to_textgrid(file: File, aligner: AlignerType, first_file_write=True) -> ---------- file: File File to export - aligner: AlignerType + aligner: :class:`~montreal_forced_aligner.aligner.base.BaseAligner` or :class:`~montreal_forced_aligner.trainers.BaseTrainer` Aligner used to generate the
alignments first_file_write: bool, optional Flag for whether this is the first time touching this file diff --git a/montreal_forced_aligner/trainers/__init__.py b/montreal_forced_aligner/trainers/__init__.py index e8d8a5c0..8a03609b 100644 --- a/montreal_forced_aligner/trainers/__init__.py +++ b/montreal_forced_aligner/trainers/__init__.py @@ -1,4 +1,9 @@ -"""Class definitions for acoustic model trainers in MFA""" +""" +Training acoustic models +======================== + + +""" from .base import BaseTrainer # noqa from .ivector_extractor import IvectorExtractorTrainer # noqa from .lda import LdaTrainer # noqa @@ -13,4 +18,17 @@ "MonophoneTrainer", "SatTrainer", "TriphoneTrainer", + "base", + "ivector_extractor", + "lda", + "monophone", + "sat", + "triphone", ] + +BaseTrainer.__module__ = "montreal_forced_aligner.trainers" +IvectorExtractorTrainer.__module__ = "montreal_forced_aligner.trainers" +LdaTrainer.__module__ = "montreal_forced_aligner.trainers" +MonophoneTrainer.__module__ = "montreal_forced_aligner.trainers" +SatTrainer.__module__ = "montreal_forced_aligner.trainers" +TriphoneTrainer.__module__ = "montreal_forced_aligner.trainers" diff --git a/montreal_forced_aligner/trainers/base.py b/montreal_forced_aligner/trainers/base.py index 09c1ed3b..6ba82361 100644 --- a/montreal_forced_aligner/trainers/base.py +++ b/montreal_forced_aligner/trainers/base.py @@ -9,10 +9,11 @@ from tqdm import tqdm +from ..abc import Aligner, MetaDict, Trainer from ..config import FeatureConfig from ..exceptions import KaldiProcessingError, TrainerError from ..models import AcousticModel -from ..multiprocessing import ( +from ..multiprocessing.alignment import ( acc_stats, align, compile_information, @@ -23,21 +24,26 @@ from ..utils import log_kaldi_errors, parse_logs if TYPE_CHECKING: - from ..config import ConfigDict from ..corpus import Corpus - from ..dictionary import DictionaryType - from ..models import MetaDict + from ..dictionary import MultispeakerDictionary __all__ = 
["BaseTrainer"] -class BaseTrainer: +class BaseTrainer(Aligner, Trainer): """ Base trainer class for training acoustic models and ivector extractors + Parameters + ---------- + default_feature_config: :class:`~montreal_forced_aligner.config.FeatureConfig` + Default feature config + Attributes ---------- + feature_config : :class:`~montreal_forced_aligner.config.FeatureConfig` + Feature configuration num_iterations : int Number of training iterations to perform, defaults to 40 transition_scale : float @@ -78,7 +84,7 @@ class BaseTrainer: def __init__(self, default_feature_config: FeatureConfig): self.logger = None - self.dictionary = None + self.dictionary: Optional[MultispeakerDictionary] = None self.transition_scale = 1.0 self.acoustic_scale = 0.1 self.self_loop_scale = 0.1 @@ -145,12 +151,12 @@ def working_log_directory(self) -> str: return self.log_directory @property - def fmllr_options(self) -> ConfigDict: + def fmllr_options(self) -> MetaDict: """Options for fMLLR calculation, only used by SatTrainer""" raise NotImplementedError @property - def lda_options(self) -> ConfigDict: + def lda_options(self) -> MetaDict: """Options for LDA calculation, only used by LdaTrainer""" raise NotImplementedError @@ -170,6 +176,11 @@ def current_model_path(self): return os.path.join(self.working_directory, "final.mdl") return os.path.join(self.working_directory, f"{self.iteration}.mdl") + @property + def model_path(self) -> str: + """Current acoustic model path""" + return self.current_model_path + @property def next_model_path(self): """Next iteration's acoustic model path""" @@ -219,11 +230,11 @@ def gaussian_increment(self) -> int: return int((self.max_gaussians - self.initial_gaussians) / self.final_gaussian_iteration) @property - def align_options(self) -> ConfigDict: + def align_options(self) -> MetaDict: """Options for alignment""" options_silence_csl = "" if self.dictionary: - options_silence_csl = self.dictionary.optional_silence_csl + options_silence_csl = 
self.dictionary.config.optional_silence_csl return { "beam": self.beam, "retry_beam": self.retry_beam, @@ -276,7 +287,7 @@ def _setup_for_init( identifier: str, temporary_directory: str, corpus: Corpus, - dictionary: DictionaryType, + dictionary: MultispeakerDictionary, previous_trainer: Optional[BaseTrainer], ) -> None: """ @@ -288,11 +299,11 @@ def _setup_for_init( Identifier for the training block temporary_directory: str Root temporary directory to save - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to use - dictionary: DictionaryType - Dictionary to use - previous_trainer: :class:`~montreal_forced_aligner.trainers.base.BaseTrainer`, optional + dictionary: :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` + MultispeakerDictionary to use + previous_trainer: :class:`~montreal_forced_aligner.trainers.BaseTrainer`, optional Previous trainer to initialize from Raises @@ -344,7 +355,7 @@ def init_training( identifier: str, temporary_directory: str, corpus: Corpus, - dictionary: DictionaryType, + dictionary: MultispeakerDictionary, previous_trainer: Optional[BaseTrainer], ) -> None: """ @@ -356,11 +367,11 @@ def init_training( Identifier for the training block temporary_directory: str Root temporary directory to save - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to use - dictionary: DictionaryType - Dictionary to use - previous_trainer: :class:`~montreal_forced_aligner.trainers.base.BaseTrainer`, optional + dictionary: :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` + MultispeakerDictionary to use + previous_trainer: :class:`~montreal_forced_aligner.trainers.BaseTrainer`, optional Previous trainer to initialize from """ raise NotImplementedError @@ -530,16 +541,16 @@ def meta(self) -> MetaDict: from ..utils import get_mfa_version data = { - "phones": 
sorted(self.dictionary.nonsil_phones), + "phones": sorted(self.dictionary.config.non_silence_phones), "version": get_mfa_version(), "architecture": self.architecture, "train_date": str(datetime.now()), "features": self.feature_config.params(), - "multilingual_ipa": self.dictionary.multilingual_ipa, + "multilingual_ipa": self.dictionary.config.multilingual_ipa, } - if self.dictionary.multilingual_ipa: - data["strip_diacritics"] = self.dictionary.strip_diacritics - data["digraphs"] = self.dictionary.digraphs + if self.dictionary.config.multilingual_ipa: + data["strip_diacritics"] = self.dictionary.config.strip_diacritics + data["digraphs"] = self.dictionary.config.digraphs return data def export_textgrids(self) -> None: diff --git a/montreal_forced_aligner/trainers/ivector_extractor.py b/montreal_forced_aligner/trainers/ivector_extractor.py index f66a5eb0..0c913979 100644 --- a/montreal_forced_aligner/trainers/ivector_extractor.py +++ b/montreal_forced_aligner/trainers/ivector_extractor.py @@ -9,9 +9,10 @@ from tqdm import tqdm +from ..abc import IvectorExtractor, MetaDict from ..exceptions import KaldiProcessingError from ..helper import load_scp -from ..models import IvectorExtractor +from ..models import IvectorExtractorModel from ..multiprocessing.ivector import ( acc_global_stats, acc_ivector_stats, @@ -23,11 +24,10 @@ from .base import BaseTrainer if TYPE_CHECKING: + from ..abc import Dictionary from ..aligner import PretrainedAligner from ..config import FeatureConfig from ..corpus import Corpus - from ..dictionary import DictionaryType - from ..models import MetaDict IvectorConfigType = Dict[str, Any] @@ -36,7 +36,7 @@ __all__ = ["IvectorExtractorTrainer"] -class IvectorExtractorTrainer(BaseTrainer): +class IvectorExtractorTrainer(BaseTrainer, IvectorExtractor): """ Trainer for IvectorExtractor @@ -123,7 +123,7 @@ def ivector_options(self) -> MetaDict: "silence_weight": self.silence_weight, "max_count": self.max_count, "ivector_dimension": 
self.ivector_dimension, - "sil_phones": self.dictionary.silence_csl, + "sil_phones": self.dictionary.config.silence_csl, } @property @@ -268,7 +268,7 @@ def init_training( identifier: str, temporary_directory: str, corpus: Corpus, - dictionary: DictionaryType, + dictionary: Dictionary, previous_trainer: Optional[PretrainedAligner] = None, ) -> None: """ @@ -280,11 +280,11 @@ def init_training( Identifier for the training block temporary_directory: str Root temporary directory to save - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to use - dictionary: DictionaryType - Dictionary to use - previous_trainer: :class:`~montreal_forced_aligner.trainers.base.BaseTrainer`, optional + dictionary: :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` + MultispeakerDictionary to use + previous_trainer: :class:`~montreal_forced_aligner.trainers.BaseTrainer`, optional Previous trainer to initialize from """ self._setup_for_init(identifier, temporary_directory, corpus, dictionary, previous_trainer) @@ -359,6 +359,12 @@ def align(self, subset: Optional[int] = None): """Overwrite align function to export IvectorExtractor to align directory""" self.save(os.path.join(self.align_directory, "ivector_extractor.zip")) + def extract_ivectors(self) -> None: + """ + Extract ivectors for the corpus + """ + extract_ivectors(self) + def training_iteration(self): """ Run an iteration of training @@ -424,7 +430,7 @@ def save(self, path: str, root_directory: Optional[str] = None): """ directory, filename = os.path.split(path) basename, _ = os.path.splitext(filename) - ivector_extractor = IvectorExtractor.empty(basename, root_directory) + ivector_extractor = IvectorExtractorModel.empty(basename, root_directory) ivector_extractor.add_meta_file(self) ivector_extractor.add_model(self.train_directory) os.makedirs(directory, exist_ok=True) diff --git a/montreal_forced_aligner/trainers/lda.py 
b/montreal_forced_aligner/trainers/lda.py index 59979dac..7f436d31 100644 --- a/montreal_forced_aligner/trainers/lda.py +++ b/montreal_forced_aligner/trainers/lda.py @@ -5,6 +5,7 @@ import time from typing import TYPE_CHECKING, Optional +from ..abc import MetaDict, Trainer from ..exceptions import KaldiProcessingError from ..multiprocessing import ( acc_stats, @@ -19,8 +20,7 @@ if TYPE_CHECKING: from ..config import FeatureConfig from ..corpus import Corpus - from ..dictionary import DictionaryType - from .base import MetaDict, TrainerType + from ..dictionary import MultispeakerDictionary __all__ = ["LdaTrainer"] @@ -81,7 +81,7 @@ def lda_options(self) -> MetaDict: "lda_dimension": self.lda_dimension, "boost_silence": self.boost_silence, "random_prune": self.random_prune, - "silence_csl": self.dictionary.silence_csl, + "silence_csl": self.dictionary.config.silence_csl, } def init_training( @@ -89,8 +89,8 @@ def init_training( identifier: str, temporary_directory: str, corpus: Corpus, - dictionary: DictionaryType, - previous_trainer: Optional[TrainerType], + dictionary: MultispeakerDictionary, + previous_trainer: Optional[Trainer], ): """ Initialize LDA training @@ -101,11 +101,11 @@ def init_training( Identifier for the training block temporary_directory: str Root temporary directory to save - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to use - dictionary: DictionaryType - Dictionary to use - previous_trainer: TrainerType, optional + dictionary: Dictionary + Pronunciation dictionary to use + previous_trainer: Trainer, optional Previous trainer to initialize from Raises diff --git a/montreal_forced_aligner/trainers/monophone.py b/montreal_forced_aligner/trainers/monophone.py index 2216a6db..dc35d686 100644 --- a/montreal_forced_aligner/trainers/monophone.py +++ b/montreal_forced_aligner/trainers/monophone.py @@ -15,7 +15,7 @@ if TYPE_CHECKING: from ..config import FeatureConfig from 
..corpus import Corpus - from ..dictionary import DictionaryType + from ..dictionary import MultispeakerDictionary __all__ = ["MonophoneTrainer"] @@ -59,33 +59,12 @@ def phone_type(self) -> str: """Phone type""" return "monophone" - def get_num_gauss(self) -> int: - """ - Get the number of gaussians for a monophone model - - Returns - ------- - int - Initial number of gaussians - """ - with open(os.devnull, "w") as devnull: - proc = subprocess.Popen( - [thirdparty_binary("gmm-info"), "--print-args=false", self.current_model_path], - stderr=devnull, - stdout=subprocess.PIPE, - ) - stdout, stderr = proc.communicate() - num = stdout.decode("utf8") - matches = re.search(r"gaussians (\d+)", num) - num = int(matches.groups()[0]) - return num - def init_training( self, identifier: str, temporary_directory: str, corpus: Corpus, - dictionary: DictionaryType, + dictionary: MultispeakerDictionary, previous_trainer: Optional[BaseTrainer] = None, ) -> None: """ @@ -97,11 +76,11 @@ def init_training( Identifier for the training block temporary_directory: str Root temporary directory to save - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to use - dictionary: DictionaryType - Dictionary to use - previous_trainer: TrainerType, optional + dictionary: :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` + MultispeakerDictionary to use + previous_trainer: Trainer, optional Previous trainer to initialize from Raises @@ -123,7 +102,9 @@ def init_training( feat_dim = corpus.get_feat_dim() feature_string = corpus.jobs[0].construct_base_feature_string(corpus) - shared_phones_path = os.path.join(dictionary.phones_dir, "sets.int") + shared_phones_path = os.path.join( + dictionary.get_dictionary("default").phones_dir, "sets.int" + ) init_log_path = os.path.join(self.log_directory, "init.log") temp_feats_path = os.path.join(self.train_directory, "temp_feats") with open(init_log_path, "w") as 
log_file: @@ -141,17 +122,27 @@ def init_training( thirdparty_binary("gmm-init-mono"), f"--shared-phones={shared_phones_path}", f"--train-feats=ark:{temp_feats_path}", - os.path.join(dictionary.output_directory, "topo"), + os.path.join( + dictionary.get_dictionary("default").output_directory, "topo" + ), str(feat_dim), self.current_model_path, tree_path, ], stderr=log_file, ) + proc = subprocess.Popen( + [thirdparty_binary("gmm-info"), "--print-args=false", self.current_model_path], + stderr=log_file, + stdout=subprocess.PIPE, + ) + stdout, stderr = proc.communicate() + num = stdout.decode("utf8") + matches = re.search(r"gaussians (\d+)", num) + num_gauss = int(matches.groups()[0]) if os.path.exists(self.current_model_path): os.remove(init_log_path) os.remove(temp_feats_path) - num_gauss = self.get_num_gauss() self.initial_gaussians = num_gauss self.current_gaussians = num_gauss compile_train_graphs(self) diff --git a/montreal_forced_aligner/trainers/sat.py b/montreal_forced_aligner/trainers/sat.py index 0ef29988..8da160e9 100644 --- a/montreal_forced_aligner/trainers/sat.py +++ b/montreal_forced_aligner/trainers/sat.py @@ -7,6 +7,7 @@ import time from typing import TYPE_CHECKING, Optional +from ..abc import MetaDict from ..exceptions import KaldiProcessingError from ..multiprocessing import ( acc_stats, @@ -23,10 +24,9 @@ from .triphone import TriphoneTrainer if TYPE_CHECKING: - from ..config import ConfigDict, FeatureConfig + from ..abc import Dictionary, Trainer + from ..config import FeatureConfig from ..corpus import Corpus - from ..dictionary import DictionaryType - from .base import TrainerType __all__ = ["SatTrainer"] @@ -77,13 +77,13 @@ def train_type(self) -> str: return "sat" @property - def fmllr_options(self) -> ConfigDict: + def fmllr_options(self) -> MetaDict: """Options for calculating fMLLR transforms""" return { "fmllr_update_type": self.fmllr_update_type, "debug": self.debug, "initial": self.initial_fmllr, - "silence_csl": 
self.dictionary.silence_csl, + "silence_csl": self.dictionary.config.silence_csl, } @property @@ -215,8 +215,8 @@ def init_training( identifier: str, temporary_directory: str, corpus: Corpus, - dictionary: DictionaryType, - previous_trainer: Optional[TrainerType], + dictionary: Dictionary, + previous_trainer: Optional[Trainer], ) -> None: """ Initialize speaker-adapted triphone training @@ -227,11 +227,11 @@ def init_training( Identifier for the training block temporary_directory: str Root temporary directory to save - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to use - dictionary: DictionaryType - Dictionary to use - previous_trainer: :class:`~montreal_forced_aligner.trainers.base.BaseTrainer`, optional + dictionary: :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` + MultispeakerDictionary to use + previous_trainer: Trainer, optional Previous trainer to initialize from """ self.feature_config.fmllr = False @@ -262,7 +262,7 @@ def init_training( extra_question_int_path = os.path.join( self.dictionary.phones_dir, "extra_questions.int" ) - topo_path = os.path.join(self.dictionary.output_directory, "topo") + topo_path = self.dictionary.topo_path questions_path = os.path.join(self.train_directory, "questions.int") questions_qst_path = os.path.join(self.train_directory, "questions.qst") with open(log_path, "w") as log_file: diff --git a/montreal_forced_aligner/trainers/triphone.py b/montreal_forced_aligner/trainers/triphone.py index bd4a7900..7c7efff5 100644 --- a/montreal_forced_aligner/trainers/triphone.py +++ b/montreal_forced_aligner/trainers/triphone.py @@ -12,10 +12,9 @@ from .base import BaseTrainer if TYPE_CHECKING: + from ..abc import Dictionary, Trainer from ..config import FeatureConfig from ..corpus import Corpus - from ..dictionary import DictionaryType - from .base import TrainerType __all__ = ["TriphoneTrainer"] @@ -29,8 +28,6 @@ class 
TriphoneTrainer(BaseTrainer): ---------- num_iterations : int Number of training iterations to perform, defaults to 40 - max_gaussians : int - Total number of gaussians, defaults to 1000 num_leaves : int Number of states in the decision tree, defaults to 1000 max_gaussians : int @@ -88,7 +85,7 @@ def _setup_tree(self) -> None: extra_question_int_path = os.path.join( self.dictionary.phones_dir, "extra_questions.int" ) - topo_path = os.path.join(self.dictionary.output_directory, "topo") + topo_path = self.dictionary.topo_path questions_path = os.path.join(self.train_directory, "questions.int") questions_qst_path = os.path.join(self.train_directory, "questions.qst") with open(log_path, "w") as log_file: @@ -184,8 +181,8 @@ def init_training( identifier: str, temporary_directory: str, corpus: Corpus, - dictionary: DictionaryType, - previous_trainer: Optional[TrainerType], + dictionary: Dictionary, + previous_trainer: Optional[Trainer], ): """ Initialize triphone training @@ -196,11 +193,11 @@ def init_training( Identifier for the training block temporary_directory: str Root temporary directory to save - corpus: :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus: :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to use - dictionary: DictionaryType + dictionary: MultispeakerDictionary Dictionary to use - previous_trainer: TrainerType, optional + previous_trainer: Trainer, optional Previous trainer to initialize from """ self._setup_for_init(identifier, temporary_directory, corpus, dictionary, previous_trainer) diff --git a/montreal_forced_aligner/transcriber.py b/montreal_forced_aligner/transcriber.py index 4d4042b7..b7a8a289 100644 --- a/montreal_forced_aligner/transcriber.py +++ b/montreal_forced_aligner/transcriber.py @@ -1,4 +1,8 @@ -"""Class definitions for the MFA transcriber""" +""" +Transcription +============= + +""" from __future__ import annotations import multiprocessing as mp @@ -7,6 +11,7 @@ import subprocess from typing import 
TYPE_CHECKING, Optional, Tuple +from .abc import Transcriber as ABCTranscriber from .config import TEMP_DIR from .exceptions import KaldiProcessingError from .helper import score @@ -22,22 +27,22 @@ from logging import Logger from .config.transcribe_config import TranscribeConfig - from .corpus import TranscribeCorpus - from .dictionary import DictionaryType + from .corpus import Corpus + from .dictionary import MultispeakerDictionary from .models import AcousticModel, LanguageModel __all__ = ["Transcriber"] -class Transcriber: +class Transcriber(ABCTranscriber): """ Class for performing transcription. Parameters ---------- - corpus : :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus to transcribe - dictionary: :class:`~montreal_forced_aligner.dictionary.Dictionary` + dictionary: :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` Pronunciation dictionary to use as a lexicon acoustic_model : :class:`~montreal_forced_aligner.models.AcousticModel` Acoustic model to use @@ -64,8 +69,8 @@ class Transcriber: def __init__( self, - corpus: TranscribeCorpus, - dictionary: DictionaryType, + corpus: Corpus, + dictionary: MultispeakerDictionary, acoustic_model: AcousticModel, language_model: LanguageModel, transcribe_config: TranscribeConfig, @@ -153,7 +158,7 @@ def alignment_model_path(self) -> str: def fmllr_options(self): """Options for computing fMLLR transforms""" data = self.transcribe_config.fmllr_options - data["sil_phones"] = self.dictionary.silence_csl + data["sil_phones"] = self.dictionary.config.silence_csl return data @property diff --git a/montreal_forced_aligner/utils.py b/montreal_forced_aligner/utils.py index 2c94e113..d045c876 100644 --- a/montreal_forced_aligner/utils.py +++ b/montreal_forced_aligner/utils.py @@ -1,4 +1,8 @@ -"""Utility functions for Montreal Forced Aligner""" +""" +Utility functions +================= + +""" from __future__ import annotations import 
logging @@ -122,7 +126,7 @@ def log_kaldi_errors(error_logs: List[str], logger: logging.Logger) -> None: logger: :class:`~logging.Logger` Logger to output to """ - logger.debug("There were {} kaldi processing files that had errors:".format(len(error_logs))) + logger.debug(f"There were {len(error_logs)} kaldi processing files that had errors:") for path in error_logs: logger.debug("") logger.debug(path) @@ -444,7 +448,7 @@ def setup_logger( handler.setLevel(getattr(logging, console_level.upper())) handler.setFormatter(CustomFormatter()) logger.addHandler(handler) - + logger.debug(f"Set up logger for MFA version: {get_mfa_version()}") return logger @@ -456,7 +460,7 @@ def log_config(logger: logging.Logger, config: Union[Dict[str, Any], BaseConfig] ---------- logger: :class:`~logging.Logger` Logger to save to - config: Dict[str, Any] + config: Dict[str, Any] or :class:`~montreal_forced_aligner.config.BaseConfig` Configuration to dump """ stream = yaml.dump(config) diff --git a/montreal_forced_aligner/validator.py b/montreal_forced_aligner/validator.py index 118403e4..c8fd7b84 100644 --- a/montreal_forced_aligner/validator.py +++ b/montreal_forced_aligner/validator.py @@ -1,4 +1,8 @@ -"""Class definition for MFA's validator""" +""" +Validating corpora +================== + +""" from __future__ import annotations import logging @@ -6,6 +10,7 @@ from decimal import Decimal from typing import TYPE_CHECKING, Optional +from .abc import AcousticModelWorker from .aligner.pretrained import PretrainedAligner from .config import FeatureConfig from .exceptions import CorpusError, KaldiProcessingError @@ -16,23 +21,23 @@ from .utils import log_kaldi_errors if TYPE_CHECKING: - from .corpus import Corpus - from .dictionary import DictionaryType + from .corpus.base import Corpus + from .dictionary import MultispeakerDictionary __all__ = ["CorpusValidator"] -class CorpusValidator: +class CorpusValidator(AcousticModelWorker): """ - Aligner that aligns and trains acoustics models on a 
large dataset + Validator class for checking whether a corpus, a dictionary, and (optionally) an acoustic model work together Parameters ---------- - corpus : :class:`~montreal_forced_aligner.corpus.base.Corpus` + corpus : :class:`~montreal_forced_aligner.corpus.Corpus` Corpus object for the dataset - dictionary : :class:`~montreal_forced_aligner.dictionary.Dictionary` - Dictionary object for the pronunciation dictionary + dictionary : :class:`~montreal_forced_aligner.dictionary.MultispeakerDictionary` + MultispeakerDictionary object for the pronunciation dictionary temp_directory : str, optional Specifies the temporary directory root to save files need for Kaldi. If not specified, it will be set to ``~/Documents/MFA`` @@ -44,6 +49,15 @@ class CorpusValidator: Flag for whether to use multiprocessing logger: :class:`~logging.Logger`, optional Logger to use + + Attributes + ---------- + corpus_analysis_template: str + Template for output message + alignment_analysis_template: str + Template for output message + transcription_analysis_template: str + Template for output message """ corpus_analysis_template = """ @@ -98,15 +112,14 @@ class CorpusValidator: def __init__( self, corpus: Corpus, - dictionary: DictionaryType, + dictionary: MultispeakerDictionary, temp_directory: Optional[str] = None, ignore_acoustics: bool = False, test_transcriptions: bool = False, use_mp: bool = True, logger: Optional[logging.Logger] = None, ): - self.dictionary = dictionary - self.corpus = corpus + super().__init__(corpus, dictionary) self.temp_directory = temp_directory self.test_transcriptions = test_transcriptions self.ignore_acoustics = ignore_acoustics @@ -116,6 +129,14 @@ def __init__( self.trainer.update({"use_mp": use_mp}) self.setup() + @property + def working_directory(self) -> str: + return os.path.join(self.temp_directory, "validation") + + @property + def working_log_directory(self) -> str: + return os.path.join(self.working_directory, "log") + def setup(self): """ Set up 
the corpus and validator @@ -442,11 +463,13 @@ def test_utterance_transcriptions(self): run_non_mp(test_utterances_func, jobs, log_directory) self.logger.info("Finished decoding utterances!") - word_mapping = self.dictionary.reversed_word_mapping errors = {} for job in jobs: for dict_name in job.dictionaries: + word_mapping = self.dictionary.dictionary_mapping[ + dict_name + ].reversed_word_mapping aligned_int = load_scp(job.out_int_paths[dict_name]) for utt, line in sorted(aligned_int.items()): text = [] diff --git a/rtd_environment.yml b/rtd_environment.yml index 44ab6744..611790d0 100644 --- a/rtd_environment.yml +++ b/rtd_environment.yml @@ -10,10 +10,12 @@ dependencies: - colorama - pyyaml - pip - - interrogate - - sphinx - - sphinx_rtd_theme - - sphinx-automodapi - pip: - praatio >= 5.0 - sphinxemoji + - sphinxcontrib-autoprogram + - git+https://github.com/pydata/pydata-sphinx-theme.git + - sphinx-panels + - interrogate + - sphinx + - numpydoc diff --git a/setup.cfg b/setup.cfg index e36be858..da4dc1c1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,6 @@ [metadata] name = Montreal Forced Aligner -description = Montreal Forced Aligner is a package for aligning speech corpora through the use of acoustic models and - dictionaries using Kaldi functionality. +description = Montreal Forced Aligner is a package for aligning speech corpora using Kaldi functionality. 
long_description = file: README.md long_description_content_type = text/markdown url = https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner diff --git a/tests/conftest.py b/tests/conftest.py index 943e1019..f7b87b6b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,7 +18,7 @@ from montreal_forced_aligner.config import align_yaml_to_config, train_yaml_to_config from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import Dictionary +from montreal_forced_aligner.dictionary import MultispeakerDictionary @pytest.fixture(scope="session") @@ -87,12 +87,19 @@ def english_acoustic_model(): @pytest.fixture(scope="session") -def english_pretrained_dictionary(): - from montreal_forced_aligner.command_line.model import download_model - from montreal_forced_aligner.utils import get_dictionary_path +def english_dictionary(): + from montreal_forced_aligner.command_line.model import download_model, get_pretrained_path download_model("dictionary", "english") - return get_dictionary_path("english") + return get_pretrained_path("dictionary", "english") + + +@pytest.fixture(scope="session") +def basic_dictionary_config(): + from montreal_forced_aligner.config.dictionary_config import DictionaryConfig + + config = DictionaryConfig(debug=True) + return config @pytest.fixture(scope="session") @@ -105,10 +112,10 @@ def english_ipa_acoustic_model(): @pytest.fixture(scope="session") def english_us_ipa_dictionary(): - from montreal_forced_aligner.command_line.model import download_model + from montreal_forced_aligner.command_line.model import download_model, get_pretrained_path download_model("dictionary", "english_us_ipa") - return "english_us_ipa" + return get_pretrained_path("dictionary", "english_us_ipa") @pytest.fixture(scope="session") @@ -524,11 +531,6 @@ def basic_rootstxt_path(expected_dict_path): return os.path.join(expected_dict_path, "roots.txt") -# @pytest.fixture(scope='session') -# def 
basic_roots_path(expected_dict_path): -# return os.path.join(expected_dict_path, 'roots.txt') - - @pytest.fixture(scope="session") def basic_setsint_path(expected_dict_path): return os.path.join(expected_dict_path, "sets.int") @@ -594,17 +596,18 @@ def acoustic_corpus_textgrid_path(basic_dir): @pytest.fixture(scope="session") -def sick_dict(sick_dict_path, generated_dir): +def sick_dict(sick_dict_path, generated_dir, basic_dictionary_config): output_directory = os.path.join(generated_dir, "sickcorpus") - dictionary = Dictionary(sick_dict_path, output_directory) + + dictionary = MultispeakerDictionary(sick_dict_path, output_directory, basic_dictionary_config) dictionary.write() return dictionary @pytest.fixture(scope="session") -def sick_corpus(basic_corpus_dir, generated_dir): +def sick_corpus(basic_corpus_dir, generated_dir, basic_dictionary_config): output_directory = os.path.join(generated_dir, "sickcorpus") - corpus = Corpus(basic_corpus_dir, output_directory, num_jobs=2) + corpus = Corpus(basic_corpus_dir, output_directory, basic_dictionary_config, num_jobs=2) return corpus @@ -613,66 +616,11 @@ def textgrid_directory(test_dir): return os.path.join(test_dir, "textgrid") -@pytest.fixture(scope="session") -def large_dataset_directory(): - if os.environ.get("TRAVIS", False): - directory = os.path.expanduser("~/tools/mfa_test_data") - else: - test_dir = os.path.dirname(os.path.abspath(__file__)) - repo_dir = os.path.dirname(test_dir) - root_dir = os.path.dirname(repo_dir) - directory = os.path.join(root_dir, "mfa_test_data") - if not os.path.exists(directory): - pytest.skip("Couldn't find the mfa_test_data directory") - else: - return directory - - -@pytest.fixture(scope="session") -def large_dataset_dictionary(large_dataset_directory): - return os.path.join(large_dataset_directory, "librispeech-lexicon.txt") - - -@pytest.fixture(scope="session") -def large_prosodylab_format_directory(large_dataset_directory): - return os.path.join(large_dataset_directory, 
"prosodylab_format") - - -@pytest.fixture(scope="session") -def large_textgrid_format_directory(large_dataset_directory): - return os.path.join(large_dataset_directory, "textgrid_format") - - -@pytest.fixture(scope="session") -def prosodylab_output_directory(generated_dir): - return os.path.join(generated_dir, "prosodylab_output") - - -@pytest.fixture(scope="session") -def textgrid_output_directory(generated_dir): - return os.path.join(generated_dir, "textgrid_output") - - @pytest.fixture(scope="session") def mono_output_directory(generated_dir): return os.path.join(generated_dir, "mono_output") -@pytest.fixture(scope="session") -def single_speaker_prosodylab_format_directory(large_prosodylab_format_directory): - return os.path.join(large_prosodylab_format_directory, "121") - - -@pytest.fixture(scope="session") -def single_speaker_textgrid_format_directory(large_textgrid_format_directory): - return os.path.join(large_textgrid_format_directory, "121") - - -@pytest.fixture(scope="session") -def prosodylab_output_model_path(generated_dir): - return os.path.join(generated_dir, "prosodylab_output_model.zip") - - @pytest.fixture(scope="session") def textgrid_output_model_path(generated_dir): return os.path.join(generated_dir, "textgrid_output_model.zip") @@ -789,7 +737,7 @@ def mono_align_config_path(config_directory): @pytest.fixture(scope="session") def mono_align_config(mono_align_config_path): - return align_yaml_to_config(mono_align_config_path) + return align_yaml_to_config(mono_align_config_path)[0] @pytest.fixture(scope="session") diff --git a/tests/test_aligner.py b/tests/test_aligner.py index f84d3504..d45f2c34 100644 --- a/tests/test_aligner.py +++ b/tests/test_aligner.py @@ -14,8 +14,7 @@ def test_sick_mono( mono_align_config, mono_output_directory, ): - mono_train_config, align_config = mono_train_config - print(mono_train_config.training_configs[0].feature_config.use_mp) + mono_train_config, align_config, dictionary_config = mono_train_config data_directory 
= os.path.join(generated_dir, "temp", "mono_train_test") shutil.rmtree(data_directory, ignore_errors=True) a = TrainableAligner( @@ -36,7 +35,7 @@ def test_sick_mono( def test_sick_tri(sick_dict, sick_corpus, generated_dir, tri_train_config): - tri_train_config, align_config = tri_train_config + tri_train_config, align_config, dictionary_config = tri_train_config data_directory = os.path.join(generated_dir, "temp", "tri_test") shutil.rmtree(data_directory, ignore_errors=True) a = TrainableAligner( @@ -46,7 +45,7 @@ def test_sick_tri(sick_dict, sick_corpus, generated_dir, tri_train_config): def test_sick_lda(sick_dict, sick_corpus, generated_dir, lda_train_config): - lda_train_config, align_config = lda_train_config + lda_train_config, align_config, dictionary_config = lda_train_config data_directory = os.path.join(generated_dir, "temp", "lda_test") shutil.rmtree(data_directory, ignore_errors=True) a = TrainableAligner( @@ -56,7 +55,7 @@ def test_sick_lda(sick_dict, sick_corpus, generated_dir, lda_train_config): def test_sick_sat(sick_dict, sick_corpus, generated_dir, sat_train_config): - sat_train_config, align_config = sat_train_config + sat_train_config, align_config, dictionary_config = sat_train_config data_directory = os.path.join(generated_dir, "temp", "sat_test") shutil.rmtree(data_directory, ignore_errors=True) a = TrainableAligner( diff --git a/tests/test_commandline_adapt.py b/tests/test_commandline_adapt.py index ae16dd5f..90c22db9 100644 --- a/tests/test_commandline_adapt.py +++ b/tests/test_commandline_adapt.py @@ -8,7 +8,7 @@ def test_adapt_basic( basic_corpus_dir, sick_dict_path, generated_dir, - large_dataset_dictionary, + english_dictionary, temp_dir, basic_align_config, english_acoustic_model, @@ -17,7 +17,7 @@ def test_adapt_basic( command = [ "adapt", basic_corpus_dir, - large_dataset_dictionary, + english_dictionary, english_acoustic_model, adapted_model_path, "-t", diff --git a/tests/test_commandline_align.py b/tests/test_commandline_align.py 
index 6cbd52b8..c0ec166b 100644 --- a/tests/test_commandline_align.py +++ b/tests/test_commandline_align.py @@ -24,7 +24,7 @@ def test_align_arguments( basic_corpus_dir, sick_dict_path, generated_dir, - large_dataset_dictionary, + english_dictionary, temp_dir, english_acoustic_model, ): @@ -32,7 +32,7 @@ def test_align_arguments( command = [ "align", basic_corpus_dir, - large_dataset_dictionary, + english_dictionary, "english", os.path.join(generated_dir, "basic_output"), "-t", @@ -44,7 +44,7 @@ def test_align_arguments( ] args, unknown_args = parser.parse_known_args(command) print(args, unknown_args) - align_config = load_basic_align() + align_config, dictionary_config = load_basic_align() assert not align_config.disable_sat if unknown_args: align_config.update_from_unknown_args(unknown_args) @@ -56,7 +56,7 @@ def test_align_basic( basic_corpus_dir, sick_dict_path, generated_dir, - large_dataset_dictionary, + english_dictionary, temp_dir, basic_align_config, english_acoustic_model, @@ -82,7 +82,7 @@ def test_align_basic( command = [ "align", basic_corpus_dir, - large_dataset_dictionary, + english_dictionary, "english", output_directory, "-t", @@ -121,7 +121,7 @@ def test_align_basic( command = [ "align", basic_corpus_dir, - large_dataset_dictionary, + english_dictionary, "english", output_directory, "-t", @@ -144,7 +144,7 @@ def test_align_basic( command = [ "align", basic_corpus_dir, - large_dataset_dictionary, + english_dictionary, "english", output_directory, "-t", @@ -281,7 +281,7 @@ def test_align_stereo( stereo_corpus_dir, sick_dict_path, generated_dir, - large_dataset_dictionary, + english_dictionary, temp_dir, basic_align_config, english_acoustic_model, @@ -290,7 +290,7 @@ def test_align_stereo( command = [ "align", stereo_corpus_dir, - large_dataset_dictionary, + english_dictionary, "english", output_dir, "-t", diff --git a/tests/test_commandline_configure.py b/tests/test_commandline_configure.py index 62fd2021..bde41a3d 100644 --- 
a/tests/test_commandline_configure.py +++ b/tests/test_commandline_configure.py @@ -14,7 +14,7 @@ def test_configure( basic_corpus_dir, sick_dict_path, generated_dir, - large_dataset_dictionary, + english_dictionary, basic_align_config, english_acoustic_model, ): diff --git a/tests/test_commandline_g2p.py b/tests/test_commandline_g2p.py index b09396df..a54eb8fa 100644 --- a/tests/test_commandline_g2p.py +++ b/tests/test_commandline_g2p.py @@ -5,11 +5,14 @@ from montreal_forced_aligner.command_line.g2p import run_g2p from montreal_forced_aligner.command_line.mfa import parser from montreal_forced_aligner.command_line.train_g2p import run_train_g2p -from montreal_forced_aligner.dictionary import Dictionary +from montreal_forced_aligner.dictionary import PronunciationDictionary from montreal_forced_aligner.g2p.generator import G2P_DISABLED +from montreal_forced_aligner.models import DictionaryModel -def test_generate_pretrained(english_g2p_model, basic_corpus_dir, temp_dir, generated_dir): +def test_generate_pretrained( + english_g2p_model, basic_corpus_dir, temp_dir, generated_dir, basic_dictionary_config +): if G2P_DISABLED: pytest.skip("No Pynini found") output_path = os.path.join(generated_dir, "g2p_out.txt") @@ -30,7 +33,7 @@ def test_generate_pretrained(english_g2p_model, basic_corpus_dir, temp_dir, gene args, unknown = parser.parse_known_args(command) run_g2p(args, unknown) assert os.path.exists(output_path) - d = Dictionary(output_path, temp_dir) + d = PronunciationDictionary(DictionaryModel(output_path), temp_dir, basic_dictionary_config) assert len(d.words) > 0 @@ -56,7 +59,12 @@ def test_train_g2p(sick_dict_path, sick_g2p_model_path, temp_dir, train_g2p_conf def test_generate_dict( - basic_corpus_dir, sick_g2p_model_path, g2p_sick_output, temp_dir, g2p_config + basic_corpus_dir, + sick_g2p_model_path, + g2p_sick_output, + temp_dir, + g2p_config, + basic_dictionary_config, ): if G2P_DISABLED: pytest.skip("No Pynini found") @@ -76,12 +84,19 @@ def 
test_generate_dict( args, unknown = parser.parse_known_args(command) run_g2p(args, unknown) assert os.path.exists(g2p_sick_output) - d = Dictionary(g2p_sick_output, temp_dir) + d = PronunciationDictionary( + DictionaryModel(g2p_sick_output), temp_dir, basic_dictionary_config + ) assert len(d.words) > 0 def test_generate_dict_text_only( - basic_split_dir, sick_g2p_model_path, g2p_sick_output, temp_dir, g2p_config + basic_split_dir, + sick_g2p_model_path, + g2p_sick_output, + temp_dir, + g2p_config, + basic_dictionary_config, ): if G2P_DISABLED: pytest.skip("No Pynini found") @@ -102,11 +117,15 @@ def test_generate_dict_text_only( args, unknown = parser.parse_known_args(command) run_g2p(args, unknown) assert os.path.exists(g2p_sick_output) - d = Dictionary(g2p_sick_output, temp_dir) + d = PronunciationDictionary( + DictionaryModel(g2p_sick_output), temp_dir, basic_dictionary_config + ) assert len(d.words) > 0 -def test_generate_orthography_dict(basic_corpus_dir, orth_sick_output, temp_dir): +def test_generate_orthography_dict( + basic_corpus_dir, orth_sick_output, temp_dir, basic_dictionary_config +): if G2P_DISABLED: pytest.skip("No Pynini found") command = [ @@ -124,5 +143,7 @@ def test_generate_orthography_dict(basic_corpus_dir, orth_sick_output, temp_dir) args, unknown = parser.parse_known_args(command) run_g2p(args, unknown) assert os.path.exists(orth_sick_output) - d = Dictionary(orth_sick_output, temp_dir) + d = PronunciationDictionary( + DictionaryModel(orth_sick_output), temp_dir, basic_dictionary_config + ) assert len(d.words) > 0 diff --git a/tests/test_commandline_train.py b/tests/test_commandline_train.py index 80afbfa7..b598276c 100644 --- a/tests/test_commandline_train.py +++ b/tests/test_commandline_train.py @@ -1,12 +1,9 @@ import os -import pytest - from montreal_forced_aligner.command_line.mfa import parser from montreal_forced_aligner.command_line.train_acoustic_model import run_train_acoustic_model -# @pytest.mark.skip(reason='Optimization') def 
test_train_and_align_basic( basic_corpus_dir, sick_dict_path, @@ -37,7 +34,6 @@ def test_train_and_align_basic( assert os.path.exists(textgrid_output_model_path) -@pytest.mark.skip(reason="Optimization") def test_train_and_align_basic_speaker_dict( basic_corpus_dir, speaker_dictionary_path, diff --git a/tests/test_commandline_train_ivector.py b/tests/test_commandline_train_ivector.py index 3b20f6b2..498b4251 100644 --- a/tests/test_commandline_train_ivector.py +++ b/tests/test_commandline_train_ivector.py @@ -6,11 +6,10 @@ ) -# @pytest.mark.skip(reason='Optimization') def test_basic_ivector( basic_corpus_dir, generated_dir, - large_dataset_dictionary, + english_dictionary, temp_dir, train_ivector_config, english_acoustic_model, @@ -19,7 +18,7 @@ def test_basic_ivector( command = [ "train_ivector", basic_corpus_dir, - large_dataset_dictionary, + english_dictionary, "english", ivector_output_model_path, "-t", diff --git a/tests/test_commandline_validate.py b/tests/test_commandline_validate.py index 231c8abd..2906d86b 100644 --- a/tests/test_commandline_validate.py +++ b/tests/test_commandline_validate.py @@ -2,13 +2,15 @@ from montreal_forced_aligner.command_line.validate import run_validate_corpus -def test_validate_corpus(large_prosodylab_format_directory, large_dataset_dictionary, temp_dir): +def test_validate_corpus( + multilingual_ipa_tg_corpus_dir, english_ipa_acoustic_model, english_us_ipa_dictionary, temp_dir +): command = [ "validate", - large_prosodylab_format_directory, - large_dataset_dictionary, - "english", + multilingual_ipa_tg_corpus_dir, + english_us_ipa_dictionary, + english_ipa_acoustic_model, "-t", temp_dir, "-q", @@ -16,8 +18,6 @@ def test_validate_corpus(large_prosodylab_format_directory, large_dataset_dictio "--debug", "--disable_mp", "--test_transcriptions", - "-j", - "0", ] args, unknown = parser.parse_known_args(command) run_validate_corpus(args) diff --git a/tests/test_config.py b/tests/test_config.py index 3736ef3a..93bd3d4c 100644 --- 
a/tests/test_config.py +++ b/tests/test_config.py @@ -62,7 +62,7 @@ def test_load_align(config_directory, mono_align_config_path): def test_load_basic_train(config_directory, basic_train_config): - training_config, align_config = train_yaml_to_config(basic_train_config) + training_config, align_config, dictioanry_config = train_yaml_to_config(basic_train_config) assert align_config.beam == 100 assert align_config.retry_beam == 400 assert align_config.align_options["beam"] == 100 @@ -76,7 +76,7 @@ def test_load_basic_train(config_directory, basic_train_config): def test_load_mono_train(config_directory, mono_train_config_path): - train, align = train_yaml_to_config(mono_train_config_path) + train, align, dictioanry_config = train_yaml_to_config(mono_train_config_path) for t in train.training_configs: assert not t.use_mp assert not t.feature_config.use_mp @@ -87,7 +87,7 @@ def test_load_mono_train(config_directory, mono_train_config_path): def test_load_ivector_train(config_directory, train_ivector_config): - train, align = train_yaml_to_config(train_ivector_config) + train, align, dictioanry_config = train_yaml_to_config(train_ivector_config) for t in train.training_configs: assert not t.use_mp assert not t.feature_config.use_mp @@ -98,7 +98,7 @@ def test_load_ivector_train(config_directory, train_ivector_config): def test_load(config_directory): path = os.path.join(config_directory, "basic_train_config.yaml") - train, align = train_yaml_to_config(path) + train, align, dictionary_config = train_yaml_to_config(path) assert len(train.training_configs) == 4 assert isinstance(train.training_configs[0], MonophoneTrainer) assert isinstance(train.training_configs[1], TriphoneTrainer) @@ -106,14 +106,14 @@ def test_load(config_directory): path = os.path.join(config_directory, "out_of_order_config.yaml") with pytest.raises(ConfigError): - train, align = train_yaml_to_config(path) + train, align, dictionary_config = train_yaml_to_config(path) def 
test_multilingual_ipa(config_directory): - from montreal_forced_aligner.config.base_config import DEFAULT_STRIP_DIACRITICS + from montreal_forced_aligner.config.dictionary_config import DEFAULT_STRIP_DIACRITICS path = os.path.join(config_directory, "basic_ipa_config.yaml") - train, align = train_yaml_to_config(path) - assert align.multilingual_ipa - assert set(align.strip_diacritics) == set(DEFAULT_STRIP_DIACRITICS) - assert align.digraphs == ["[dt][szʒʃʐʑʂɕç]", "[a][job_name][u]"] + train, align, dictionary_config = train_yaml_to_config(path) + assert dictionary_config.multilingual_ipa + assert set(dictionary_config.strip_diacritics) == set(DEFAULT_STRIP_DIACRITICS) + assert dictionary_config.digraphs == ["[dt][szʒʃʐʑʂɕç]", "[a][job_name][u]"] diff --git a/tests/test_corpus.py b/tests/test_corpus.py index e1f1c750..bf8e6efd 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -7,7 +7,7 @@ from montreal_forced_aligner.corpus import Corpus from montreal_forced_aligner.corpus.classes import File, Speaker, Utterance from montreal_forced_aligner.corpus.helper import get_wav_info -from montreal_forced_aligner.dictionary import Dictionary +from montreal_forced_aligner.dictionary import MultispeakerDictionary from montreal_forced_aligner.exceptions import SoxError @@ -46,41 +46,41 @@ def test_basic(basic_dict_path, basic_corpus_dir, generated_dir, default_feature output_directory = os.path.join(generated_dir, "basic") if os.path.exists(output_directory): shutil.rmtree(output_directory, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, output_directory) + dictionary = MultispeakerDictionary(basic_dict_path, output_directory) dictionary.write() c = Corpus(basic_corpus_dir, output_directory, use_mp=True) c.initialize_corpus(dictionary, default_feature_config) for speaker in c.speakers.values(): data = speaker.dictionary.data() - assert dictionary.silences == data.silences - assert dictionary.multilingual_ipa == data.multilingual_ipa - assert 
dictionary.words_mapping == data.words_mapping - assert dictionary.punctuation == data.punctuation - assert dictionary.clitic_markers == data.clitic_markers - assert dictionary.oov_int == data.oov_int - assert dictionary.words == data.words + assert speaker.dictionary.config.silence_phones == data.dictionary_config.silence_phones + assert ( + speaker.dictionary.config.multilingual_ipa == data.dictionary_config.multilingual_ipa + ) + assert speaker.dictionary.words_mapping == data.words_mapping + assert speaker.dictionary.config.punctuation == data.dictionary_config.punctuation + assert speaker.dictionary.config.clitic_markers == data.dictionary_config.clitic_markers + assert speaker.dictionary.oov_int == data.oov_int + assert speaker.dictionary.words == data.words assert c.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_basic_txt(basic_corpus_txt_dir, basic_dict_path, generated_dir, default_feature_config): output_directory = os.path.join(generated_dir, "basic") if os.path.exists(output_directory): shutil.rmtree(output_directory, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(generated_dir, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(generated_dir, "basic")) dictionary.write() c = Corpus(basic_corpus_txt_dir, output_directory, use_mp=False) print(c.no_transcription_files) assert len(c.no_transcription_files) == 0 c.initialize_corpus(dictionary, default_feature_config) assert c.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_alignable_from_temp( basic_corpus_txt_dir, basic_dict_path, generated_dir, default_feature_config ): - dictionary = Dictionary(basic_dict_path, os.path.join(generated_dir, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(generated_dir, "basic")) dictionary.write() output_directory = os.path.join(generated_dir, "basic") if os.path.exists(output_directory): @@ -94,13 +94,12 @@ def test_alignable_from_temp( assert 
len(c.no_transcription_files) == 0 c.initialize_corpus(dictionary, default_feature_config) assert c.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_transcribe_from_temp( basic_corpus_txt_dir, basic_dict_path, generated_dir, default_feature_config ): - dictionary = Dictionary(basic_dict_path, os.path.join(generated_dir, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(generated_dir, "basic")) dictionary.write() output_directory = os.path.join(generated_dir, "basic") if os.path.exists(output_directory): @@ -112,7 +111,6 @@ def test_transcribe_from_temp( c = Corpus(basic_corpus_txt_dir, output_directory, use_mp=False) c.initialize_corpus(dictionary, default_feature_config) assert c.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_extra(sick_dict, extra_corpus_dir, generated_dir): @@ -127,12 +125,11 @@ def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir, default_feature_co temp = os.path.join(temp_dir, "stereo") if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) dictionary.write() d = Corpus(stereo_corpus_dir, temp, use_mp=False) d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_stereo_short_tg( @@ -141,24 +138,22 @@ def test_stereo_short_tg( temp = os.path.join(temp_dir, "stereo_tg") if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) dictionary.write() d = Corpus(stereo_corpus_short_tg_dir, temp, use_mp=False) d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_flac(basic_dict_path, flac_corpus_dir, temp_dir, 
default_feature_config): temp = os.path.join(temp_dir, "flac") if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) dictionary.write() d = Corpus(flac_corpus_dir, temp, use_mp=False) d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_audio_directory(basic_dict_path, basic_split_dir, temp_dir, default_feature_config): @@ -166,7 +161,7 @@ def test_audio_directory(basic_dict_path, basic_split_dir, temp_dir, default_fea audio_dir, text_dir = basic_split_dir if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) dictionary.write() d = Corpus(text_dir, temp, use_mp=False, audio_directory=audio_dir) assert len(d.no_transcription_files) == 0 @@ -176,71 +171,65 @@ def test_audio_directory(basic_dict_path, basic_split_dir, temp_dir, default_fea if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) dictionary.write() d = Corpus(text_dir, temp, use_mp=True, audio_directory=audio_dir) assert len(d.no_transcription_files) == 0 assert len(d.files) > 0 d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_flac_mp(basic_dict_path, flac_corpus_dir, temp_dir, default_feature_config): temp = os.path.join(temp_dir, "flac") if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) 
dictionary.write() d = Corpus(flac_corpus_dir, temp, use_mp=True) d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_flac_tg(basic_dict_path, flac_tg_corpus_dir, temp_dir, default_feature_config): temp = os.path.join(temp_dir, "flac") if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) dictionary.write() d = Corpus(flac_tg_corpus_dir, temp, use_mp=False) d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_flac_tg_mp(basic_dict_path, flac_tg_corpus_dir, temp_dir, default_feature_config): temp = os.path.join(temp_dir, "flac") if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) dictionary.write() d = Corpus(flac_tg_corpus_dir, temp, use_mp=True) d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_flac_tg_transcribe(basic_dict_path, flac_tg_corpus_dir, temp_dir, default_feature_config): temp = os.path.join(temp_dir, "flac_tg") if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) dictionary.write() d = Corpus(flac_tg_corpus_dir, temp, use_mp=False) d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, 
os.path.join(temp, "basic")) dictionary.write() d = Corpus(flac_tg_corpus_dir, temp, use_mp=True) d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_flac_transcribe( @@ -249,22 +238,20 @@ def test_flac_transcribe( temp = os.path.join(temp_dir, "flac_transcribe") if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) dictionary.write() d = Corpus(flac_transcribe_corpus_dir, temp, use_mp=True) d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary(basic_dict_path, os.path.join(temp, "basic")) dictionary.write() d = Corpus(flac_transcribe_corpus_dir, temp, use_mp=False) d.initialize_corpus(dictionary, default_feature_config) assert d.get_feat_dim() == 39 - dictionary.cleanup_logger() def test_24bit_wav(transcribe_corpus_24bit_dir, temp_dir, default_feature_config): @@ -284,7 +271,7 @@ def test_short_segments( temp = os.path.join(temp_dir, "short_segments") if os.path.exists(temp): shutil.rmtree(temp, ignore_errors=True) - dictionary = Dictionary(basic_dict_path, temp) + dictionary = MultispeakerDictionary(basic_dict_path, temp) dictionary.write() corpus = Corpus(shortsegments_corpus_dir, temp, use_mp=False) corpus.initialize_corpus(dictionary, default_feature_config) @@ -293,69 +280,65 @@ def test_short_segments( assert len([x for x in corpus.utterances.values() if x.features is not None]) == 1 assert len([x for x in corpus.utterances.values() if x.ignored]) == 2 assert len([x for x in corpus.utterances.values() if x.features is None]) == 2 - dictionary.cleanup_logger() def test_speaker_groupings( - 
large_prosodylab_format_directory, temp_dir, large_dataset_dictionary, default_feature_config + multilingual_ipa_corpus_dir, temp_dir, english_us_ipa_dictionary, default_feature_config ): - output_directory = os.path.join(temp_dir, "large") + output_directory = os.path.join(temp_dir, "speaker_groupings") if os.path.exists(output_directory): shutil.rmtree(output_directory, ignore_errors=True) - dictionary = Dictionary(large_dataset_dictionary, output_directory) + dictionary = MultispeakerDictionary(english_us_ipa_dictionary, output_directory) dictionary.write() - c = Corpus(large_prosodylab_format_directory, output_directory, use_mp=False) + c = Corpus(multilingual_ipa_corpus_dir, output_directory, use_mp=True) c.initialize_corpus(dictionary, default_feature_config) - speakers = os.listdir(large_prosodylab_format_directory) + speakers = os.listdir(multilingual_ipa_corpus_dir) for s in speakers: assert any(s in x.speakers for x in c.jobs) - for _, _, files in os.walk(large_prosodylab_format_directory): + for _, _, files in os.walk(multilingual_ipa_corpus_dir): for f in files: name, ext = os.path.splitext(f) assert name in c.files shutil.rmtree(output_directory, ignore_errors=True) dictionary.write() - c = Corpus(large_prosodylab_format_directory, output_directory, num_jobs=2, use_mp=False) + c = Corpus(multilingual_ipa_corpus_dir, output_directory, num_jobs=1, use_mp=True) c.initialize_corpus(dictionary, default_feature_config) for s in speakers: assert any(s in x.speakers for x in c.jobs) - for _, _, files in os.walk(large_prosodylab_format_directory): + for _, _, files in os.walk(multilingual_ipa_corpus_dir): for f in files: name, ext = os.path.splitext(f) assert name in c.files - dictionary.cleanup_logger() - def test_subset( - large_prosodylab_format_directory, temp_dir, large_dataset_dictionary, default_feature_config + multilingual_ipa_corpus_dir, temp_dir, english_us_ipa_dictionary, default_feature_config ): output_directory = os.path.join(temp_dir, 
"large_subset") shutil.rmtree(output_directory, ignore_errors=True) - dictionary = Dictionary(large_dataset_dictionary, output_directory) + dictionary = MultispeakerDictionary(english_us_ipa_dictionary, output_directory) dictionary.write() - c = Corpus(large_prosodylab_format_directory, output_directory, use_mp=False) + c = Corpus(multilingual_ipa_corpus_dir, output_directory, use_mp=False) c.initialize_corpus(dictionary, default_feature_config) sd = c.split_directory - s = c.subset_directory(10) + s = c.subset_directory(5) assert os.path.exists(sd) assert os.path.exists(s) - dictionary.cleanup_logger() def test_weird_words(weird_words_dir, temp_dir, sick_dict_path): output_directory = os.path.join(temp_dir, "weird_words") shutil.rmtree(output_directory, ignore_errors=True) - dictionary = Dictionary(sick_dict_path, output_directory) - assert "i’m" not in dictionary.words - assert "’m" not in dictionary.words - assert dictionary.words["i'm"][0]["pronunciation"] == ("ay", "m", "ih") - assert dictionary.words["i'm"][1]["pronunciation"] == ("ay", "m") - assert dictionary.words["'m"][0]["pronunciation"] == ("m",) + dictionary = MultispeakerDictionary(sick_dict_path, output_directory) + assert "i’m" not in dictionary.default_dictionary.words + assert "’m" not in dictionary.default_dictionary.words + assert dictionary.default_dictionary.words["i'm"][0]["pronunciation"] == ("ay", "m", "ih") + assert dictionary.default_dictionary.words["i'm"][1]["pronunciation"] == ("ay", "m") + assert dictionary.default_dictionary.words["'m"][0]["pronunciation"] == ("m",) dictionary.write() c = Corpus(weird_words_dir, output_directory, use_mp=False) c.initialize_corpus(dictionary) @@ -367,65 +350,62 @@ def test_weird_words(weird_words_dir, temp_dir, sick_dict_path): dictionary.set_word_set(c.word_set) for w in ["i'm", "this'm", "sdsdsds'm", "'m"]: - _ = dictionary.to_int(w) + _ = dictionary.default_dictionary.to_int(w) print(dictionary.oovs_found) assert "'m" not in dictionary.oovs_found - 
dictionary.cleanup_logger() def test_punctuated(punctuated_dir, temp_dir, sick_dict_path): output_directory = os.path.join(temp_dir, "punctuated") shutil.rmtree(output_directory, ignore_errors=True) - dictionary = Dictionary(sick_dict_path, output_directory) + dictionary = MultispeakerDictionary(sick_dict_path, output_directory) dictionary.write() - c = Corpus(punctuated_dir, output_directory, use_mp=False) + c = Corpus(punctuated_dir, output_directory, dictionary_config=dictionary.config, use_mp=False) c.initialize_corpus(dictionary) assert ( c.utterances["punctuated-punctuated"].text == "oh yes they they you know they love her and so i mean" ) - dictionary.cleanup_logger() def test_alternate_punctuation( punctuated_dir, temp_dir, sick_dict_path, different_punctuation_config ): - train_config, align_config = train_yaml_to_config(different_punctuation_config) + train_config, align_config, dictionary_config = train_yaml_to_config( + different_punctuation_config + ) output_directory = os.path.join(temp_dir, "punctuated") shutil.rmtree(output_directory, ignore_errors=True) - print(align_config.punctuation) - dictionary = Dictionary(sick_dict_path, output_directory, punctuation=align_config.punctuation) + print(dictionary_config.punctuation) + dictionary = MultispeakerDictionary(sick_dict_path, output_directory, dictionary_config) dictionary.write() c = Corpus( - punctuated_dir, output_directory, use_mp=False, punctuation=align_config.punctuation + punctuated_dir, + output_directory, + dictionary_config, + use_mp=False, ) - print(c.punctuation) c.initialize_corpus(dictionary) assert ( c.utterances["punctuated-punctuated"].text == "oh yes, they they, you know, they love her and so i mean" ) - dictionary.cleanup_logger() def test_xsampa_corpus( xsampa_corpus_dir, xsampa_dict_path, temp_dir, generated_dir, different_punctuation_config ): - train_config, align_config = train_yaml_to_config(different_punctuation_config) + train_config, align_config, dictionary_config = 
train_yaml_to_config( + different_punctuation_config + ) output_directory = os.path.join(temp_dir, "xsampa") shutil.rmtree(output_directory, ignore_errors=True) - print(align_config.punctuation) - dictionary = Dictionary( - xsampa_dict_path, output_directory, punctuation=align_config.punctuation - ) + print(dictionary_config.punctuation) + dictionary = MultispeakerDictionary(xsampa_dict_path, output_directory, dictionary_config) dictionary.write() - c = Corpus( - xsampa_corpus_dir, output_directory, use_mp=False, punctuation=align_config.punctuation - ) - print(c.punctuation) + c = Corpus(xsampa_corpus_dir, output_directory, dictionary_config, use_mp=False) c.initialize_corpus(dictionary) assert ( c.utterances["xsampa-michael"].text == r"@bUr\tOU {bstr\{kt {bSaIr\ Abr\utseIzi {br\@geItIN @bor\n {b3kr\Ambi {bI5s@`n Ar\g thr\Ip@5eI Ar\dvAr\k".lower() ) - dictionary.cleanup_logger() diff --git a/tests/test_dict.py b/tests/test_dict.py index bab172a4..a98c20eb 100644 --- a/tests/test_dict.py +++ b/tests/test_dict.py @@ -1,12 +1,8 @@ import os -from montreal_forced_aligner.dictionary import ( - Dictionary, - MultispeakerDictionary, - parse_ipa, - sanitize, -) -from montreal_forced_aligner.textgrid import split_clitics +from montreal_forced_aligner.config.dictionary_config import DictionaryConfig +from montreal_forced_aligner.config.train_config import train_yaml_to_config +from montreal_forced_aligner.dictionary import MultispeakerDictionary, PronunciationDictionary def ListLines(path): @@ -21,10 +17,10 @@ def ListLines(path): def test_basic(basic_dict_path, generated_dir): - d = Dictionary(basic_dict_path, os.path.join(generated_dir, "basic")) + d = PronunciationDictionary(basic_dict_path, os.path.join(generated_dir, "basic")) d.write() - assert set(d.phones) == {"sil", "sp", "spn", "phonea", "phoneb", "phonec"} - assert set(d.positional_nonsil_phones) == { + assert set(d.config.phones) == {"sil", "sp", "spn", "phonea", "phoneb", "phonec"} + assert 
set(d.config.kaldi_non_silence_phones) == { "phonea_B", "phonea_I", "phonea_E", @@ -41,28 +37,27 @@ def test_basic(basic_dict_path, generated_dir): def test_extra_annotations(extra_annotations_path, generated_dir): - d = Dictionary(extra_annotations_path, os.path.join(generated_dir, "extra")) + d = PronunciationDictionary(extra_annotations_path, os.path.join(generated_dir, "extra")) assert "{" in d.graphemes d.write() def test_basic_noposition(basic_dict_path, generated_dir): - d = Dictionary( - basic_dict_path, os.path.join(generated_dir, "basic"), position_dependent_phones=False - ) + config = DictionaryConfig(position_dependent_phones=False) + d = PronunciationDictionary(basic_dict_path, os.path.join(generated_dir, "basic"), config) d.write() - assert set(d.phones) == {"sil", "sp", "spn", "phonea", "phoneb", "phonec"} + assert set(d.config.phones) == {"sil", "sp", "spn", "phonea", "phoneb", "phonec"} def test_frclitics(frclitics_dict_path, generated_dir): - d = Dictionary(frclitics_dict_path, os.path.join(generated_dir, "frclitics")) + d = PronunciationDictionary(frclitics_dict_path, os.path.join(generated_dir, "frclitics")) d.write() data = d.data() - assert d.silences == data.silences - assert d.multilingual_ipa == data.multilingual_ipa + assert d.silences == data.dictionary_config.silence_phones + assert d.config.multilingual_ipa == data.dictionary_config.multilingual_ipa assert d.words_mapping == data.words_mapping - assert d.punctuation == data.punctuation - assert d.clitic_markers == data.clitic_markers + assert d.config.punctuation == data.dictionary_config.punctuation + assert d.config.clitic_markers == data.dictionary_config.clitic_markers assert d.oov_int == data.oov_int assert d.words == data.words assert not d.check_word("aujourd") @@ -70,56 +65,16 @@ def test_frclitics(frclitics_dict_path, generated_dir): assert d.check_word("m'appelle") assert not d.check_word("purple-people-eater") assert d.split_clitics("aujourd") == ["aujourd"] - assert 
split_clitics( - "aujourd", d.words_mapping, d.clitic_set, d.clitic_markers, d.compound_markers - ) == ["aujourd"] assert d.split_clitics("aujourd'hui") == ["aujourd'hui"] - assert split_clitics( - "aujourd'hui", d.words_mapping, d.clitic_set, d.clitic_markers, d.compound_markers - ) == ["aujourd'hui"] assert d.split_clitics("vingt-six") == ["vingt", "six"] - assert split_clitics( - "vingt-six", d.words_mapping, d.clitic_set, d.clitic_markers, d.compound_markers - ) == ["vingt", "six"] assert d.split_clitics("m'appelle") == ["m'", "appelle"] - assert split_clitics( - "m'appelle", d.words_mapping, d.clitic_set, d.clitic_markers, d.compound_markers - ) == ["m'", "appelle"] assert d.split_clitics("m'm'appelle") == ["m'", "m'", "appelle"] - assert split_clitics( - "m'm'appelle", d.words_mapping, d.clitic_set, d.clitic_markers, d.compound_markers - ) == ["m'", "m'", "appelle"] assert d.split_clitics("c'est") == ["c'est"] - assert split_clitics( - "c'est", d.words_mapping, d.clitic_set, d.clitic_markers, d.compound_markers - ) == ["c'est"] assert d.split_clitics("m'c'est") == ["m'", "c'est"] - assert split_clitics( - "m'c'est", d.words_mapping, d.clitic_set, d.clitic_markers, d.compound_markers - ) == ["m'", "c'est"] assert d.split_clitics("purple-people-eater") == ["purple", "people", "eater"] - assert split_clitics( - "purple-people-eater", d.words_mapping, d.clitic_set, d.clitic_markers, d.compound_markers - ) == ["purple", "people", "eater"] assert d.split_clitics("m'appele") == ["m'", "appele"] - assert split_clitics( - "m'appele", d.words_mapping, d.clitic_set, d.clitic_markers, d.compound_markers - ) == ["m'", "appele"] assert d.split_clitics("m'ving-sic") == ["m'", "ving", "sic"] - assert split_clitics( - "m'ving-sic", d.words_mapping, d.clitic_set, d.clitic_markers, d.compound_markers - ) == ["m'", "ving", "sic"] assert d.split_clitics("flying'purple-people-eater") == ["flying'purple", "people", "eater"] - assert ( - split_clitics( - 
"flying'purple-people-eater", - d.words_mapping, - d.clitic_set, - d.clitic_markers, - d.compound_markers, - ) - == ["flying'purple", "people", "eater"] - ) assert d.to_int("aujourd") == [d.oov_int] assert d.to_int("aujourd'hui") == [d.words_mapping["aujourd'hui"]] @@ -138,9 +93,11 @@ def test_frclitics(frclitics_dict_path, generated_dir): assert d.to_int("flying'purple-people-eater") == [d.oov_int] -def test_english_clitics(english_pretrained_dictionary, generated_dir): - d = Dictionary( - english_pretrained_dictionary, os.path.join(generated_dir, "english_clitic_test") +def test_english_clitics(english_dictionary, generated_dir, basic_dictionary_config): + d = PronunciationDictionary( + english_dictionary, + os.path.join(generated_dir, "english_clitic_test"), + basic_dictionary_config, ) d.write() assert d.split_clitics("l'orme's") == ["l'", "orme's"] @@ -148,48 +105,58 @@ def test_english_clitics(english_pretrained_dictionary, generated_dir): assert d.to_int("l'orme's") == [d.words_mapping["l'"], d.words_mapping["orme's"]] -def test_devanagari(): +def test_devanagari(basic_dictionary_config): test_cases = ["हैं", "हूं", "हौं"] for tc in test_cases: - assert tc == sanitize(tc) + assert tc == basic_dictionary_config.sanitize(tc) -def test_japanese(): - assert "かぎ括弧" == sanitize("「かぎ括弧」") - assert "二重かぎ括弧" == sanitize("『二重かぎ括弧』") +def test_japanese(basic_dictionary_config): + assert "かぎ括弧" == basic_dictionary_config.sanitize("「かぎ括弧」") + assert "二重かぎ括弧" == basic_dictionary_config.sanitize("『二重かぎ括弧』") -def test_multilingual_ipa(): +def test_multilingual_ipa(basic_dictionary_config): input_transcription = "m æ ŋ g oʊ dʒ aɪ".split() expected = tuple("m æ ŋ ɡ o ʊ d ʒ a ɪ".split()) - assert parse_ipa(input_transcription) == expected + assert basic_dictionary_config.parse_ipa(input_transcription) == expected input_transcription = "n ɔː ɹ job_name".split() expected = tuple("n ɔ ɹ job_name".split()) - assert parse_ipa(input_transcription) == expected + assert 
basic_dictionary_config.parse_ipa(input_transcription) == expected input_transcription = "t ʌ tʃ ə b l̩".split() expected = tuple("t ʌ t ʃ ə b l".split()) - assert parse_ipa(input_transcription) == expected + assert basic_dictionary_config.parse_ipa(input_transcription) == expected + +def test_xsampa_dir(xsampa_dict_path, generated_dir, different_punctuation_config): -def test_xsampa_dir(xsampa_dict_path, generated_dir): - d = Dictionary(xsampa_dict_path, os.path.join(generated_dir, "xsampa")) + train_config, align_config, dictionary_config = train_yaml_to_config( + different_punctuation_config + ) + d = PronunciationDictionary( + xsampa_dict_path, os.path.join(generated_dir, "xsampa"), dictionary_config + ) d.write() print(d.words) - assert not d.clitic_set + assert not d.config.clitic_set assert d.split_clitics(r"r\{und") == [r"r\{und"] assert d.split_clitics("{bI5s@`n") == ["{bI5s@`n"] assert d.words[r"r\{und"] -def test_multispeaker_config(multispeaker_dictionary_config, generated_dir): +def test_multispeaker_config( + multispeaker_dictionary_config, sick_corpus, basic_dictionary_config, generated_dir +): dictionary = MultispeakerDictionary( - multispeaker_dictionary_config, os.path.join(generated_dir, "multispeaker") + multispeaker_dictionary_config, + os.path.join(generated_dir, "multispeaker"), + basic_dictionary_config, + word_set=sick_corpus.word_set, ) dictionary.write() for d in dictionary.dictionary_mapping.values(): - assert d.sil_phones.issubset(dictionary.sil_phones) - assert d.nonsil_phones.issubset(dictionary.nonsil_phones) - assert set(d.words.keys()).issubset(dictionary.words) + assert d.silences.issubset(dictionary.config.silence_phones) + assert d.config.non_silence_phones.issubset(dictionary.config.non_silence_phones) diff --git a/tests/test_g2p.py b/tests/test_g2p.py index 532dd7a8..e876d2a4 100644 --- a/tests/test_g2p.py +++ b/tests/test_g2p.py @@ -2,8 +2,8 @@ import pytest +from montreal_forced_aligner.config.dictionary_config import 
DictionaryConfig from montreal_forced_aligner.config.train_g2p_config import load_basic_train_g2p_config -from montreal_forced_aligner.dictionary import check_bracketed from montreal_forced_aligner.g2p.generator import PyniniDictionaryGenerator, clean_up_word from montreal_forced_aligner.g2p.trainer import G2P_DISABLED, PyniniTrainer from montreal_forced_aligner.models import G2PModel @@ -21,13 +21,15 @@ def test_check_bracketed(): """Checks if the brackets are removed correctly and handling an empty string works""" word_set = ["uh", "(the)", "sick", "", "[a]", "{cold}", ""] expected_result = ["uh", "sick", ""] - assert [x for x in word_set if not check_bracketed(x)] == expected_result + dictionary_config = DictionaryConfig() + assert [x for x in word_set if not dictionary_config.check_bracketed(x)] == expected_result def test_training(sick_dict, sick_g2p_model_path, temp_dir): if G2P_DISABLED: pytest.skip("No Pynini found") - train_config = load_basic_train_g2p_config() + train_config, dictionary_config = load_basic_train_g2p_config() + sick_dict = sick_dict.default_dictionary train_config.random_starts = 1 train_config.max_iterations = 5 trainer = PyniniTrainer( @@ -39,16 +41,19 @@ def test_training(sick_dict, sick_g2p_model_path, temp_dir): model = G2PModel(sick_g2p_model_path, root_directory=temp_dir) assert model.meta["version"] == get_mfa_version() assert model.meta["architecture"] == "pynini" - assert model.meta["phones"] == sick_dict.nonsil_phones + assert model.meta["phones"] == sick_dict.config.non_silence_phones def test_generator(sick_g2p_model_path, sick_corpus, g2p_sick_output): if G2P_DISABLED: pytest.skip("No Pynini found") model = G2PModel(sick_g2p_model_path) + dictionary_config = DictionaryConfig() assert not model.validate(sick_corpus.word_set) - assert model.validate([x for x in sick_corpus.word_set if not check_bracketed(x)]) + assert model.validate( + [x for x in sick_corpus.word_set if not dictionary_config.check_bracketed(x)] + ) gen = 
PyniniDictionaryGenerator(model, sick_corpus.word_set) gen.output(g2p_sick_output) assert os.path.exists(g2p_sick_output) diff --git a/tests/test_gui.py b/tests/test_gui.py index 027cc529..4bc815cb 100644 --- a/tests/test_gui.py +++ b/tests/test_gui.py @@ -1,22 +1,34 @@ import os from montreal_forced_aligner.corpus import Corpus -from montreal_forced_aligner.dictionary import Dictionary +from montreal_forced_aligner.dictionary import MultispeakerDictionary -def test_save_text_lab(basic_dict_path, basic_corpus_dir, generated_dir, default_feature_config): - dictionary = Dictionary(basic_dict_path, os.path.join(generated_dir, "basic")) +def test_save_text_lab( + basic_dict_path, + basic_corpus_dir, + generated_dir, + default_feature_config, + basic_dictionary_config, +): + dictionary = MultispeakerDictionary( + basic_dict_path, os.path.join(generated_dir, "basic"), basic_dictionary_config + ) dictionary.write() output_directory = os.path.join(generated_dir, "basic") - c = Corpus(basic_corpus_dir, output_directory, use_mp=True) + c = Corpus(basic_corpus_dir, output_directory, basic_dictionary_config, use_mp=True) c.initialize_corpus(dictionary) c.files["acoustic_corpus"].save() -def test_flac_tg(basic_dict_path, flac_tg_corpus_dir, temp_dir, default_feature_config): +def test_flac_tg( + basic_dict_path, flac_tg_corpus_dir, temp_dir, default_feature_config, basic_dictionary_config +): temp = os.path.join(temp_dir, "flac_tg_corpus") - dictionary = Dictionary(basic_dict_path, os.path.join(temp, "basic")) + dictionary = MultispeakerDictionary( + basic_dict_path, os.path.join(temp, "basic"), basic_dictionary_config + ) dictionary.write() - c = Corpus(flac_tg_corpus_dir, temp, use_mp=False) + c = Corpus(flac_tg_corpus_dir, temp, basic_dictionary_config, use_mp=False) c.initialize_corpus(dictionary) c.files["61-70968-0000"].save() diff --git a/tests/test_textgrid.py b/tests/test_textgrid.py index e79a964c..f2ab50de 100644 --- a/tests/test_textgrid.py +++ 
b/tests/test_textgrid.py @@ -1,8 +1,15 @@ -from montreal_forced_aligner.config.base_config import DEFAULT_STRIP_DIACRITICS -from montreal_forced_aligner.textgrid import CtmInterval, map_to_original_pronunciation +import os +from montreal_forced_aligner.dictionary import PronunciationDictionary +from montreal_forced_aligner.models import DictionaryModel +from montreal_forced_aligner.textgrid import CtmInterval -def test_mapping(): + +def test_mapping(english_us_ipa_dictionary, generated_dir, basic_dictionary_config): + output_directory = os.path.join(generated_dir, "ipa_temp") + d = PronunciationDictionary( + DictionaryModel(english_us_ipa_dictionary), output_directory, basic_dictionary_config + ) u = "utt" cur_phones = [ CtmInterval(2.25, 2.33, "t", u), @@ -38,7 +45,7 @@ def test_mapping(): } ], ] - new_phones = map_to_original_pronunciation(cur_phones, subprons, DEFAULT_STRIP_DIACRITICS) + new_phones = d.data().map_to_original_pronunciation(cur_phones, subprons) assert new_phones == [ CtmInterval(2.25, 2.43, "tʃ", u), CtmInterval(2.43, 2.55, "æ", u), diff --git a/tox.ini b/tox.ini index c2b61d97..1db35516 100644 --- a/tox.ini +++ b/tox.ini @@ -70,7 +70,7 @@ skip_install=true conda_env = rtd_environment.yml commands = interrogate -v --config {toxinidir}/pyproject.toml - sphinx-build -E -a -n -T -b html docs/source docs/build + sphinx-build -v -E -a -n -T -b html docs/source docs/build [testenv:manifest] basepython = python3.8