From 8abae27a20ea4e31ff64f086677e43941aee2fb5 Mon Sep 17 00:00:00 2001 From: Michael McAuliffe Date: Fri, 8 Oct 2021 00:00:42 -0700 Subject: [PATCH] Model export fixes (#338) Update model export and better support for xsampa orthography --- docs/source/changelog.rst | 8 + docs/source/commands.rst | 1 + docs/source/configuration.rst | 7 +- montreal_forced_aligner/__init__.py | 2 +- montreal_forced_aligner/aligner/pretrained.py | 5 +- montreal_forced_aligner/command_line/mfa.py | 68 ++- montreal_forced_aligner/config/__init__.py | 29 +- .../config/train_config.py | 5 +- montreal_forced_aligner/dictionary.py | 12 +- montreal_forced_aligner/features/config.py | 5 +- montreal_forced_aligner/models.py | 16 +- .../multiprocessing/__init__.py | 4 +- .../multiprocessing/alignment.py | 70 ++- montreal_forced_aligner/thirdparty/kaldi.py | 2 +- montreal_forced_aligner/trainers/base.py | 2 +- montreal_forced_aligner/trainers/sat.py | 10 +- tests/conftest.py | 17 + tests/data/dictionaries/xsampa.txt | 412 ++++++++++++++++++ tests/data/lab/xsampa.lab | 1 + tests/test_corpus.py | 14 + tests/test_dict.py | 10 + 21 files changed, 654 insertions(+), 46 deletions(-) create mode 100644 tests/data/dictionaries/xsampa.txt create mode 100644 tests/data/lab/xsampa.lab diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 75c60bd1..02d2919a 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -8,6 +8,14 @@ Changelog ========= +2.0.0b3 +------- + +- Fixed a bug involving non-escaped orthographic characters +- Improved SAT alignment with speaker-independent alignment model +- Fixed a bug where models would not function properly if they were renamed +- Added a history subcommand to list previous commands + 2.0.0b1 ------- diff --git a/docs/source/commands.rst b/docs/source/commands.rst index bbbcc733..12f39e29 100644 --- a/docs/source/commands.rst +++ b/docs/source/commands.rst @@ -55,6 +55,7 @@ Other utilities "download", "Download a model trained by 
MFA developers", :ref:`pretrained_models` "thirdparty", "Download and validate new third party binaries", :ref:`installation` "configure", "Configure MFA to use customized defaults for command line arguments", :ref:`configuration` + "history", "List previous MFA commands run locally", Grapheme-to-phoneme diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 5e8142ad..8be4f1fc 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -74,13 +74,8 @@ Options available: Display help message for the command - - -Configuration of commands -========================= - .. toctree:: - :maxdepth: 1 + :maxdepth: 2 configuration_align.rst configuration_transcription.rst diff --git a/montreal_forced_aligner/__init__.py b/montreal_forced_aligner/__init__.py index 3b6dfffb..f1a04455 100644 --- a/montreal_forced_aligner/__init__.py +++ b/montreal_forced_aligner/__init__.py @@ -1,6 +1,6 @@ __ver_major__ = 2 __ver_minor__ = 0 -__ver_patch__ = '0b2' +__ver_patch__ = '0b3' __version__ = "{}.{}.{}".format(__ver_major__, __ver_minor__, __ver_patch__) __all__ = ['aligner', 'command_line', 'models', 'corpus', 'config', 'dictionary', 'exceptions', diff --git a/montreal_forced_aligner/aligner/pretrained.py b/montreal_forced_aligner/aligner/pretrained.py index 152c5a89..2f71070c 100644 --- a/montreal_forced_aligner/aligner/pretrained.py +++ b/montreal_forced_aligner/aligner/pretrained.py @@ -86,15 +86,18 @@ def align(self, subset=None): log_dir = os.path.join(self.align_directory, 'log') os.makedirs(log_dir, exist_ok=True) + self.logger.info('Performing first-pass alignment...') align('final', self.align_directory, self.align_config.data_directory, self.dictionary.optional_silence_csl, - self.corpus.num_jobs, self.align_config) + self.corpus.num_jobs, self.align_config, speaker_independent=True) unaligned, average_log_like = compile_information(self.align_directory, self.corpus, self.corpus.num_jobs, self) self.logger.debug(f'Prior to SAT, 
average per frame likelihood (this might not actually mean anything): {average_log_like}') if not self.align_config.disable_sat and self.acoustic_model.feature_config.fmllr \ and not os.path.exists(os.path.join(self.align_directory, 'trans.0')): calc_fmllr(self.align_directory, self.align_config.data_directory, self.dictionary.optional_silence_csl, self.corpus.num_jobs, self.align_config, initial=True, iteration='final') + + self.logger.info('Performing second-pass alignment...') align('final', self.align_directory, self.align_config.data_directory, self.dictionary.optional_silence_csl, self.corpus.num_jobs, self.align_config) diff --git a/montreal_forced_aligner/command_line/mfa.py b/montreal_forced_aligner/command_line/mfa.py index 35d0a842..19d39385 100644 --- a/montreal_forced_aligner/command_line/mfa.py +++ b/montreal_forced_aligner/command_line/mfa.py @@ -1,7 +1,9 @@ +import atexit import sys import os import time import argparse +from datetime import datetime import multiprocessing as mp from montreal_forced_aligner import __version__ @@ -23,9 +25,53 @@ from montreal_forced_aligner.command_line.train_dictionary import run_train_dictionary from montreal_forced_aligner.command_line.create_segments import run_create_segments from montreal_forced_aligner.exceptions import MFAError -from montreal_forced_aligner.config import update_global_config, load_global_config, update_command_history +from montreal_forced_aligner.config import update_global_config, load_global_config, update_command_history, load_command_history + + +class ExitHooks(object): + def __init__(self): + self.exit_code = None + self.exception = None + + def hook(self): + self._orig_exit = sys.exit + sys.exit = self.exit + sys.excepthook = self.exc_handler + + def exit(self, code=0): + self.exit_code = code + self._orig_exit(code) + + def exc_handler(self, exc_type, exc, *args): + self.exception = exc + +hooks = ExitHooks() +hooks.hook() BEGIN = time.time() +BEGIN_DATE = datetime.now() + + +def 
history_save_handler(): + history_data = { + 'command': ' '.join(sys.argv), + 'execution_time': time.time() - BEGIN, + 'date': BEGIN_DATE, + 'version': __version__ + } + + if hooks.exit_code is not None: + history_data['exit_code'] = hooks.exit_code + history_data['exception'] = '' + elif hooks.exception is not None: + history_data['exit_code'] = 1 + history_data['exception'] = hooks.exception + else: + history_data['exception'] = '' + history_data['exit_code'] = 0 + update_command_history(history_data) + +atexit.register(history_save_handler) def fix_path(): from montreal_forced_aligner.config import TEMP_DIR @@ -295,6 +341,11 @@ def add_global_options(subparser, textgrid_output=False): "silences and recombines compound words and clitics", action='store_true') + history_parser = subparsers.add_parser('history') + + history_parser.add_argument('depth', help='Number of commands to list', nargs='?', default=10) + history_parser.add_argument('--verbose', help="Flag for whether to output additional information", action='store_true') + annotator_parser = subparsers.add_parser('annotator') anchor_parser = subparsers.add_parser('anchor') @@ -391,6 +442,21 @@ def main(): update_global_config(args) global GLOBAL_CONFIG GLOBAL_CONFIG = load_global_config() + elif args.subcommand == 'history': + depth = args.depth + history = load_command_history()[-depth:] + for h in history: + if args.verbose: + print('command\tDate\tExecution time\tVersion\tExit code\tException') + for h in history: + execution_time = time.strftime('%H:%M:%S', time.gmtime(h['execution_time'])) + d = h['date'].isoformat() + print(f"{h['command']}\t{d}\t{execution_time}\t{h['version']}\t{h['exit_code']}\t{h['exception']}") + pass + else: + for h in history: + print(h['command']) + elif args.subcommand == 'version': print(__version__) except MFAError as e: diff --git a/montreal_forced_aligner/config/__init__.py b/montreal_forced_aligner/config/__init__.py index 2ab150b2..0ed9f60b 100644 --- 
a/montreal_forced_aligner/config/__init__.py +++ b/montreal_forced_aligner/config/__init__.py @@ -16,11 +16,32 @@ def generate_config_path(): return os.path.join(TEMP_DIR, 'global_config.yaml') def generate_command_history_path(): - return os.path.join(TEMP_DIR, 'command_history') + return os.path.join(TEMP_DIR, 'command_history.yaml') -def update_command_history(command, duration, exit_code, exception): - with open(generate_command_history_path(), 'a', encoding='utf8') as f: - f.write(f'{command}\t{duration}\t{exit_code}\t{exception}\n') +def load_command_history(): + path = generate_command_history_path() + if os.path.exists(path): + with open(path, 'r', encoding='utf8') as f: + history = yaml.safe_load(f) + else: + history = [] + if not history: + history = [] + return history + + +def update_command_history(command_data): + try: + if command_data['command'].split(' ')[1] == 'history': + return + except Exception: + return + history = load_command_history() + path = generate_command_history_path() + history.append(command_data) + history = history[-50:] + with open(path, 'w', encoding='utf8') as f: + yaml.safe_dump(history, f) def update_global_config(args): global_configuration_file = generate_config_path() diff --git a/montreal_forced_aligner/config/train_config.py b/montreal_forced_aligner/config/train_config.py index 8788e503..8f4d28f8 100644 --- a/montreal_forced_aligner/config/train_config.py +++ b/montreal_forced_aligner/config/train_config.py @@ -28,8 +28,9 @@ def __init__(self, training_configs): self.compound_markers = DEFAULT_COMPOUND_MARKERS def update_from_align(self, align_config): - self.training_configs[-1].overwrite = align_config.overwrite - self.training_configs[-1].cleanup_textgrids = align_config.cleanup_textgrids + for tc in self.training_configs: + tc.overwrite = align_config.overwrite + tc.cleanup_textgrids = align_config.cleanup_textgrids def update(self, data): for k, v in data.items(): diff --git a/montreal_forced_aligner/dictionary.py 
b/montreal_forced_aligner/dictionary.py index de40a13f..3aec87ec 100644 --- a/montreal_forced_aligner/dictionary.py +++ b/montreal_forced_aligner/dictionary.py @@ -15,15 +15,9 @@ def compile_graphemes(graphemes): - if '-' in graphemes: - base = r'^\W*([-{}]+)\W*' - else: - base = r'^\W*([{}]+)\W*' - graphemes = list(graphemes) - for i in range(len(graphemes)): - if graphemes[i] == ']': - graphemes[i] = r'\]' - string = ''.join(x for x in graphemes if x != '-') + + base = r'^\W*([{}]+)\W*' + string = re.escape(''.join(graphemes)) try: return re.compile(base.format(string)) except Exception: diff --git a/montreal_forced_aligner/features/config.py b/montreal_forced_aligner/features/config.py index 25840e94..3f604301 100644 --- a/montreal_forced_aligner/features/config.py +++ b/montreal_forced_aligner/features/config.py @@ -145,7 +145,8 @@ def generate_base_features(self, corpus, logger=None, compute_cmvn=True): log_func('Calculating CMVN...') calc_cmvn(corpus) - def construct_feature_proc_string(self, data_directory, model_directory, job_name, splice=False, voiced=False, cmvn=True): + def construct_feature_proc_string(self, data_directory, model_directory, job_name, splice=False, voiced=False, + cmvn=True, speaker_independent=False): if self.directory is None: self.directory = data_directory lda_mat_path = None @@ -190,7 +191,7 @@ def construct_feature_proc_string(self, data_directory, model_directory, job_nam elif self.deltas: feats += " add-deltas ark:- ark:- |" - if fmllr_trans_path is not None: + if fmllr_trans_path is not None and not speaker_independent: if not os.path.exists(fmllr_trans_path): raise Exception('Could not find {}'.format(fmllr_trans_path)) feats += " transform-feats --utt2spk=ark:{} ark:{} ark:- ark:- |".format(utt2spk_path, fmllr_trans_path) diff --git a/montreal_forced_aligner/models.py b/montreal_forced_aligner/models.py index f17e58b3..23246f48 100644 --- a/montreal_forced_aligner/models.py +++ b/montreal_forced_aligner/models.py @@ -1,7 +1,7 
@@ import os import yaml -from shutil import copy, copyfile, rmtree, make_archive, unpack_archive +from shutil import copy, copyfile, rmtree, make_archive, unpack_archive, move from . import __version__ from .exceptions import PronunciationAcousticMismatchError @@ -31,11 +31,15 @@ def __init__(self, source, root_directory=None): if os.path.isdir(source): self.dirname = os.path.abspath(source) else: - base = root_directory self.dirname = os.path.join(root_directory, self.name) if not os.path.exists(self.dirname): os.makedirs(root_directory, exist_ok=True) - unpack_archive(source, base) + unpack_archive(source, self.dirname) + files = os.listdir(self.dirname) + old_dir_path = os.path.join(self.dirname, files[0]) + if len(files) == 1 and os.path.isdir(old_dir_path): # Backwards compatibility + for f in os.listdir(old_dir_path): + move(os.path.join(old_dir_path, f), os.path.join(self.dirname, f)) @property def meta(self): @@ -76,16 +80,16 @@ def __repr__(self): def clean_up(self): rmtree(self.dirname) - def dump(self, sink, archive_fmt=FORMAT): + def dump(self, path, archive_fmt=FORMAT): """ Write archive to disk, and return the name of final archive """ - return make_archive(sink, archive_fmt, + return make_archive(os.path.splitext(path)[0], archive_fmt, *os.path.split(self.dirname)) class AcousticModel(Archive): - files = ['final.mdl', 'final.occs', 'lda.mat', 'tree'] + files = ['final.mdl', 'final.alimdl', 'final.occs', 'lda.mat', 'tree'] def add_meta_file(self, aligner): with open(os.path.join(self.dirname, 'meta.yaml'), 'w', encoding='utf8') as f: yaml.dump(aligner.meta, f) diff --git a/montreal_forced_aligner/multiprocessing/__init__.py b/montreal_forced_aligner/multiprocessing/__init__.py index 3a8b4c35..0f2ede8a 100644 --- a/montreal_forced_aligner/multiprocessing/__init__.py +++ b/montreal_forced_aligner/multiprocessing/__init__.py @@ -1,6 +1,8 @@ from .helper import run_mp, run_non_mp, Stopped, Counter from .alignment import align, 
compute_alignment_improvement, convert_ali_to_textgrids, compile_information, acc_stats, \ - lda_acc_stats, mono_align_equal, compile_train_graphs, tree_stats, convert_alignments, calc_lda_mllt, calc_fmllr + lda_acc_stats, mono_align_equal, compile_train_graphs, tree_stats, convert_alignments, calc_lda_mllt, calc_fmllr, \ + create_align_model + from .transcription import transcribe, transcribe_fmllr from .ivector import gmm_gselect, acc_global_stats, acc_ivector_stats, extract_ivectors, gauss_to_post, segment_vad, \ classify_speakers diff --git a/montreal_forced_aligner/multiprocessing/alignment.py b/montreal_forced_aligner/multiprocessing/alignment.py index 9d6b634e..5d5c0155 100644 --- a/montreal_forced_aligner/multiprocessing/alignment.py +++ b/montreal_forced_aligner/multiprocessing/alignment.py @@ -138,7 +138,7 @@ def compile_train_graphs(directory, lang_directory, split_directory, num_jobs, a num_jobs : int The number of processes to use """ - aligner.logger.info('Compiling training graphs...') + aligner.logger.debug('Compiling training graphs...') begin = time.time() log_directory = os.path.join(directory, 'log') os.makedirs(log_directory, exist_ok=True) @@ -234,7 +234,8 @@ def align_func(directory, iteration, job_name, mdl, config, feature_string, outp align_proc.communicate() -def align(iteration, directory, split_directory, optional_silence, num_jobs, config, output_directory=None): +def align(iteration, directory, split_directory, optional_silence, num_jobs, config, + output_directory=None, speaker_independent=False): """ Multiprocessing function that aligns based on the current model @@ -261,17 +262,18 @@ def align(iteration, directory, split_directory, optional_silence, num_jobs, con config : :class:`~aligner.config.MonophoneConfig`, :class:`~aligner.config.TriphoneConfig` or :class:`~aligner.config.TriphoneFmllrConfig` Configuration object for training """ - config.logger.info('Performing alignment...') begin = time.time() if output_directory is None: 
output_directory = directory log_directory = os.path.join(output_directory, 'log') - mdl_path = os.path.join(directory, '{}.mdl'.format(iteration)) + align_model_path = os.path.join(directory, '{}.alimdl'.format(iteration)) + if not speaker_independent or not os.path.exists(align_model_path): + align_model_path = os.path.join(directory, '{}.mdl'.format(iteration)) if config.boost_silence != 1.0: mdl = "{} --boost={} {} {} - |".format(thirdparty_binary('gmm-boost-silence'), - config.boost_silence, optional_silence, make_path_safe(mdl_path)) + config.boost_silence, optional_silence, make_path_safe(align_model_path)) else: - mdl = mdl_path + mdl = align_model_path jobs = [(directory, iteration, x, mdl, config.align_options, config.feature_config.construct_feature_proc_string(split_directory, directory, x), @@ -345,8 +347,9 @@ def compile_information(model_directory, corpus, num_jobs, config): total_frames = sum(data['total_frames'] for data in alignment_info.values()) average_log_like = 0 for x, data in alignment_info.items(): - weight = data['total_frames'] / total_frames - average_log_like += data['log_like'] * weight + if total_frames: + weight = data['total_frames'] / total_frames + average_log_like += data['log_like'] * weight for u in data['unaligned']: unaligned[u] = 'Beam too narrow' for u in data['too_short']: @@ -1429,6 +1432,57 @@ def calc_fmllr(directory, split_directory, sil_phones, num_jobs, config, config.logger.debug(f'Fmllr calculation took {time.time() - begin}') +def acc_stats_two_feats_func(directory, model_path, feature_string, si_feature_string, job_name): + log_path = os.path.join(directory, 'log', 'align_model_est.{}.log'.format(job_name)) + acc_path = os.path.join(directory, 'align_model.{}.acc'.format(job_name)) + with open(log_path, 'w', encoding='utf8') as log_file: + ali_to_post_proc = subprocess.Popen([thirdparty_binary('ali-to-post'), + 'ark:' + os.path.join(directory, 'ali.{}'.format(job_name)), + 'ark:-'], + stderr=log_file, 
stdout=subprocess.PIPE) + acc_proc = subprocess.Popen([thirdparty_binary('gmm-acc-stats-twofeats'), model_path, + feature_string, si_feature_string, "ark,s,cs:-", acc_path], + stderr=log_file, stdin=ali_to_post_proc.stdout) + acc_proc.communicate() + + + +def create_align_model(directory, split_directory, num_jobs, config): + config.logger.info('Creating alignment model for speaker-independent features...') + begin = time.time() + log_directory = os.path.join(directory, 'log') + + model_name = 'final' + model_path = os.path.join(directory, '{}.mdl'.format(model_name)) + align_model_path = os.path.join(directory, '{}.alimdl'.format(model_name)) + jobs = [(directory, model_path, + config.feature_config.construct_feature_proc_string(split_directory, directory, x), + config.feature_config.construct_feature_proc_string(split_directory, directory, x, speaker_independent=True), + x) for x in range(num_jobs)] + if config.use_mp: + run_mp(acc_stats_two_feats_func, jobs, log_directory) + else: + run_non_mp(acc_stats_two_feats_func, jobs, log_directory) + + log_path = os.path.join(directory, 'log', 'align_model_est.final.log') + with open(log_path, 'w', encoding='utf8') as log_file: + acc_files = [os.path.join(directory, 'align_model.{}.acc'.format(x)) + for x in range(num_jobs)] + est_proc = subprocess.Popen([thirdparty_binary('gmm-est'), + "--remove-low-count-gaussians=false", '--power=' + str(config.power), + model_path, + "{} - {}|".format(thirdparty_binary('gmm-sum-accs'), + ' '.join(map(make_path_safe, acc_files))), + align_model_path], + stderr=log_file) + est_proc.communicate() + if not config.debug: + for f in acc_files: + os.remove(f) + + config.logger.debug(f'Alignment model creation took {time.time() - begin}') + + def lda_acc_stats_func(directory, feature_string, align_directory, config, ci_phones, i): log_path = os.path.join(directory, 'log', 'ali_to_post.{}.log'.format(i)) with open(log_path, 'w', encoding='utf8') as log_file: diff --git 
a/montreal_forced_aligner/thirdparty/kaldi.py b/montreal_forced_aligner/thirdparty/kaldi.py index f421f93d..1639df08 100644 --- a/montreal_forced_aligner/thirdparty/kaldi.py +++ b/montreal_forced_aligner/thirdparty/kaldi.py @@ -21,7 +21,7 @@ 'compile-train-graphs', 'compile-train-graphs-fsts', 'compose-transforms', 'compute-cmvn-stats', 'compute-mfcc-feats', 'convert-ali', 'copy-feats', 'est-lda', 'est-mllt', 'extract-segments', 'feat-to-dim', 'feat-to-len', 'gmm-acc-mllt', 'gmm-acc-stats-ali', - 'gmm-align-compiled', + 'gmm-align-compiled', 'gmm-acc-stats-twofeats', 'gmm-boost-silence', 'gmm-est', 'gmm-est-fmllr', 'gmm-info', 'gmm-init-model', 'gmm-init-mono', 'gmm-latgen-faster', 'gmm-mixup', 'gmm-sum-accs', 'gmm-transform-means', diff --git a/montreal_forced_aligner/trainers/base.py b/montreal_forced_aligner/trainers/base.py index 3bf3ede4..cebd14ba 100644 --- a/montreal_forced_aligner/trainers/base.py +++ b/montreal_forced_aligner/trainers/base.py @@ -393,4 +393,4 @@ def save(self, path, root_directory=None): if directory: os.makedirs(directory, exist_ok=True) basename, _ = os.path.splitext(path) - acoustic_model.dump(basename) + acoustic_model.dump(path) diff --git a/montreal_forced_aligner/trainers/sat.py b/montreal_forced_aligner/trainers/sat.py index 293fa2e0..8614f330 100644 --- a/montreal_forced_aligner/trainers/sat.py +++ b/montreal_forced_aligner/trainers/sat.py @@ -6,7 +6,8 @@ from ..multiprocessing import (align, compile_train_graphs, acc_stats, tree_stats, convert_alignments, - calc_fmllr, compute_alignment_improvement, compile_information) + calc_fmllr, compute_alignment_improvement, compile_information, + create_align_model) from ..helper import thirdparty_binary, make_path_safe, log_kaldi_errors, parse_logs, load_scp from ..exceptions import KaldiProcessingError @@ -111,7 +112,7 @@ def train(self, call_back=None): os.path.join(self.train_directory, 'final.mdl')) shutil.copy(os.path.join(self.train_directory, '{}.occs'.format(self.num_iterations)), 
os.path.join(self.train_directory, 'final.occs')) - + create_align_model(self.train_directory, self.corpus.split_directory(), self.corpus.num_jobs, self) if not self.debug: for i in range(1, self.num_iterations): model_path = os.path.join(self.train_directory, '{}.mdl'.format(i)) @@ -159,6 +160,9 @@ def align(self, subset, call_back=None): shutil.copy(os.path.join(self.train_directory, 'tree'), self.align_directory) shutil.copyfile(os.path.join(self.train_directory, 'final.mdl'), os.path.join(self.align_directory, 'final.mdl')) + if os.path.exists(os.path.join(self.train_directory, 'final.alimdl')): + shutil.copyfile(os.path.join(self.train_directory, 'final.alimdl'), + os.path.join(self.align_directory, 'final.alimdl')) if os.path.exists(os.path.join(self.train_directory, 'lda.mat')): shutil.copyfile(os.path.join(self.train_directory, 'lda.mat'), @@ -173,7 +177,7 @@ def align(self, subset, call_back=None): os.path.join(self.align_directory, 'trans.{}'.format(i))) align('final', self.align_directory, align_data_directory, self.dictionary.optional_silence_csl, - self.corpus.num_jobs, self, self.align_directory) + self.corpus.num_jobs, self, self.align_directory, speaker_independent=True) unaligned, average_log_like = compile_information(self.align_directory, self.corpus, self.corpus.num_jobs, self) diff --git a/tests/conftest.py b/tests/conftest.py index be67dc76..3b5d9091 100755 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -180,6 +180,18 @@ def basic_corpus_dir(corpus_root_dir, wav_dir, lab_dir): return path +@pytest.fixture(scope='session') +def xsampa_corpus_dir(corpus_root_dir, wav_dir, lab_dir): + path = os.path.join(corpus_root_dir, 'basic') + os.makedirs(path, exist_ok=True) + + s_dir = os.path.join(path, 'michael') + os.makedirs(s_dir, exist_ok=True) + shutil.copyfile(os.path.join(wav_dir, 'acoustic_corpus.wav'), os.path.join(s_dir, 'xsampa.wav')) + shutil.copyfile(os.path.join(lab_dir, 'xsampa.lab'), os.path.join(s_dir, 'xsampa.lab')) + return path + 
+ @pytest.fixture(scope='session') def basic_split_dir(corpus_root_dir, wav_dir, lab_dir, textgrid_dir): path = os.path.join(corpus_root_dir, 'split') @@ -377,6 +389,11 @@ def frclitics_dict_path(dict_dir): return os.path.join(dict_dir, 'frclitics.txt') +@pytest.fixture(scope='session') +def xsampa_dict_path(dict_dir): + return os.path.join(dict_dir, 'xsampa.txt') + + @pytest.fixture(scope='session') def expected_dict_path(dict_dir): return os.path.join(dict_dir, 'expected') diff --git a/tests/data/dictionaries/xsampa.txt b/tests/data/dictionaries/xsampa.txt new file mode 100644 index 00000000..7854c916 --- /dev/null +++ b/tests/data/dictionaries/xsampa.txt @@ -0,0 +1,412 @@ +b{T b { T +VD@` V D @` +b{t b { T +vd@` V D @` +A5OU A 5 OU +@pAstr\@fi @ p A s t r\ @ f i +bh{Ut bh {U t +kh@z kh @ z +khor\s kh or\ s +khjuz kh j u z +@m @ m +EndkwOUt E n d k w OU t +fr\IskOU f r\ I s k OU +ghEn gh E n +In3kwOUt I n 3 k w OU t +kheI kh eI +@m @ m +@n @ n +khwOUt kh w OU t +r\{Und r\ {U n d +Es E s +sINg@5kwOUt s I N g @5 k w OU t +thI5 th I 5 +thIz th I z +thwVz th w V z +lA l A +@ @ +eIz eI z +eI eI +eIz eI z +eIz eI z +eIfor\tuwVntueIt eI f or\ t u w V n t u eI t +eIeI eI eI +thr\Ip@5eI th r\ I p @5 eI +Ab3g A b 3 g +Ak@n A k @ n +Ak@n3 A k @ n 3 +A A +Ak3 A k 3 +A5iA A 5 i A +A5sET A 5 s E T +Am@t A m @ t +Ankor\ A n k or\ +Ar\dEm@ Ar\ d E m @ +Ar\dvAr\k Ar\ d v Ar\ k +Ar\dvAr\ks Ar\ d v Ar\ k s +Ar\g Ar\ g +Er\@n Er\ @ n +Er\@nz Er\ @ n z +Er\@nz Er\ @ n z +Er\@ns@n Er\ @ n s @ n +Er\@ns@nz Er\ @ n s @ n z +Ar\ti Ar\ t i +As A s +As@n A s @ n +{b { b +eIbieI eI b i eI +@bAb@ @ b A b @ +{b@k@ { b @ k @ +@b{k @ b { k +{b@kOU { b @ k OU +{b@k@s { b @ k @ s +@bAd @ b A d +@b{d@k@ @ b { d @ k @ +@b{di @ b { d i +@b{di @ b { d i +@bEr\ @ b Er\ +@bA5kIn @ b A 5 k I n +{b@5OUni { b @5 OU n i +{b@5OUniz { b @5 OU n i z +AbA5OUz A b A 5 OU z +@bE@nd@n @ b E@ n d @ n +@bE@nd@nd @ b E@ n d @ n d +@bE@nd@nIN @ b E@ n d @ n I N +@bE@nd@nm@nt @ b E@ n d @ n m @ n t +@bE@nd@nm@nts @ b 
E@ n d @ n m @ n t s +@bE@nd@nz @ b E@ n d @ n z +@bE@ntOU @ b E@ n t OU +@bAr\k@ @ b Ar\ k @ +AbAr\i A b Ar\ i +{b@sk@5 { b @ s k @5 +@b{S @ b { S +@b{St @ b { S t +@beIZj@ @ b eI Z j @ +@beIt @ b eI t +@beItId @ b eI t I d +@beItm@nt @ b eI t m @ n t +@beItm@nts @ b eI t m @ n t s +@beIts @ b eI t s +@beItIN @ b eI t I N +{b@ { b @ +@bAdOU @ b A d OU +@bAs @ b A s +AbAsi A b A s i +AbeIt A b eI t +AbAtiE5OU A b A t i E 5 OU +{bi { b i +{b@nh{Us { b @ n h {U s +@bEt @ b E t +{bvI5 { b v I 5 +{bi { b i +{biz { b i z +{bi { b i +{bIt { b I t +{b@t { b @ t +{b@tst{Un { b @ t s t {U n +{b@t { b @ t +{b@ts { b @ t s +{b@tst{Un { b @ t s t {U n +@bud @ b u d +@br\ivieIt @ b r\ i v i eI t +@br\ivieItId @ b r\ i v i eI t I d +@br\ivieIts @ b r\ i v i eI t s +@br\ivieItIN @ b r\ i v i eI t I N +@br\ivieIS@n @ b r\ i v i eI S @ n +@br\ivieIS@nz @ b r\ i v i eI S @ n z +Abr\utseIzi A b r\ u t s eI z i +{bz { b z +{bi { b i +eIbisi eI b i s i +eIbisiz eI b i s i z +{bkOU { b k OU +{bkOUtEk { b k OU t E k +eIbisiz eI b i s i z +{bd{5@ { b d { 5 @ +{bd{5@ { b d { 5 @ +{bdE5 { b d E 5 +{bdE5@ { b d E 5 @ +{bd@keIt { b d @ k eI t +{bd@keIt@d { b d @ k eI t @ d +{bd@keIts { b d @ k eI t s +{bdIkeItIN { b d I k eI t I N +{bdIkeIS@n { b d I k eI S @ n +{bdn3 { b d n 3 +{bdOU { b d OU +{bdA5@ { b d A 5 @ +{bdOUm@n { b d OU m @ n +{bdAm@n@5 { b d A m @ n @5 +{bdVkt { b d V k t +{bdVktId { b d V k t I d +{bd@kti { b d @ k t i +{bd@ktiz { b d @ k t i z +{bdVktIN { b d V k t I N +{bdVkS@n { b d V k S @ n +{bdVkS@nz { b d V k S @ n z +{bdVkt3 { b d V k t 3 +{bdVkt3z { b d V k t 3 z +{bdVkts { b d V k t s +{bdu5 { b d u 5 +{bdu5@ziz { b d u 5 @ z i z +Abdu5@ A b d u 5 @ +{bdV5@ { b d V 5 @ +eIb eI b +@bEd @ b E d +@bEdi @ b E d i +@bi @ b i +eIb@5 eI b @5 +AbE5@ A b E 5 @ +{bI53d { b I 5 3 d +@bE5 @ b E 5 +@bE5z @ b E 5 z +eIb@5 eI b @5 +@bE5@ @ b E 5 @ +{bI5n { b I 5 n +{b@5OU { b @5 OU +eIb@5z eI b @5 z +{bI5s@`n { b I 5 s @` n +{bEnd { b E n d +{bIndr\OT { b I n d r\ O T +eIb3 eI b 3 
+{b3kr\Ambi { b 3 k r\ A m b i +{b3din { b 3 d i n +eIb3f3d eI b 3 f 3 d +{b3g { b 3 g +{b3@5 { b 3 @5 +{b3mIn { b 3 m I n +{b3n{Ti { b 3 n { T i +{b3nETi { b 3 n E T i +{bEr\@nt { b Er\ @ n t +{b3eIS@n { b 3 eI S @ n +{b3eIS@n@5 { b 3 eI S @ n @5 +{b3eIS@nz { b 3 eI S @ n z +{b3t { b 3 t +@bEt @ b E t +@bEtId @ b E t I d +@bEtIN @ b E t I N +eIbEks eI b E k s +@beI@ns @ b eI @ n s +AbeIt@ A b eI t @ +{bhor\ { b h or\ +@bhor\d @ b h or\ d +@bhor\@ns @ b h or\ @ n s +{bhor\@nt { b h or\ @ n t +@bhor\z @ b h or\ z +eIbi@m eI b i @ m +eIbi@mz eI b i @ m z +eIbId eI b I d +@baId @ b aI d +@baIdId @ b aI d I d +@baIdz @ b aI d z +@baIdIN @ b aI d I N +{bIdZAn { b I dZ A n +{bi { b i +{b@geI5 { b @ g eI 5 +Abi5@ A b i 5 @ +{bI5in { b I 5 i n +@bI5@tiz @ b I 5 @ t i z +@bI5@ti @ b I 5 @ t i +{bImeI5 { b I m eI 5 +{bImeI5z { b I m eI 5 z +{bINd@n { b I N d @ n +{bINt@n { b I N t @ n +AbiOU A b i OU +AbiOU5@ A b i OU 5 @ +AbiOU5@z A b i OU 5 @ z +eIbi@mEd eI b i @ m E d +@bIkju @ b I k j u +{bItibi { b I t i b i +{bIts { b I t s +{bdZEkt { b dZ E k t +{bkAzj@ { b k A z j @ +{bkAzi@n { b k A z i @ n +{bkAzi@nz { b k A z i @ n z +@b5eIz @ b 5 eI z +eIb@5 eI b @5 +eIb@5bAdid eI b @5 b A d i d +eIb@5d eI b @5 d +eIb@53 eI b @5 3 +eIb@5z eI b @5 z +eIb@5st eI b @5 s t +@b5um @ b 5 u m +eIb5i eI b 5 i +{bnEgeIS@n { b n E g eI S @ n +{bn3 { b n 3 +{bni { b n i +{bnor\m@5 { b n or\ m @5 +{bnor\m{5@tiz { b n or\ m { 5 @ t i z +{bnor\m{5@ti { b n or\ m { 5 @ t i +{bnor\m@5i { b n or\ m @5 i +AbOU A b OU +AbOUz A b OU z +@bor\d @ b or\ d +@bOUd @ b OU d +{b@h@5im@ { b @ h @5 i m @ +@bA5IS @ b A 5 I S +@bA5ISt @ b A 5 I S t +@bA5ISIz @ b A 5 I S I z +@bA5ISIN @ b A 5 I S I N +{b@5IS@n { b @5 I S @ n +{b@5IS@nIz@m { b @5 I S @ n I z @ m +{b@5IS@n@st { b @5 I S @ n @ s t +{b@5IS@n@sts { b @5 I S @ n @ s t s +@bAm@n@b@5 @ b A m @ n @ b @5 +@bAm@neIS@n @ b A m @ n eI S @ n +@bAm@neIS@nz @ b A m @ n eI S @ n z +@bud @ b u d +@budi @ b u d i +@bor\ @ b or\ +{b3IdZ@n@5 { b 3 I dZ @ n @5 
+{b3IdZ@ni { b 3 I dZ @ n i +{b3IdZ@niz { b 3 I dZ @ n i z +@bor\n @ b or\ n +@bor\t @ b or\ t +@bor\tId @ b or\ t I d +@bor\t@feIS@nt @ b or\ t @ f eI S @ n t +@bor\t@feIS@nts @ b or\ t @ f eI S @ n t s +@bor\tIN @ b or\ t I N +@bor\S@n @ b or\ S @ n +@bor\S@nIst @ b or\ S @ n I s t +@bor\S@nIsts @ b or\ S @ n I s t s +@bor\S@nz @ b or\ S @ n z +@bor\tIv @ b or\ t I v +@bor\ts @ b or\ t s +@bAt @ b A t +@bu @ b u +Abud A b u d +AbuhA5im@ A b u h A 5 i m @ +AbuhA5im@z A b u h A 5 i m @ z +@b{Und @ b {U n d +@b{UndId @ b {U n d I d +@b{UndIN @ b {U n d I N +@b{Undz @ b {U n d z +@b{Ut @ b {U t +@b{Uts @ b {U t s +@bVv @ b V v +@bVvz @ b V v z +@bVvbor\d @ b V v b or\ d +{bp5@n{5p { b p 5 @ n { 5 p +Abr\@ A b r\ @ +{br\@k@d{br\@ { b r\ @ k @ d { b r\ @ +eIbr\@hE@m eI b r\ @ h E@ m +{br\@heImi@n { b r\ @ h eI m i @ n +eIbr\@hE@mz eI b r\ @ h E@ m z +{br\@hE@ms@n { b r\ @ h E@ m s @ n +@br\{h@ms@n @ b r\ { h @ m s @ n +@br\E@m @ b r\ E@ m +eIbr\@mz eI b r\ @ m z +Abr\@mtSIk A b r\ @ m tS I k +Abr\AmOU A b r\ A m OU +@br\Am@vIts @ b r\ A m @ v I t s +@br\Am@vItS @ b r\ A m @ v I tS +@br\Am@wIts @ b r\ A m @ w I t s +eIbr\@mz eI b r\ @ m z +eIbr\@mzIz eI b r\ @ m z I z +eIbr\@ms@n eI b r\ @ m s @ n +@br\eIZ@n @ b r\ eI Z @ n +@br\eIZ@nz @ b r\ eI Z @ n z +@br\eIsIv @ b r\ eI s I v +@br\eIsIvz @ b r\ eI s I v z +@br\{ks@ @ b r\ { k s @ +@br\{ks@z @ b r\ { k s @ z +@br\{ks@z @ b r\ { k s @ z +@br\Est @ b r\ E s t +Abr\EgOU A b r\ E g OU +@br\u @ b r\ u +@br\IdZ @ b r\ I dZ +@br\IdZd @ b r\ I dZ d +@br\IdZm@nt @ b r\ I dZ m @ n t +@br\IdZ@z @ b r\ I dZ @ z +@br\IdZIN @ b r\ I dZ I N +@br\I5 @ b r\ I 5 +@br\Od @ b r\ O d +{br\@geIt { b r\ @ g eI t +{br\@geItId { b r\ @ g eI t I d +{br\@geItIN { b r\ @ g eI t I N +{br\@geIS@n { b r\ @ g eI S @ n +@br\OU5 @ b r\ OU 5 +@br\An @ b r\ A n +@br\Vpt @ b r\ V p t +@br\Vpt5i @ b r\ V p t 5 i +@br\Vptn@s @ b r\ V p t n @ s +eIbr\utIn eI b r\ u t I n +Abr\utseIzi A b r\ u t s eI z i +Abr\uzOU A b r\ u z OU +eIbiEs eI b i E s +{bs@5@m { 
b s @5 @ m +{bsAr\@k@ { b s Ar\ @ k @ +{bskE@m { b s k E@ m +{bsEs { b s E s +{bskAnd { b s k A n d +{bskAnd@d { b s k A n d @ d +{bskAndIN { b s k A n d I N +{bskAndz { b s k A n d z +{bs@kOn { b s @ k O n +{bs@ns { b s @ n s +{bs@nsIz { b s @ n s I z +{bs@nt { b s @ n t +{bs@nti { b s @ n t i +{bs@ntiIz@m { b s @ n t i I z @ m +{bs@ntiz { b s @ n t i z +{bsEnS@ { b s E n S @ +{bS3 { b S 3 +{bSi3 { b S i 3 +{bSaIr\ { b S aI r\ +{bsInT { b s I n T +{bsOU { b s OU +{bs@5@m { b s @5 @ m +{bs@5ut { b s @5 u t +{bs@5uts { b s @5 u t s +{bs@5ut { b s @5 u t +{bs@5ut5i { b s @5 u t 5 i +{bs@5utn@s { b s @5 u t n @ s +{bs@5uts { b s @5 u t s +{bs@5uS@n { b s @5 u S @ n +{bs@5utIz@m { b s @5 u t I z @ m +{bs@5utIst { b s @5 u t I s t +@bzA5v @ b z A 5 v +@bzA5vd @ b z A 5 v d +@bzA5vz @ b z A 5 v z +@bzA5vIN @ b z A 5 v I N +@bzor\b @ b z or\ b +@bzor\bd @ b z or\ b d +@bzor\b@nsi @ b z or\ b @ n s i +@bzor\b@nt @ b z or\ b @ n t +@bzor\b3 @ b z or\ b 3 +@bzor\b3z @ b z or\ b 3 z +@bzor\bIN @ b z or\ b I N +@bzor\bz @ b z or\ b z +@bzor\pS@n @ b z or\ p S @ n +@bsteIn @ b s t eI n +@bsteInd @ b s t eI n d +@bsteInIN @ b s t eI n I N +@bstEntS@n @ b s t E n tS @ n +@bstEntS@nz @ b s t E n tS @ n z +{bst@n@ns { b s t @ n @ n s +{bst@n@nt { b s t @ n @ n t +{bst@n { b s t @ n +{bstr\{kt { b s t r\ { k t +{bstr\{ktId { b s t r\ { k t I d +{bstr\{kS@n { b s t r\ { k S @ n +{bstr\{kS@nz { b s t r\ { k S @ n z +{bstr\{kts { b s t r\ { k t s +@bstr\us @ b s t r\ u s +@bs3d @ b s 3 d +@bs3dIst @ b s 3 d I s t +@bs3d@tiz @ b s 3 d @ t i z +@bs3d@ti @ b s 3 d @ t i +@bs3d5i @ b s 3 d 5 i +{bt { b t +{bts { b t s +{bu { b u +@bVdr\@m @ b V d r\ @ m +@bwE5@ @ b w E 5 @ +@bwE5@z @ b w E 5 @ z +{bju5{dzi { b j u 5 { d z i +@bVnd@ns @ b V n d @ n s +@bVnd@nt @ b V n d @ n t +@bVnd@nt5i @ b V n d @ n t 5 i +@bUr\tOU @ b Ur\ t OU +@bUr\tOUz @ b Ur\ t OU z +@bjus @ b j u s +@bjuzd @ b j u z d \ No newline at end of file diff --git a/tests/data/lab/xsampa.lab b/tests/data/lab/xsampa.lab new 
file mode 100644
index 00000000..f73fe41b
--- /dev/null
+++ b/tests/data/lab/xsampa.lab
@@ -0,0 +1 @@
+@bUr\tOU {bstr\{kt {bSaIr\ Abr\utseIzi {br\@geItIN @bor\n {b3kr\Ambi {bI5s@`n Ar\g thr\Ip@5eI Ar\dvAr\k
\ No newline at end of file
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
index b9741d02..3155d0fa 100644
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -403,4 +403,18 @@ def test_alternate_punctuation(punctuated_dir, temp_dir, sick_dict_path, differe
     c.initialize_corpus(dictionary)
     print(c.text_mapping['punctuated'])
     assert c.text_mapping['punctuated'] == 'oh yes, they they, you know, they love her and so i mean'
+    dictionary.cleanup_logger()
+
+def test_xsampa_corpus(xsampa_corpus_dir, xsampa_dict_path, temp_dir, generated_dir, different_punctuation_config):
+    train_config, align_config = train_yaml_to_config(different_punctuation_config)
+    output_directory = os.path.join(temp_dir, 'xsampa_corpus')
+    shutil.rmtree(output_directory, ignore_errors=True)
+    print(align_config.punctuation)
+    dictionary = Dictionary(xsampa_dict_path, output_directory, punctuation=align_config.punctuation)
+    dictionary.write()
+    c = AlignableCorpus(xsampa_corpus_dir, output_directory, use_mp=False, punctuation=align_config.punctuation)
+    print(c.punctuation)
+    c.initialize_corpus(dictionary)
+    print(c.text_mapping['michael-xsampa'])
+    assert c.text_mapping['michael-xsampa'] == r'@bUr\tOU {bstr\{kt {bSaIr\ Abr\utseIzi {br\@geItIN @bor\n {b3kr\Ambi {bI5s@`n Ar\g thr\Ip@5eI Ar\dvAr\k'.lower()
     dictionary.cleanup_logger()
\ No newline at end of file
diff --git a/tests/test_dict.py b/tests/test_dict.py
index ab54a5a9..41314753 100644
--- a/tests/test_dict.py
+++ b/tests/test_dict.py
@@ -113,6 +113,18 @@ def test_multilingual_ipa():
     assert parse_ipa(input_transcription) == expected
 
 
+def test_xsampa_dir(xsampa_dict_path, generated_dir):
+    """Check that X-SAMPA words containing backslashes, braces and backticks are kept whole (no clitic splitting)."""
+    d = Dictionary(xsampa_dict_path, os.path.join(generated_dir, 'xsampa'))
+    d.write()
+
+    print(d.words)
+    assert not d.clitic_set
+    # Raw literals: "\{" is not a valid escape sequence, so a plain string literal triggers an invalid-escape warning.
+    assert d.split_clitics(r'r\{und') == [r'r\{und']
+    assert d.split_clitics('{bI5s@`n') == ['{bI5s@`n']
+    assert d.words[r'r\{und']
+
 def test_multispeaker_config(multispeaker_dictionary_config, generated_dir):
     dictionary = MultispeakerDictionary(multispeaker_dictionary_config, os.path.join(generated_dir, 'multispeaker'))
     dictionary.write()