From 8abae27a20ea4e31ff64f086677e43941aee2fb5 Mon Sep 17 00:00:00 2001
From: Michael McAuliffe <michael.e.mcauliffe@gmail.com>
Date: Fri, 8 Oct 2021 00:00:42 -0700
Subject: [PATCH] Model export fixes (#338)

Update model export and better support for xsampa orthography
---
 docs/source/changelog.rst                     |   8 +
 docs/source/commands.rst                      |   1 +
 docs/source/configuration.rst                 |   7 +-
 montreal_forced_aligner/__init__.py           |   2 +-
 montreal_forced_aligner/aligner/pretrained.py |   5 +-
 montreal_forced_aligner/command_line/mfa.py   |  68 ++-
 montreal_forced_aligner/config/__init__.py    |  29 +-
 .../config/train_config.py                    |   5 +-
 montreal_forced_aligner/dictionary.py         |  12 +-
 montreal_forced_aligner/features/config.py    |   5 +-
 montreal_forced_aligner/models.py             |  16 +-
 .../multiprocessing/__init__.py               |   4 +-
 .../multiprocessing/alignment.py              |  70 ++-
 montreal_forced_aligner/thirdparty/kaldi.py   |   2 +-
 montreal_forced_aligner/trainers/base.py      |   2 +-
 montreal_forced_aligner/trainers/sat.py       |  10 +-
 tests/conftest.py                             |  17 +
 tests/data/dictionaries/xsampa.txt            | 412 ++++++++++++++++++
 tests/data/lab/xsampa.lab                     |   1 +
 tests/test_corpus.py                          |  14 +
 tests/test_dict.py                            |  10 +
 21 files changed, 654 insertions(+), 46 deletions(-)
 create mode 100644 tests/data/dictionaries/xsampa.txt
 create mode 100644 tests/data/lab/xsampa.lab

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 75c60bd1..02d2919a 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -8,6 +8,14 @@
 Changelog
 =========
 
+2.0.0b3
+-------
+
+- Fixed a bug involving non-escaped orthographic characters
+- Improved SAT alignment with speaker-independent alignment model
+- Fixed a bug where models would not function properly if they were renamed
+- Added a history subcommand to list previous commands
+
 2.0.0b1
 -------
 
diff --git a/docs/source/commands.rst b/docs/source/commands.rst
index bbbcc733..12f39e29 100644
--- a/docs/source/commands.rst
+++ b/docs/source/commands.rst
@@ -55,6 +55,7 @@ Other utilities
    "download", "Download a model trained by MFA developers", :ref:`pretrained_models`
    "thirdparty", "Download and validate new third party binaries", :ref:`installation`
    "configure", "Configure MFA to use customized defaults for command line arguments", :ref:`configuration`
+   "history", "List previous MFA commands run locally",
 
 
 Grapheme-to-phoneme
diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst
index 5e8142ad..8be4f1fc 100644
--- a/docs/source/configuration.rst
+++ b/docs/source/configuration.rst
@@ -74,13 +74,8 @@ Options available:
 
    Display help message for the command
 
-
-
-Configuration of commands
-=========================
-
 .. toctree::
-   :maxdepth: 1
+   :maxdepth: 2
 
    configuration_align.rst
    configuration_transcription.rst
diff --git a/montreal_forced_aligner/__init__.py b/montreal_forced_aligner/__init__.py
index 3b6dfffb..f1a04455 100644
--- a/montreal_forced_aligner/__init__.py
+++ b/montreal_forced_aligner/__init__.py
@@ -1,6 +1,6 @@
 __ver_major__ = 2
 __ver_minor__ = 0
-__ver_patch__ = '0b2'
+__ver_patch__ = '0b3'
 __version__ = "{}.{}.{}".format(__ver_major__, __ver_minor__, __ver_patch__)
 
 __all__ = ['aligner', 'command_line', 'models', 'corpus', 'config', 'dictionary', 'exceptions',
diff --git a/montreal_forced_aligner/aligner/pretrained.py b/montreal_forced_aligner/aligner/pretrained.py
index 152c5a89..2f71070c 100644
--- a/montreal_forced_aligner/aligner/pretrained.py
+++ b/montreal_forced_aligner/aligner/pretrained.py
@@ -86,15 +86,18 @@ def align(self, subset=None):
             log_dir = os.path.join(self.align_directory, 'log')
             os.makedirs(log_dir, exist_ok=True)
 
+            self.logger.info('Performing first-pass alignment...')
             align('final', self.align_directory, self.align_config.data_directory,
                   self.dictionary.optional_silence_csl,
-                  self.corpus.num_jobs, self.align_config)
+                  self.corpus.num_jobs, self.align_config, speaker_independent=True)
             unaligned, average_log_like = compile_information(self.align_directory, self.corpus, self.corpus.num_jobs, self)
             self.logger.debug(f'Prior to SAT, average per frame likelihood (this might not actually mean anything): {average_log_like}')
             if not self.align_config.disable_sat and self.acoustic_model.feature_config.fmllr \
                     and not os.path.exists(os.path.join(self.align_directory, 'trans.0')):
                 calc_fmllr(self.align_directory, self.align_config.data_directory,
                       self.dictionary.optional_silence_csl, self.corpus.num_jobs, self.align_config, initial=True, iteration='final')
+
+                self.logger.info('Performing second-pass alignment...')
                 align('final', self.align_directory, self.align_config.data_directory,
                       self.dictionary.optional_silence_csl,
                       self.corpus.num_jobs, self.align_config)
diff --git a/montreal_forced_aligner/command_line/mfa.py b/montreal_forced_aligner/command_line/mfa.py
index 35d0a842..19d39385 100644
--- a/montreal_forced_aligner/command_line/mfa.py
+++ b/montreal_forced_aligner/command_line/mfa.py
@@ -1,7 +1,9 @@
+import atexit
 import sys
 import os
 import time
 import argparse
+from datetime import datetime
 import multiprocessing as mp
 
 from montreal_forced_aligner import __version__
@@ -23,9 +25,53 @@
 from montreal_forced_aligner.command_line.train_dictionary import run_train_dictionary
 from montreal_forced_aligner.command_line.create_segments import run_create_segments
 from montreal_forced_aligner.exceptions import MFAError
-from montreal_forced_aligner.config import update_global_config, load_global_config, update_command_history
+from montreal_forced_aligner.config import update_global_config, load_global_config, update_command_history, load_command_history
+
+
+class ExitHooks(object):
+    """Remember the exit code or uncaught exception so it can be logged at exit."""
+    def __init__(self):
+        self.exit_code = self.exception = None
+
+    def hook(self):
+        self._orig_exit = sys.exit
+        sys.exit, sys.excepthook = self.exit, self.exc_handler
+
+    def exit(self, code=0):
+        self.exit_code = code
+        self._orig_exit(code)
+
+    def exc_handler(self, exc_type, exc, *args):
+        self.exception = exc
+        sys.__excepthook__(exc_type, exc, *args)  # keep printing the traceback
+
+hooks = ExitHooks()
+hooks.hook()
 
 BEGIN = time.time()
+BEGIN_DATE = datetime.now()
+
+
+def history_save_handler():
+    """Record this invocation (command, timing, outcome) in the command history."""
+    history_data = {
+        'command': ' '.join(sys.argv),
+        'execution_time': time.time() - BEGIN,
+        'date': BEGIN_DATE,
+        'version': __version__,
+        'exit_code': 0,
+        'exception': '',
+    }
+
+    if hooks.exit_code is not None:
+        history_data['exit_code'] = hooks.exit_code
+    elif hooks.exception is not None:
+        history_data['exit_code'] = 1
+        # yaml.safe_dump cannot represent exception instances, so store text.
+        history_data['exception'] = repr(hooks.exception)
+    update_command_history(history_data)
+
+atexit.register(history_save_handler)
 
 def fix_path():
     from montreal_forced_aligner.config import TEMP_DIR
@@ -295,6 +341,11 @@ def add_global_options(subparser, textgrid_output=False):
                                                                   "silences and recombines compound words and clitics",
                                action='store_true')
 
+    history_parser = subparsers.add_parser('history')
+
+    history_parser.add_argument('depth', help='Number of commands to list', nargs='?', default=10)
+    history_parser.add_argument('--verbose', help="Flag for whether to output additional information", action='store_true')
+
     annotator_parser = subparsers.add_parser('annotator')
     anchor_parser = subparsers.add_parser('anchor')
 
@@ -391,6 +442,21 @@ def main():
             update_global_config(args)
             global GLOBAL_CONFIG
             GLOBAL_CONFIG = load_global_config()
+        elif args.subcommand == 'history':
+            # argparse yields a str when depth is given explicitly, so coerce it;
+            # guard zero/negative values because history[-0:] is the whole list.
+            depth = max(int(args.depth), 0)
+            history = load_command_history()[-depth:] if depth else []
+            if args.verbose:
+                print('command\tDate\tExecution time\tVersion\tExit code\tException')
+                for h in history:
+                    execution_time = time.strftime('%H:%M:%S', time.gmtime(h['execution_time']))
+                    d = h['date'].isoformat()
+                    print(f"{h['command']}\t{d}\t{execution_time}\t{h['version']}\t{h['exit_code']}\t{h['exception']}")
+            else:
+                for h in history:
+                    print(h['command'])
+
         elif args.subcommand == 'version':
             print(__version__)
     except MFAError as e:
diff --git a/montreal_forced_aligner/config/__init__.py b/montreal_forced_aligner/config/__init__.py
index 2ab150b2..0ed9f60b 100644
--- a/montreal_forced_aligner/config/__init__.py
+++ b/montreal_forced_aligner/config/__init__.py
@@ -16,11 +16,32 @@ def generate_config_path():
     return os.path.join(TEMP_DIR, 'global_config.yaml')
 
 def generate_command_history_path():
-    return os.path.join(TEMP_DIR, 'command_history')
+    return os.path.join(TEMP_DIR, 'command_history.yaml')
 
-def update_command_history(command, duration, exit_code, exception):
-    with open(generate_command_history_path(), 'a', encoding='utf8') as f:
-        f.write(f'{command}\t{duration}\t{exit_code}\t{exception}\n')
+def load_command_history():
+    """Return the stored command history, or an empty list when none is saved."""
+    history_path = generate_command_history_path()
+    history = None
+    if os.path.exists(history_path):
+        with open(history_path, 'r', encoding='utf8') as f:
+            history = yaml.safe_load(f)
+    # A missing or empty file leaves nothing usable; always hand back a list
+    # so callers can slice and append without checking.
+    return history or []
+
+
+def update_command_history(command_data):
+    """Append ``command_data`` to the saved history (last 50 kept; ``history`` calls skipped)."""
+    try:
+        if command_data['command'].split(' ')[1] == 'history':
+            return
+    except Exception:
+        return
+    history = (load_command_history() + [command_data])[-50:]
+    # Serialize first: a dump error must not leave a truncated history file.
+    serialized = yaml.safe_dump(history)
+    with open(generate_command_history_path(), 'w', encoding='utf8') as f:
+        f.write(serialized)
 
 def update_global_config(args):
     global_configuration_file = generate_config_path()
diff --git a/montreal_forced_aligner/config/train_config.py b/montreal_forced_aligner/config/train_config.py
index 8788e503..8f4d28f8 100644
--- a/montreal_forced_aligner/config/train_config.py
+++ b/montreal_forced_aligner/config/train_config.py
@@ -28,8 +28,9 @@ def __init__(self, training_configs):
         self.compound_markers = DEFAULT_COMPOUND_MARKERS
 
     def update_from_align(self, align_config):
-        self.training_configs[-1].overwrite = align_config.overwrite
-        self.training_configs[-1].cleanup_textgrids = align_config.cleanup_textgrids
+        for tc in self.training_configs:
+            tc.overwrite = align_config.overwrite
+            tc.cleanup_textgrids = align_config.cleanup_textgrids
 
     def update(self, data):
         for k, v in data.items():
diff --git a/montreal_forced_aligner/dictionary.py b/montreal_forced_aligner/dictionary.py
index de40a13f..3aec87ec 100644
--- a/montreal_forced_aligner/dictionary.py
+++ b/montreal_forced_aligner/dictionary.py
@@ -15,15 +15,9 @@
 
 
 def compile_graphemes(graphemes):
-    if '-' in graphemes:
-        base = r'^\W*([-{}]+)\W*'
-    else:
-        base = r'^\W*([{}]+)\W*'
-    graphemes = list(graphemes)
-    for i in range(len(graphemes)):
-        if graphemes[i] == ']':
-            graphemes[i] = r'\]'
-    string = ''.join(x for x in graphemes if x != '-')
+
+    base = r'^\W*([{}]+)\W*'
+    string = re.escape(''.join(graphemes))
     try:
         return re.compile(base.format(string))
     except Exception:
diff --git a/montreal_forced_aligner/features/config.py b/montreal_forced_aligner/features/config.py
index 25840e94..3f604301 100644
--- a/montreal_forced_aligner/features/config.py
+++ b/montreal_forced_aligner/features/config.py
@@ -145,7 +145,8 @@ def generate_base_features(self, corpus, logger=None, compute_cmvn=True):
                 log_func('Calculating CMVN...')
                 calc_cmvn(corpus)
 
-    def construct_feature_proc_string(self, data_directory, model_directory, job_name, splice=False, voiced=False, cmvn=True):
+    def construct_feature_proc_string(self, data_directory, model_directory, job_name, splice=False, voiced=False,
+                                      cmvn=True, speaker_independent=False):
         if self.directory is None:
             self.directory = data_directory
         lda_mat_path = None
@@ -190,7 +191,7 @@ def construct_feature_proc_string(self, data_directory, model_directory, job_nam
             elif self.deltas:
                 feats += " add-deltas ark:- ark:- |"
 
-            if fmllr_trans_path is not None:
+            if fmllr_trans_path is not None and not speaker_independent:
                 if not os.path.exists(fmllr_trans_path):
                     raise Exception('Could not find {}'.format(fmllr_trans_path))
                 feats += " transform-feats --utt2spk=ark:{} ark:{} ark:- ark:- |".format(utt2spk_path, fmllr_trans_path)
diff --git a/montreal_forced_aligner/models.py b/montreal_forced_aligner/models.py
index f17e58b3..23246f48 100644
--- a/montreal_forced_aligner/models.py
+++ b/montreal_forced_aligner/models.py
@@ -1,7 +1,7 @@
 import os
 import yaml
 
-from shutil import copy, copyfile, rmtree, make_archive, unpack_archive
+from shutil import copy, copyfile, rmtree, make_archive, unpack_archive, move
 
 from . import __version__
 from .exceptions import PronunciationAcousticMismatchError
@@ -31,11 +31,15 @@ def __init__(self, source, root_directory=None):
         if os.path.isdir(source):
             self.dirname = os.path.abspath(source)
         else:
-            base = root_directory
             self.dirname = os.path.join(root_directory, self.name)
             if not os.path.exists(self.dirname):
                 os.makedirs(root_directory, exist_ok=True)
-                unpack_archive(source, base)
+                unpack_archive(source, self.dirname)
+                files = os.listdir(self.dirname)
+                old_dir_path = os.path.join(self.dirname, files[0])
+                if len(files) == 1 and os.path.isdir(old_dir_path): # Backwards compatibility
+                        for f in os.listdir(old_dir_path):
+                            move(os.path.join(old_dir_path, f), os.path.join(self.dirname, f))
 
     @property
     def meta(self):
@@ -76,16 +80,16 @@ def __repr__(self):
     def clean_up(self):
         rmtree(self.dirname)
 
-    def dump(self, sink, archive_fmt=FORMAT):
+    def dump(self, path, archive_fmt=FORMAT):
         """
         Write archive to disk, and return the name of final archive
         """
-        return make_archive(sink, archive_fmt,
+        return make_archive(os.path.splitext(path)[0], archive_fmt,
                             *os.path.split(self.dirname))
 
 
 class AcousticModel(Archive):
-    files = ['final.mdl', 'final.occs', 'lda.mat', 'tree']
+    files = ['final.mdl', 'final.alimdl', 'final.occs', 'lda.mat', 'tree']
     def add_meta_file(self, aligner):
         with open(os.path.join(self.dirname, 'meta.yaml'), 'w', encoding='utf8') as f:
             yaml.dump(aligner.meta, f)
diff --git a/montreal_forced_aligner/multiprocessing/__init__.py b/montreal_forced_aligner/multiprocessing/__init__.py
index 3a8b4c35..0f2ede8a 100644
--- a/montreal_forced_aligner/multiprocessing/__init__.py
+++ b/montreal_forced_aligner/multiprocessing/__init__.py
@@ -1,6 +1,8 @@
 from .helper import run_mp, run_non_mp, Stopped, Counter
 from .alignment import align, compute_alignment_improvement, convert_ali_to_textgrids, compile_information, acc_stats, \
-    lda_acc_stats, mono_align_equal, compile_train_graphs, tree_stats, convert_alignments, calc_lda_mllt, calc_fmllr
+    lda_acc_stats, mono_align_equal, compile_train_graphs, tree_stats, convert_alignments, calc_lda_mllt, calc_fmllr, \
+    create_align_model
+
 from .transcription import transcribe, transcribe_fmllr
 from .ivector import gmm_gselect, acc_global_stats, acc_ivector_stats, extract_ivectors, gauss_to_post, segment_vad, \
     classify_speakers
diff --git a/montreal_forced_aligner/multiprocessing/alignment.py b/montreal_forced_aligner/multiprocessing/alignment.py
index 9d6b634e..5d5c0155 100644
--- a/montreal_forced_aligner/multiprocessing/alignment.py
+++ b/montreal_forced_aligner/multiprocessing/alignment.py
@@ -138,7 +138,7 @@ def compile_train_graphs(directory, lang_directory, split_directory, num_jobs, a
     num_jobs : int
         The number of processes to use
     """
-    aligner.logger.info('Compiling training graphs...')
+    aligner.logger.debug('Compiling training graphs...')
     begin = time.time()
     log_directory = os.path.join(directory, 'log')
     os.makedirs(log_directory, exist_ok=True)
@@ -234,7 +234,8 @@ def align_func(directory, iteration, job_name, mdl, config, feature_string, outp
         align_proc.communicate()
 
 
-def align(iteration, directory, split_directory, optional_silence, num_jobs, config, output_directory=None):
+def align(iteration, directory, split_directory, optional_silence, num_jobs, config,
+          output_directory=None, speaker_independent=False):
     """
     Multiprocessing function that aligns based on the current model
 
@@ -261,17 +262,18 @@ def align(iteration, directory, split_directory, optional_silence, num_jobs, con
     config : :class:`~aligner.config.MonophoneConfig`, :class:`~aligner.config.TriphoneConfig` or :class:`~aligner.config.TriphoneFmllrConfig`
         Configuration object for training
     """
-    config.logger.info('Performing alignment...')
     begin = time.time()
     if output_directory is None:
         output_directory = directory
     log_directory = os.path.join(output_directory, 'log')
-    mdl_path = os.path.join(directory, '{}.mdl'.format(iteration))
+    align_model_path = os.path.join(directory, '{}.alimdl'.format(iteration))
+    if not speaker_independent or not os.path.exists(align_model_path):
+        align_model_path = os.path.join(directory, '{}.mdl'.format(iteration))
     if config.boost_silence != 1.0:
         mdl = "{} --boost={} {} {} - |".format(thirdparty_binary('gmm-boost-silence'),
-                                               config.boost_silence, optional_silence, make_path_safe(mdl_path))
+                                               config.boost_silence, optional_silence, make_path_safe(align_model_path))
     else:
-        mdl = mdl_path
+        mdl = align_model_path
 
     jobs = [(directory, iteration, x, mdl, config.align_options,
              config.feature_config.construct_feature_proc_string(split_directory, directory, x),
@@ -345,8 +347,9 @@ def compile_information(model_directory, corpus, num_jobs, config):
     total_frames = sum(data['total_frames'] for data in alignment_info.values())
     average_log_like = 0
     for x, data in alignment_info.items():
-        weight = data['total_frames'] / total_frames
-        average_log_like += data['log_like'] * weight
+        if total_frames:
+            weight = data['total_frames'] / total_frames
+            average_log_like += data['log_like'] * weight
         for u in data['unaligned']:
             unaligned[u] = 'Beam too narrow'
         for u in data['too_short']:
@@ -1429,6 +1432,57 @@ def calc_fmllr(directory, split_directory, sil_phones, num_jobs, config,
     config.logger.debug(f'Fmllr calculation took {time.time() - begin}')
 
 
+def acc_stats_two_feats_func(directory, model_path, feature_string, si_feature_string, job_name):
+    """Run ali-to-post piped into gmm-acc-stats-twofeats for one job's ``ali`` file."""
+    ali_path = os.path.join(directory, 'ali.{}'.format(job_name))
+    acc_path = os.path.join(directory, 'align_model.{}.acc'.format(job_name))
+    log_path = os.path.join(directory, 'log', 'align_model_est.{}.log'.format(job_name))
+    with open(log_path, 'w', encoding='utf8') as log_file:
+        # The posteriors are streamed over a pipe; both tools log stderr to log_file.
+        post_proc = subprocess.Popen([thirdparty_binary('ali-to-post'), 'ark:' + ali_path, 'ark:-'],
+                                     stderr=log_file, stdout=subprocess.PIPE)
+        stats_command = [thirdparty_binary('gmm-acc-stats-twofeats'), model_path,
+                         feature_string, si_feature_string, "ark,s,cs:-", acc_path]
+        subprocess.Popen(stats_command, stderr=log_file, stdin=post_proc.stdout).communicate()
+
+
+
+def create_align_model(directory, split_directory, num_jobs, config):
+    """
+    Estimate ``final.alimdl`` in ``directory`` from ``final.mdl``, using statistics
+    accumulated per job by ``acc_stats_two_feats_func``
+    """
+    config.logger.info('Creating alignment model for speaker-independent features...')
+    begin = time.time()
+    log_directory = os.path.join(directory, 'log')
+    model_path = os.path.join(directory, 'final.mdl')
+    align_model_path = os.path.join(directory, 'final.alimdl')
+
+    # Each job gets two feature pipelines: the regular one and one built with
+    # speaker_independent=True.
+    jobs = []
+    for job in range(num_jobs):
+        feats = config.feature_config.construct_feature_proc_string(split_directory, directory, job)
+        si_feats = config.feature_config.construct_feature_proc_string(split_directory, directory, job,
+                                                                       speaker_independent=True)
+        jobs.append((directory, model_path, feats, si_feats, job))
+    runner = run_mp if config.use_mp else run_non_mp
+    runner(acc_stats_two_feats_func, jobs, log_directory)
+
+    with open(os.path.join(log_directory, 'align_model_est.final.log'), 'w', encoding='utf8') as log_file:
+        acc_files = [os.path.join(directory, 'align_model.{}.acc'.format(job)) for job in range(num_jobs)]
+        sum_accs = "{} - {}|".format(thirdparty_binary('gmm-sum-accs'), ' '.join(map(make_path_safe, acc_files)))
+        est_command = [thirdparty_binary('gmm-est'), "--remove-low-count-gaussians=false",
+                       '--power=' + str(config.power), model_path, sum_accs, align_model_path]
+        subprocess.Popen(est_command, stderr=log_file).communicate()
+        # The accumulators are only intermediate files; keep them when debugging.
+        if not config.debug:
+            for acc_file in acc_files:
+                os.remove(acc_file)
+
+    config.logger.debug(f'Alignment model creation took {time.time() - begin}')
+
+
 def lda_acc_stats_func(directory, feature_string, align_directory, config, ci_phones, i):
     log_path = os.path.join(directory, 'log', 'ali_to_post.{}.log'.format(i))
     with open(log_path, 'w', encoding='utf8') as log_file:
diff --git a/montreal_forced_aligner/thirdparty/kaldi.py b/montreal_forced_aligner/thirdparty/kaldi.py
index f421f93d..1639df08 100644
--- a/montreal_forced_aligner/thirdparty/kaldi.py
+++ b/montreal_forced_aligner/thirdparty/kaldi.py
@@ -21,7 +21,7 @@
                        'compile-train-graphs', 'compile-train-graphs-fsts', 'compose-transforms', 'compute-cmvn-stats',
                        'compute-mfcc-feats', 'convert-ali', 'copy-feats', 'est-lda', 'est-mllt',
                        'extract-segments', 'feat-to-dim', 'feat-to-len', 'gmm-acc-mllt', 'gmm-acc-stats-ali',
-                       'gmm-align-compiled',
+                       'gmm-align-compiled', 'gmm-acc-stats-twofeats',
                        'gmm-boost-silence', 'gmm-est', 'gmm-est-fmllr', 'gmm-info', 'gmm-init-model', 'gmm-init-mono',
                        'gmm-latgen-faster', 'gmm-mixup',
                        'gmm-sum-accs', 'gmm-transform-means',
diff --git a/montreal_forced_aligner/trainers/base.py b/montreal_forced_aligner/trainers/base.py
index 3bf3ede4..cebd14ba 100644
--- a/montreal_forced_aligner/trainers/base.py
+++ b/montreal_forced_aligner/trainers/base.py
@@ -393,4 +393,4 @@ def save(self, path, root_directory=None):
         if directory:
             os.makedirs(directory, exist_ok=True)
         basename, _ = os.path.splitext(path)
-        acoustic_model.dump(basename)
+        acoustic_model.dump(path)
diff --git a/montreal_forced_aligner/trainers/sat.py b/montreal_forced_aligner/trainers/sat.py
index 293fa2e0..8614f330 100644
--- a/montreal_forced_aligner/trainers/sat.py
+++ b/montreal_forced_aligner/trainers/sat.py
@@ -6,7 +6,8 @@
 
 from ..multiprocessing import (align, compile_train_graphs,
                                acc_stats, tree_stats, convert_alignments,
-                               calc_fmllr, compute_alignment_improvement, compile_information)
+                               calc_fmllr, compute_alignment_improvement, compile_information,
+                               create_align_model)
 from ..helper import thirdparty_binary, make_path_safe, log_kaldi_errors, parse_logs, load_scp
 from ..exceptions import KaldiProcessingError
 
@@ -111,7 +112,7 @@ def train(self, call_back=None):
                         os.path.join(self.train_directory, 'final.mdl'))
             shutil.copy(os.path.join(self.train_directory, '{}.occs'.format(self.num_iterations)),
                         os.path.join(self.train_directory, 'final.occs'))
-
+            create_align_model(self.train_directory, self.corpus.split_directory(), self.corpus.num_jobs, self)
             if not self.debug:
                 for i in range(1, self.num_iterations):
                     model_path = os.path.join(self.train_directory, '{}.mdl'.format(i))
@@ -159,6 +160,9 @@ def align(self, subset, call_back=None):
                 shutil.copy(os.path.join(self.train_directory, 'tree'), self.align_directory)
                 shutil.copyfile(os.path.join(self.train_directory, 'final.mdl'),
                                 os.path.join(self.align_directory, 'final.mdl'))
+                if os.path.exists(os.path.join(self.train_directory, 'final.alimdl')):
+                    shutil.copyfile(os.path.join(self.train_directory, 'final.alimdl'),
+                                    os.path.join(self.align_directory, 'final.alimdl'))
 
                 if os.path.exists(os.path.join(self.train_directory, 'lda.mat')):
                     shutil.copyfile(os.path.join(self.train_directory, 'lda.mat'),
@@ -173,7 +177,7 @@ def align(self, subset, call_back=None):
                                     os.path.join(self.align_directory, 'trans.{}'.format(i)))
                 align('final', self.align_directory, align_data_directory,
                       self.dictionary.optional_silence_csl,
-                      self.corpus.num_jobs, self, self.align_directory)
+                      self.corpus.num_jobs, self, self.align_directory, speaker_independent=True)
 
                 unaligned, average_log_like = compile_information(self.align_directory, self.corpus,
                                                                   self.corpus.num_jobs, self)
diff --git a/tests/conftest.py b/tests/conftest.py
index be67dc76..3b5d9091 100755
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -180,6 +180,18 @@ def basic_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
     return path
 
 
+@pytest.fixture(scope='session')
+def xsampa_corpus_dir(corpus_root_dir, wav_dir, lab_dir):
+    """Create a single-speaker corpus directory holding the xsampa.lab transcript."""
+    # Own directory: writing into 'basic' would leak these files into basic_corpus_dir.
+    path = os.path.join(corpus_root_dir, 'xsampa')
+    s_dir = os.path.join(path, 'michael')
+    os.makedirs(s_dir, exist_ok=True)  # also creates ``path`` itself
+    shutil.copyfile(os.path.join(wav_dir, 'acoustic_corpus.wav'), os.path.join(s_dir, 'xsampa.wav'))
+    shutil.copyfile(os.path.join(lab_dir, 'xsampa.lab'), os.path.join(s_dir, 'xsampa.lab'))
+    return path
+
+
 @pytest.fixture(scope='session')
 def basic_split_dir(corpus_root_dir, wav_dir, lab_dir, textgrid_dir):
     path = os.path.join(corpus_root_dir, 'split')
@@ -377,6 +389,11 @@ def frclitics_dict_path(dict_dir):
     return os.path.join(dict_dir, 'frclitics.txt')
 
 
+@pytest.fixture(scope='session')
+def xsampa_dict_path(dict_dir):
+    return os.path.join(dict_dir, 'xsampa.txt')
+
+
 @pytest.fixture(scope='session')
 def expected_dict_path(dict_dir):
     return os.path.join(dict_dir, 'expected')
diff --git a/tests/data/dictionaries/xsampa.txt b/tests/data/dictionaries/xsampa.txt
new file mode 100644
index 00000000..7854c916
--- /dev/null
+++ b/tests/data/dictionaries/xsampa.txt
@@ -0,0 +1,412 @@
+b{T b { T
+VD@` V D @`
+b{t b { T
+vd@` V D @`
+A5OU	A 5 OU
+@pAstr\@fi	@ p A s t r\ @ f i
+bh{Ut	bh {U t
+kh@z	kh @ z
+khor\s	kh or\ s
+khjuz	kh j u z
+@m	@ m
+EndkwOUt	E n d k w OU t
+fr\IskOU	f r\ I s k OU
+ghEn	gh E n
+In3kwOUt	I n 3 k w OU t
+kheI	kh eI
+@m	@ m
+@n	@ n
+khwOUt	kh w OU t
+r\{Und	r\ {U n d
+Es	E s
+sINg@5kwOUt	s I N g @5 k w OU t
+thI5	th I 5
+thIz	th I z
+thwVz	th w V z
+lA	l A
+@	@
+eIz	eI z
+eI	eI
+eIz	eI z
+eIz	eI z
+eIfor\tuwVntueIt	eI f or\ t u w V n t u eI t
+eIeI	eI eI
+thr\Ip@5eI	th r\ I p @5 eI
+Ab3g	A b 3 g
+Ak@n	A k @ n
+Ak@n3	A k @ n 3
+A	A
+Ak3	A k 3
+A5iA	A 5 i A
+A5sET	A 5 s E T
+Am@t	A m @ t
+Ankor\	A n k or\
+Ar\dEm@	Ar\ d E m @
+Ar\dvAr\k	Ar\ d v Ar\ k
+Ar\dvAr\ks	Ar\ d v Ar\ k s
+Ar\g	Ar\ g
+Er\@n	Er\ @ n
+Er\@nz	Er\ @ n z
+Er\@nz	Er\ @ n z
+Er\@ns@n	Er\ @ n s @ n
+Er\@ns@nz	Er\ @ n s @ n z
+Ar\ti	Ar\ t i
+As	A s
+As@n	A s @ n
+{b	{ b
+eIbieI	eI b i eI
+@bAb@	@ b A b @
+{b@k@	{ b @ k @
+@b{k	@ b { k
+{b@kOU	{ b @ k OU
+{b@k@s	{ b @ k @ s
+@bAd	@ b A d
+@b{d@k@	@ b { d @ k @
+@b{di	@ b { d i
+@b{di	@ b { d i
+@bEr\	@ b Er\
+@bA5kIn	@ b A 5 k I n
+{b@5OUni	{ b @5 OU n i
+{b@5OUniz	{ b @5 OU n i z
+AbA5OUz	A b A 5 OU z
+@bE@nd@n	@ b E@ n d @ n
+@bE@nd@nd	@ b E@ n d @ n d
+@bE@nd@nIN	@ b E@ n d @ n I N
+@bE@nd@nm@nt	@ b E@ n d @ n m @ n t
+@bE@nd@nm@nts	@ b E@ n d @ n m @ n t s
+@bE@nd@nz	@ b E@ n d @ n z
+@bE@ntOU	@ b E@ n t OU
+@bAr\k@	@ b Ar\ k @
+AbAr\i	A b Ar\ i
+{b@sk@5	{ b @ s k @5
+@b{S	@ b { S
+@b{St	@ b { S t
+@beIZj@	@ b eI Z j @
+@beIt	@ b eI t
+@beItId	@ b eI t I d
+@beItm@nt	@ b eI t m @ n t
+@beItm@nts	@ b eI t m @ n t s
+@beIts	@ b eI t s
+@beItIN	@ b eI t I N
+{b@	{ b @
+@bAdOU	@ b A d OU
+@bAs	@ b A s
+AbAsi	A b A s i
+AbeIt	A b eI t
+AbAtiE5OU	A b A t i E 5 OU
+{bi	{ b i
+{b@nh{Us	{ b @ n h {U s
+@bEt	@ b E t
+{bvI5	{ b v I 5
+{bi	{ b i
+{biz	{ b i z
+{bi	{ b i
+{bIt	{ b I t
+{b@t	{ b @ t
+{b@tst{Un	{ b @ t s t {U n
+{b@t	{ b @ t
+{b@ts	{ b @ t s
+{b@tst{Un	{ b @ t s t {U n
+@bud	@ b u d
+@br\ivieIt	@ b r\ i v i eI t
+@br\ivieItId	@ b r\ i v i eI t I d
+@br\ivieIts	@ b r\ i v i eI t s
+@br\ivieItIN	@ b r\ i v i eI t I N
+@br\ivieIS@n	@ b r\ i v i eI S @ n
+@br\ivieIS@nz	@ b r\ i v i eI S @ n z
+Abr\utseIzi	A b r\ u t s eI z i
+{bz	{ b z
+{bi	{ b i
+eIbisi	eI b i s i
+eIbisiz	eI b i s i z
+{bkOU	{ b k OU
+{bkOUtEk	{ b k OU t E k
+eIbisiz	eI b i s i z
+{bd{5@	{ b d { 5 @
+{bd{5@	{ b d { 5 @
+{bdE5	{ b d E 5
+{bdE5@	{ b d E 5 @
+{bd@keIt	{ b d @ k eI t
+{bd@keIt@d	{ b d @ k eI t @ d
+{bd@keIts	{ b d @ k eI t s
+{bdIkeItIN	{ b d I k eI t I N
+{bdIkeIS@n	{ b d I k eI S @ n
+{bdn3	{ b d n 3
+{bdOU	{ b d OU
+{bdA5@	{ b d A 5 @
+{bdOUm@n	{ b d OU m @ n
+{bdAm@n@5	{ b d A m @ n @5
+{bdVkt	{ b d V k t
+{bdVktId	{ b d V k t I d
+{bd@kti	{ b d @ k t i
+{bd@ktiz	{ b d @ k t i z
+{bdVktIN	{ b d V k t I N
+{bdVkS@n	{ b d V k S @ n
+{bdVkS@nz	{ b d V k S @ n z
+{bdVkt3	{ b d V k t 3
+{bdVkt3z	{ b d V k t 3 z
+{bdVkts	{ b d V k t s
+{bdu5	{ b d u 5
+{bdu5@ziz	{ b d u 5 @ z i z
+Abdu5@	A b d u 5 @
+{bdV5@	{ b d V 5 @
+eIb	eI b
+@bEd	@ b E d
+@bEdi	@ b E d i
+@bi	@ b i
+eIb@5	eI b @5
+AbE5@	A b E 5 @
+{bI53d	{ b I 5 3 d
+@bE5	@ b E 5
+@bE5z	@ b E 5 z
+eIb@5	eI b @5
+@bE5@	@ b E 5 @
+{bI5n	{ b I 5 n
+{b@5OU	{ b @5 OU
+eIb@5z	eI b @5 z
+{bI5s@`n	{ b I 5 s @` n
+{bEnd	{ b E n d
+{bIndr\OT	{ b I n d r\ O T
+eIb3	eI b 3
+{b3kr\Ambi	{ b 3 k r\ A m b i
+{b3din	{ b 3 d i n
+eIb3f3d	eI b 3 f 3 d
+{b3g	{ b 3 g
+{b3@5	{ b 3 @5
+{b3mIn	{ b 3 m I n
+{b3n{Ti	{ b 3 n { T i
+{b3nETi	{ b 3 n E T i
+{bEr\@nt	{ b Er\ @ n t
+{b3eIS@n	{ b 3 eI S @ n
+{b3eIS@n@5	{ b 3 eI S @ n @5
+{b3eIS@nz	{ b 3 eI S @ n z
+{b3t	{ b 3 t
+@bEt	@ b E t
+@bEtId	@ b E t I d
+@bEtIN	@ b E t I N
+eIbEks	eI b E k s
+@beI@ns	@ b eI @ n s
+AbeIt@	A b eI t @
+{bhor\	{ b h or\
+@bhor\d	@ b h or\ d
+@bhor\@ns	@ b h or\ @ n s
+{bhor\@nt	{ b h or\ @ n t
+@bhor\z	@ b h or\ z
+eIbi@m	eI b i @ m
+eIbi@mz	eI b i @ m z
+eIbId	eI b I d
+@baId	@ b aI d
+@baIdId	@ b aI d I d
+@baIdz	@ b aI d z
+@baIdIN	@ b aI d I N
+{bIdZAn	{ b I dZ A n
+{bi	{ b i
+{b@geI5	{ b @ g eI 5
+Abi5@	A b i 5 @
+{bI5in	{ b I 5 i n
+@bI5@tiz	@ b I 5 @ t i z
+@bI5@ti	@ b I 5 @ t i
+{bImeI5	{ b I m eI 5
+{bImeI5z	{ b I m eI 5 z
+{bINd@n	{ b I N d @ n
+{bINt@n	{ b I N t @ n
+AbiOU	A b i OU
+AbiOU5@	A b i OU 5 @
+AbiOU5@z	A b i OU 5 @ z
+eIbi@mEd	eI b i @ m E d
+@bIkju	@ b I k j u
+{bItibi	{ b I t i b i
+{bIts	{ b I t s
+{bdZEkt	{ b dZ E k t
+{bkAzj@	{ b k A z j @
+{bkAzi@n	{ b k A z i @ n
+{bkAzi@nz	{ b k A z i @ n z
+@b5eIz	@ b 5 eI z
+eIb@5	eI b @5
+eIb@5bAdid	eI b @5 b A d i d
+eIb@5d	eI b @5 d
+eIb@53	eI b @5 3
+eIb@5z	eI b @5 z
+eIb@5st	eI b @5 s t
+@b5um	@ b 5 u m
+eIb5i	eI b 5 i
+{bnEgeIS@n	{ b n E g eI S @ n
+{bn3	{ b n 3
+{bni	{ b n i
+{bnor\m@5	{ b n or\ m @5
+{bnor\m{5@tiz	{ b n or\ m { 5 @ t i z
+{bnor\m{5@ti	{ b n or\ m { 5 @ t i
+{bnor\m@5i	{ b n or\ m @5 i
+AbOU	A b OU
+AbOUz	A b OU z
+@bor\d	@ b or\ d
+@bOUd	@ b OU d
+{b@h@5im@	{ b @ h @5 i m @
+@bA5IS	@ b A 5 I S
+@bA5ISt	@ b A 5 I S t
+@bA5ISIz	@ b A 5 I S I z
+@bA5ISIN	@ b A 5 I S I N
+{b@5IS@n	{ b @5 I S @ n
+{b@5IS@nIz@m	{ b @5 I S @ n I z @ m
+{b@5IS@n@st	{ b @5 I S @ n @ s t
+{b@5IS@n@sts	{ b @5 I S @ n @ s t s
+@bAm@n@b@5	@ b A m @ n @ b @5
+@bAm@neIS@n	@ b A m @ n eI S @ n
+@bAm@neIS@nz	@ b A m @ n eI S @ n z
+@bud	@ b u d
+@budi	@ b u d i
+@bor\	@ b or\
+{b3IdZ@n@5	{ b 3 I dZ @ n @5
+{b3IdZ@ni	{ b 3 I dZ @ n i
+{b3IdZ@niz	{ b 3 I dZ @ n i z
+@bor\n	@ b or\ n
+@bor\t	@ b or\ t
+@bor\tId	@ b or\ t I d
+@bor\t@feIS@nt	@ b or\ t @ f eI S @ n t
+@bor\t@feIS@nts	@ b or\ t @ f eI S @ n t s
+@bor\tIN	@ b or\ t I N
+@bor\S@n	@ b or\ S @ n
+@bor\S@nIst	@ b or\ S @ n I s t
+@bor\S@nIsts	@ b or\ S @ n I s t s
+@bor\S@nz	@ b or\ S @ n z
+@bor\tIv	@ b or\ t I v
+@bor\ts	@ b or\ t s
+@bAt	@ b A t
+@bu	@ b u
+Abud	A b u d
+AbuhA5im@	A b u h A 5 i m @
+AbuhA5im@z	A b u h A 5 i m @ z
+@b{Und	@ b {U n d
+@b{UndId	@ b {U n d I d
+@b{UndIN	@ b {U n d I N
+@b{Undz	@ b {U n d z
+@b{Ut	@ b {U t
+@b{Uts	@ b {U t s
+@bVv	@ b V v
+@bVvz	@ b V v z
+@bVvbor\d	@ b V v b or\ d
+{bp5@n{5p	{ b p 5 @ n { 5 p
+Abr\@	A b r\ @
+{br\@k@d{br\@	{ b r\ @ k @ d { b r\ @
+eIbr\@hE@m	eI b r\ @ h E@ m
+{br\@heImi@n	{ b r\ @ h eI m i @ n
+eIbr\@hE@mz	eI b r\ @ h E@ m z
+{br\@hE@ms@n	{ b r\ @ h E@ m s @ n
+@br\{h@ms@n	@ b r\ { h @ m s @ n
+@br\E@m	@ b r\ E@ m
+eIbr\@mz	eI b r\ @ m z
+Abr\@mtSIk	A b r\ @ m tS I k
+Abr\AmOU	A b r\ A m OU
+@br\Am@vIts	@ b r\ A m @ v I t s
+@br\Am@vItS	@ b r\ A m @ v I tS
+@br\Am@wIts	@ b r\ A m @ w I t s
+eIbr\@mz	eI b r\ @ m z
+eIbr\@mzIz	eI b r\ @ m z I z
+eIbr\@ms@n	eI b r\ @ m s @ n
+@br\eIZ@n	@ b r\ eI Z @ n
+@br\eIZ@nz	@ b r\ eI Z @ n z
+@br\eIsIv	@ b r\ eI s I v
+@br\eIsIvz	@ b r\ eI s I v z
+@br\{ks@	@ b r\ { k s @
+@br\{ks@z	@ b r\ { k s @ z
+@br\{ks@z	@ b r\ { k s @ z
+@br\Est	@ b r\ E s t
+Abr\EgOU	A b r\ E g OU
+@br\u	@ b r\ u
+@br\IdZ	@ b r\ I dZ
+@br\IdZd	@ b r\ I dZ d
+@br\IdZm@nt	@ b r\ I dZ m @ n t
+@br\IdZ@z	@ b r\ I dZ @ z
+@br\IdZIN	@ b r\ I dZ I N
+@br\I5	@ b r\ I 5
+@br\Od	@ b r\ O d
+{br\@geIt	{ b r\ @ g eI t
+{br\@geItId	{ b r\ @ g eI t I d
+{br\@geItIN	{ b r\ @ g eI t I N
+{br\@geIS@n	{ b r\ @ g eI S @ n
+@br\OU5	@ b r\ OU 5
+@br\An	@ b r\ A n
+@br\Vpt	@ b r\ V p t
+@br\Vpt5i	@ b r\ V p t 5 i
+@br\Vptn@s	@ b r\ V p t n @ s
+eIbr\utIn	eI b r\ u t I n
+Abr\utseIzi	A b r\ u t s eI z i
+Abr\uzOU	A b r\ u z OU
+eIbiEs	eI b i E s
+{bs@5@m	{ b s @5 @ m
+{bsAr\@k@	{ b s Ar\ @ k @
+{bskE@m	{ b s k E@ m
+{bsEs	{ b s E s
+{bskAnd	{ b s k A n d
+{bskAnd@d	{ b s k A n d @ d
+{bskAndIN	{ b s k A n d I N
+{bskAndz	{ b s k A n d z
+{bs@kOn	{ b s @ k O n
+{bs@ns	{ b s @ n s
+{bs@nsIz	{ b s @ n s I z
+{bs@nt	{ b s @ n t
+{bs@nti	{ b s @ n t i
+{bs@ntiIz@m	{ b s @ n t i I z @ m
+{bs@ntiz	{ b s @ n t i z
+{bsEnS@	{ b s E n S @
+{bS3	{ b S 3
+{bSi3	{ b S i 3
+{bSaIr\	{ b S aI r\
+{bsInT	{ b s I n T
+{bsOU	{ b s OU
+{bs@5@m	{ b s @5 @ m
+{bs@5ut	{ b s @5 u t
+{bs@5uts	{ b s @5 u t s
+{bs@5ut	{ b s @5 u t
+{bs@5ut5i	{ b s @5 u t 5 i
+{bs@5utn@s	{ b s @5 u t n @ s
+{bs@5uts	{ b s @5 u t s
+{bs@5uS@n	{ b s @5 u S @ n
+{bs@5utIz@m	{ b s @5 u t I z @ m
+{bs@5utIst	{ b s @5 u t I s t
+@bzA5v	@ b z A 5 v
+@bzA5vd	@ b z A 5 v d
+@bzA5vz	@ b z A 5 v z
+@bzA5vIN	@ b z A 5 v I N
+@bzor\b	@ b z or\ b
+@bzor\bd	@ b z or\ b d
+@bzor\b@nsi	@ b z or\ b @ n s i
+@bzor\b@nt	@ b z or\ b @ n t
+@bzor\b3	@ b z or\ b 3
+@bzor\b3z	@ b z or\ b 3 z
+@bzor\bIN	@ b z or\ b I N
+@bzor\bz	@ b z or\ b z
+@bzor\pS@n	@ b z or\ p S @ n
+@bsteIn	@ b s t eI n
+@bsteInd	@ b s t eI n d
+@bsteInIN	@ b s t eI n I N
+@bstEntS@n	@ b s t E n tS @ n
+@bstEntS@nz	@ b s t E n tS @ n z
+{bst@n@ns	{ b s t @ n @ n s
+{bst@n@nt	{ b s t @ n @ n t
+{bst@n	{ b s t @ n
+{bstr\{kt	{ b s t r\ { k t
+{bstr\{ktId	{ b s t r\ { k t I d
+{bstr\{kS@n	{ b s t r\ { k S @ n
+{bstr\{kS@nz	{ b s t r\ { k S @ n z
+{bstr\{kts	{ b s t r\ { k t s
+@bstr\us	@ b s t r\ u s
+@bs3d	@ b s 3 d
+@bs3dIst	@ b s 3 d I s t
+@bs3d@tiz	@ b s 3 d @ t i z
+@bs3d@ti	@ b s 3 d @ t i
+@bs3d5i	@ b s 3 d 5 i
+{bt	{ b t
+{bts	{ b t s
+{bu	{ b u
+@bVdr\@m	@ b V d r\ @ m
+@bwE5@	@ b w E 5 @
+@bwE5@z	@ b w E 5 @ z
+{bju5{dzi	{ b j u 5 { d z i
+@bVnd@ns	@ b V n d @ n s
+@bVnd@nt	@ b V n d @ n t
+@bVnd@nt5i	@ b V n d @ n t 5 i
+@bUr\tOU	@ b Ur\ t OU
+@bUr\tOUz	@ b Ur\ t OU z
+@bjus	@ b j u s
+@bjuzd	@ b j u z d
\ No newline at end of file
diff --git a/tests/data/lab/xsampa.lab b/tests/data/lab/xsampa.lab
new file mode 100644
index 00000000..f73fe41b
--- /dev/null
+++ b/tests/data/lab/xsampa.lab
@@ -0,0 +1 @@
+@bUr\tOU {bstr\{kt {bSaIr\ Abr\utseIzi {br\@geItIN @bor\n {b3kr\Ambi {bI5s@`n Ar\g thr\Ip@5eI Ar\dvAr\k
\ No newline at end of file
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
index b9741d02..3155d0fa 100644
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -403,4 +403,18 @@ def test_alternate_punctuation(punctuated_dir, temp_dir, sick_dict_path, differe
     c.initialize_corpus(dictionary)
     print(c.text_mapping['punctuated'])
     assert c.text_mapping['punctuated'] == 'oh yes, they they, you know, they love her and so i mean'
+    dictionary.cleanup_logger()
+
+def test_xsampa_corpus(xsampa_corpus_dir, xsampa_dict_path, temp_dir, generated_dir, different_punctuation_config):
+    _, align_config = train_yaml_to_config(different_punctuation_config)  # only the align config is used below
+    out_dir = os.path.join(temp_dir, 'xsampa_corpus')
+    shutil.rmtree(out_dir, ignore_errors=True)  # drop leftovers from an earlier run
+    print(align_config.punctuation)
+    dictionary = Dictionary(xsampa_dict_path, out_dir, punctuation=align_config.punctuation)
+    dictionary.write()
+    corpus = AlignableCorpus(xsampa_corpus_dir, out_dir, use_mp=False, punctuation=align_config.punctuation)
+    print(corpus.punctuation)
+    corpus.initialize_corpus(dictionary)
+    print(corpus.text_mapping['michael-xsampa'])
+    assert corpus.text_mapping['michael-xsampa'] == r'@bUr\tOU {bstr\{kt {bSaIr\ Abr\utseIzi {br\@geItIN @bor\n {b3kr\Ambi {bI5s@`n Ar\g thr\Ip@5eI Ar\dvAr\k'.lower()  # expected text is the .lab transcript, lower-cased
     dictionary.cleanup_logger()
\ No newline at end of file
diff --git a/tests/test_dict.py b/tests/test_dict.py
index ab54a5a9..41314753 100644
--- a/tests/test_dict.py
+++ b/tests/test_dict.py
@@ -113,6 +113,16 @@ def test_multilingual_ipa():
     assert parse_ipa(input_transcription) == expected
 
 
+def test_xsampa_dir(xsampa_dict_path, generated_dir):  # loads the X-SAMPA dictionary and checks words containing '\', '{', '`' and '@'
+    d = Dictionary(xsampa_dict_path, os.path.join(generated_dir, 'xsampa'))
+    d.write()
+
+    print(d.words)
+    assert not d.clitic_set  # the clitic set is expected to be empty for this dictionary
+    assert d.split_clitics(r'r\{und') == [r'r\{und']  # raw strings: '\{' is an invalid escape sequence in a normal literal
+    assert d.split_clitics('{bI5s@`n') == ['{bI5s@`n']  # words are expected to come back as a single unsplit item
+    assert d.words[r'r\{und']
+
 def test_multispeaker_config(multispeaker_dictionary_config, generated_dir):
     dictionary = MultispeakerDictionary(multispeaker_dictionary_config, os.path.join(generated_dir, 'multispeaker'))
     dictionary.write()