Skip to content

Commit

Permalink
Updates to debugging, sanitizing punctuation
Browse files Browse the repository at this point in the history
  • Loading branch information
mmcauliffe committed May 19, 2021
1 parent ad42d57 commit 0005a3f
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 23 deletions.
15 changes: 14 additions & 1 deletion montreal_forced_aligner/aligner/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ..multiprocessing import compile_information
from ..config import TEMP_DIR

from ..helper import log_kaldi_errors
from ..helper import log_kaldi_errors, load_scp
from ..exceptions import KaldiProcessingError


Expand Down Expand Up @@ -86,6 +86,19 @@ def compile_information(self, model_directory, output_directory):
self.logger.warning('There were {} segments/files not aligned. Please see {} for more details on why '
'alignment failed for these files.'.format(len(issues), issue_path))

log_like = 0
tot_frames = 0
for j in range(self.corpus.num_jobs):
score_path = os.path.join(model_directory, 'ali.{}.scores'.format(j))
scores = load_scp(score_path, data_type=float)
for k, v in scores.items():
log_like += v
tot_frames += self.corpus.utterance_lengths[k]
if tot_frames:
self.logger.debug('Average per frame likelihood (this might not actually mean anything): {}'.format(log_like/tot_frames))
else:
self.logger.debug('No files were aligned, this likely indicates serious problems with the aligner.')

def export_textgrids(self, output_directory):
"""
Export a TextGrid file for every sound file in the dataset
Expand Down
23 changes: 21 additions & 2 deletions montreal_forced_aligner/config/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,30 @@ def update_from_args(self, args):
if not isinstance(original_value, (bool, int, float, str)):
continue
try:
val = type(original_value)(args[i+1])
except (IndexError, ValueError):
if isinstance(original_value, bool):
if args[i+1].lower() == 'true':
val = True
elif args[i+1].lower() == 'false':
val = False
elif not original_value:
val = True
else:
continue
else:
val = type(original_value)(args[i+1])
except (ValueError):
continue
except (IndexError):
if isinstance(original_value, bool):
if not original_value:
val = True
else:
continue
else:
continue
setattr(self, name, val)


def save_config(config, path):
    """
    Write a configuration's parameters to a YAML file

    Parameters
    ----------
    config
        Object exposing a ``params()`` method; its return value is what gets dumped
    path
        Destination file path, opened for writing as UTF-8 (existing content is overwritten)
    """
    with open(path, 'w', encoding='utf8') as yaml_file:
        # params() is evaluated after the file has been opened, as in the original ordering
        parameters = config.params()
        yaml.dump(parameters, yaml_file)
15 changes: 4 additions & 11 deletions montreal_forced_aligner/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,10 @@ def sanitize(item):
if item[0] == b[0] and item[-1] == b[1]:
return item
# Clitic markers are "-" and "'"
sanitized = re.sub(r"^[^-\w']+", '', item)
if sanitized.strip() == "":
return sanitized
punct_re = '[、。।,@<>"(),.:;¿?¡!\\\]'
old_len = len(sanitized)
sanitized = sanitized[:-1] + re.sub(punct_re.replace('-', '\-'), '', sanitized[-1])
while len(sanitized)!=old_len:
old_len = len(sanitized)
sanitized = sanitized[:-1] + re.sub(punct_re.replace('-', '\-'), '', sanitized[-1])
if sanitized.strip() == "":
break
punctuation_set = r'[、。।,@<>"(),.:;¿?¡!\\&%#*~【】,…‥「」『』〝〟″⟨⟩♪・‹›«»~′$+=]'
sanitized = re.sub(r'^{}+'.format(punctuation_set), '', item)
sanitized = re.sub(r'{}+$'.format(punctuation_set), '', sanitized)

return sanitized


Expand Down
24 changes: 19 additions & 5 deletions montreal_forced_aligner/multiprocessing/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,14 +240,16 @@ def mono_align_equal(mono_directory, split_directory, num_jobs, config):
run_non_mp(mono_align_equal_func, jobs, config.log_directory)


def align_func(directory, iteration, job_name, mdl, config, feature_string, output_directory):
def align_func(directory, iteration, job_name, mdl, config, feature_string, output_directory, debug=False):
fst_path = os.path.join(directory, 'fsts.{}'.format(job_name))
log_path = os.path.join(output_directory, 'log', 'align.{}.{}.log'.format(iteration, job_name))
ali_path = os.path.join(output_directory, 'ali.{}'.format(job_name))
score_path = os.path.join(output_directory, 'ali.{}.scores'.format(job_name))
loglike_path = os.path.join(output_directory, 'ali.{}.loglikes'.format(job_name))
with open(log_path, 'w', encoding='utf8') as log_file:
align_proc = subprocess.Popen([thirdparty_binary('gmm-align-compiled'),
log_file.write('DEBUG: {}'.format(debug))
if debug:
loglike_path = os.path.join(output_directory, 'ali.{}.loglikes'.format(job_name))
com = [thirdparty_binary('gmm-align-compiled'),
'--transition-scale={}'.format(config['transition_scale']),
'--acoustic-scale={}'.format(config['acoustic_scale']),
'--self-loop-scale={}'.format(config['self_loop_scale']),
Expand All @@ -257,12 +259,24 @@ def align_func(directory, iteration, job_name, mdl, config, feature_string, outp
'--write-per-frame-acoustic-loglikes=ark,t:{}'.format(loglike_path),
mdl,
"ark:" + fst_path, '{}'.format(feature_string), "ark,t:" + ali_path,
"ark,t:" + score_path],
"ark,t:" + score_path]
else:
com = [thirdparty_binary('gmm-align-compiled'),
'--transition-scale={}'.format(config['transition_scale']),
'--acoustic-scale={}'.format(config['acoustic_scale']),
'--self-loop-scale={}'.format(config['self_loop_scale']),
'--beam={}'.format(config['beam']),
'--retry-beam={}'.format(config['retry_beam']),
'--careful=false',
mdl,
"ark:" + fst_path, '{}'.format(feature_string), "ark,t:" + ali_path,
"ark,t:" + score_path]
align_proc = subprocess.Popen(com,
stderr=log_file)
align_proc.communicate()


def align(iteration, directory, split_directory, optional_silence, num_jobs, config, output_directory=None):
def align(iteration, directory, split_directory, optional_silence, num_jobs, config, output_directory=None, debug=False):
"""
Multiprocessing function that aligns based on the current model
Expand Down
8 changes: 6 additions & 2 deletions montreal_forced_aligner/trainers/monophone.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,17 +76,21 @@ def init_training(self, identifier, temporary_directory, corpus, dictionary, pre
try:
feat_dim = corpus.get_feat_dim(self.feature_config)
feature_string = self.feature_config.construct_feature_proc_string(self.data_directory, self.train_directory, 0)
feature_string += " subset-feats --n=10 ark:- ark:-| "
#feature_string += " subset-feats --n=10 ark:- ark:-| "
shared_phones_opt = "--shared-phones=" + os.path.join(dictionary.phones_dir, 'sets.int')
log_path = os.path.join(self.log_directory, 'init.log')
temp_feats_path = os.path.join(self.train_directory, 'temp_feats')
with open(log_path, 'w') as log_file:
subprocess.call([thirdparty_binary('subset-feats'), '--n=10',
feature_string, 'ark:'+temp_feats_path], stderr=log_file)
subprocess.call([thirdparty_binary('gmm-init-mono'), shared_phones_opt,
"--train-feats="+feature_string,
"--train-feats=ark:"+temp_feats_path,
os.path.join(dictionary.output_directory, 'topo'),
str(feat_dim),
mdl_path,
tree_path],
stderr=log_file)
os.remove(temp_feats_path)
num_gauss = self.get_num_gauss()
self.initial_gaussians = num_gauss
compile_train_graphs(self.train_directory, dictionary.output_directory,
Expand Down
14 changes: 13 additions & 1 deletion tests/test_commandline_align.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import pytest

from montreal_forced_aligner.command_line.align import run_align_corpus
from montreal_forced_aligner.command_line.align import run_align_corpus, load_basic_align
from montreal_forced_aligner.command_line.mfa import parser

from montreal_forced_aligner.exceptions import PronunciationAcousticMismatchError
Expand All @@ -19,6 +19,18 @@ def assert_export_exist(old_directory, new_directory):
assert (os.path.exists(os.path.join(new_root, new_f)))


def test_align_arguments(basic_corpus_dir, sick_dict_path, generated_dir, large_dataset_dictionary, temp_dir,
                         english_acoustic_model):
    """
    Check that a trailing ``--disable_sat`` flag flips ``disable_sat`` on the align config

    The flag is expected to come back from ``parser.parse_known_args`` as an unknown
    argument, which is then handed to ``update_from_args`` on a freshly loaded basic
    align configuration.
    """
    output_directory = os.path.join(generated_dir, 'basic_output')
    command = ['align', basic_corpus_dir, large_dataset_dictionary, 'english', output_directory]
    command += ['-t', temp_dir, '-q', '--clean', '-d', '--disable_sat']
    _, unknown_args = parser.parse_known_args(command)

    align_config = load_basic_align()
    # Before applying any overrides the flag must be off
    assert not align_config.disable_sat
    if unknown_args:
        align_config.update_from_args(unknown_args)
    # If nothing was left over as unknown, this assertion fails, which is the intent
    assert align_config.disable_sat

#@pytest.mark.skip(reason='Optimization')
def test_align_basic(basic_corpus_dir, sick_dict_path, generated_dir, large_dataset_dictionary, temp_dir,
basic_align_config, english_acoustic_model):
Expand Down
13 changes: 12 additions & 1 deletion tests/test_dict.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import pytest

from montreal_forced_aligner.dictionary import Dictionary
from montreal_forced_aligner.dictionary import Dictionary, sanitize


def ListLines(path):
Expand Down Expand Up @@ -50,3 +50,14 @@ def test_frclitics(frclitics_dict_path, generated_dir):
assert d.split_clitics('m\'appele') == ['m\'', 'appele']
assert d.split_clitics('m\'ving-sic') == ["m'", 'ving', 'sic']
assert d.split_clitics('flying\'purple-people-eater') == ['flying\'purple-people-eater']


def test_devanagari():
    """
    Devanagari words must pass through ``sanitize`` unchanged
    """
    for word in ("हैं", "हूं", "हौं"):
        assert word == sanitize(word)

def test_japanese():
    """
    Japanese corner brackets surrounding a word must be removed by ``sanitize``
    """
    bracketed_to_bare = (
        ("「かぎ括弧」", "かぎ括弧"),
        ("『二重かぎ括弧』", "二重かぎ括弧"),
    )
    for bracketed, bare in bracketed_to_bare:
        assert bare == sanitize(bracketed)

0 comments on commit 0005a3f

Please sign in to comment.