#!/bin/bash

# Provenance: introduced as scripts/rnnlm/compute_perplexity.sh by commit
# 1a4aa52a35bdce6b1fecf6ed1686a2c939eca42b (hainan-xv, Mon, 3 Jun 2019
# 23:31:22 -0400): "[scripts] add script to compute dev PPL on kaldi-rnnlm
# (#3340)".

# This script computes perplexity of text on the specified RNNLM model.
#
# Arguments:
#   <rnn-dir>     model directory; must contain final.raw, config/words.txt,
#                 special_symbol_opts.txt and either word_embedding.final.mat
#                 or both feat_embedding.final.mat and word_feats.txt.
#   <input-text>  text to score (format described below).
# Output:
#   prints "<script>: perplexity is <value>" on stdout; diagnostics go to
#   stderr and the script exits with status 1 on any failure.

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh || exit 1

if [ $# != 2 ]; then
  echo "Usage: $0 <rnn-dir> <input-text>" >&2
  exit 1
fi

dir=$1
text_in=$2

# the format of the $text_in file is one sentence per line, without explicit
# <s> or </s> symbols, and without utterance-id's, for example:

# ====== begin file ======
# well western new york is supposed to be used to this kind of weather but
# yeah you are right
# in um anaheim california you know just
# ====== end file ======

if [ ! -r "$text_in" ]; then
  echo "$0: expected input text $text_in to exist and be readable." >&2
  exit 1
fi

# Prefer a ready-made word-embedding matrix; otherwise have the embedding
# derived on the fly from the feature embedding (the trailing "|" makes the
# string a piped input for the consuming binary).
if [ -f "$dir/word_embedding.final.mat" ]; then
  word_embedding=$dir/word_embedding.final.mat
else
  for x in feat_embedding.final.mat word_feats.txt; do
    if [ ! -f "$dir/$x" ]; then
      echo "$0: expected file $dir/$x to exist." >&2
      exit 1
    fi
  done
  # NOTE: $dir is embedded unquoted in this command string, so a model
  # directory containing whitespace is not supported on this path.
  word_embedding="rnnlm-get-word-embedding $dir/word_feats.txt $dir/feat_embedding.final.mat -|"
fi

for x in final.raw config/words.txt special_symbol_opts.txt; do
  if [ ! -f "$dir/$x" ]; then
    echo "$0: expected file $dir/$x to exist."  >&2
    exit 1
  fi
done

special_symbol_opts=$(cat "$dir/special_symbol_opts.txt")

# Make a failure of rnnlm-sentence-probs visible through the pipeline below.
set -o pipefail

# Each input sentence is mapped to integer ids and given a dummy utterance id.
# The final awk treats every field after the first of each output line as a
# log-probability: a accumulates their sum, b their count, and the perplexity
# is exp(-a / b).  $special_symbol_opts is deliberately left unquoted because
# it holds several separate command-line options.
# shellcheck disable=SC2086
ppl=$(rnnlm-sentence-probs --normalize-probs=true \
        $special_symbol_opts "$dir/final.raw" "$word_embedding" \
        <(sym2int.pl "$dir/config/words.txt" < "$text_in" | awk '{print "utt_id ", $0}') \
      | awk 'NF > 1 { for (i = 2; i <= NF; i++) a += $i; b += NF - 1 }
             END { if (b == 0) exit 1; print exp(-a / b) }') || {
  echo "$0: failed to compute perplexity of $text_in with the model in $dir" >&2
  exit 1
}

echo "$0: perplexity is $ppl"