[src,scripts] adding more scripts and binaries

hainan-xv · Aug 26, 2017 · 9fc22b1 · 9fc22b1
1 parent bc387e6
commit 9fc22b1
Show file tree

Hide file tree

Showing 19 changed files with 635 additions and 238 deletions.
diff --git a/egs/ptb/s5/local/rnnlm/prepare_rnnlm_data.sh b/egs/ptb/s5/local/rnnlm/prepare_rnnlm_data.sh
@@ -15,10 +15,10 @@ cp data/ptb/dev.txt  data/text/
 
 
 # validate data dir
-rnnlm/validate_data_dir.py data/text
+rnnlm/validate_text_dir.py data/text
 
 # get unigram counts
-rnnlm/get_unigram_counts.sh data/text
+rnnlm/ensure_counts_present.sh data/text
 
 # get vocab
 mkdir -p data/vocab

diff --git a/scripts/rnnlm/ensure_counts_present.sh b/scripts/rnnlm/ensure_counts_present.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# This script makes sure that a <text-dir>, as validated by validate_text_dir.py,
+# has unigram counts present (*.counts).  For every <text-dir>/*.txt whose counts
+# file is missing or older than the text, it writes "<word> <count>" lines,
+# adding one end-of-sentence symbol </s> per line of text.
+
+if [ $# != 1 ]; then
+  echo "Usage: $0 <text-dir>"
+  echo "Makes sure unigram counts (*.counts) are present in <text-dir>,"
+  echo "and if not, sets them up."
+  exit 1;
+fi
+
+dir=$1
+
+# Glob directly instead of parsing 'ls', and quote, so unusual names stay intact.
+for f in "$dir"/*.txt; do
+  [ -f "$f" ] || continue  # an unmatched glob is left as the literal pattern.
+  counts_file="${f%.txt}.counts"  # swap only a literal trailing ".txt".
+  if [ ! -f "$counts_file" ] || [ "$counts_file" -ot "$f" ]; then
+    # Progress goes to stderr: callers may capture stdout (e.g. get_num_splits.sh).
+    echo "$0: generating counts file for $f" 1>&2
+    awk '{for(i = 1; i <= NF; i++) {print $i;} print "</s>"}' < "$f" | \
+      sort | uniq -c | awk '{print $2,$1}' > "$counts_file"
+  fi
+done
+
diff --git a/scripts/rnnlm/get_embedding_dim.py b/scripts/rnnlm/get_embedding_dim.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+
+# Copyright  2017  Johns Hopkins University (author: Daniel Povey)
+# License: Apache 2.0.
+
+import os
+import argparse
+import subprocess
+import sys
+import re
+
+
+parser = argparse.ArgumentParser(description="This script works out the embedding dimension from a "
+                                 "nnet3 neural network (e.g. 0.raw).  It does this by invoking "
+                                 "nnet3-info to print information about the neural network, and "
+                                 "parsing it.  You should make sure nnet3-info is on your path "
+                                 "before you call this script.  It is an error if the input and "
+                                 "output dimensions of the neural network are not the same.  This "
+                                 "script prints the embedding dimension to the standard output.",
+                                 epilog="E.g. " + sys.argv[0] + " 0.raw",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+
+parser.add_argument("nnet",
+                    help="Path for raw neural net (e.g. 0.raw)")
+
+args = parser.parse_args()
+
+if not os.path.exists(args.nnet):
+    sys.exit(sys.argv[0] + ": input neural net '{0}' does not exist.".format(args.nnet))
+
+proc = subprocess.Popen(["nnet3-info", args.nnet], stdout=subprocess.PIPE)
+out_lines = proc.stdout.readlines()
+proc.communicate()
+if proc.returncode != 0:
+    sys.exit(sys.argv[0] + ": error running command 'nnet3-info {0}'".format(args.nnet))
+
+
+# we're looking for lines like:
+# input-node name=input dim=600
+# output-node name=output input=output.affine dim=600
+
+input_dim=-1
+output_dim=-1
+for line in out_lines:
+    m = re.match(r'input-node name=input dim=(\d+)', line)
+    if m is not None:
+        try:
+            input_dim = int(m.group(1))
+        except:
+            sys.exit(sys.argv[0] + ": error processing line {0}".format(line))
+
+    m = re.match(r'output-node name=output .* dim=(\d+)', line)
+    if m is not None:
+        try:
+            output_dim = int(m.group(1))
+        except:
+            sys.exit(sys.argv[0] + ": error processing line {0}".format(line))
+
+
+if input_dim == -1:
+    sys.exit(sys.argv[0] + ": could not get input dim from output "
+             "of 'nnet3-info {0}'".format(args.nnet))
+
+if output_dim == -1:
+    sys.exit(sys.argv[0] + ": could not get output dim from output "
+             "of 'nnet3-info {0}'".format(args.nnet))
+
+if input_dim != output_dim:
+    sys.exit(sys.argv[0] + ": input and output dims differ for "
+             "nnet '{0}': {1} != {2}".format(
+            args.nnet, input_dim, output_dim))
+
+print(str(input_dim))
+
diff --git a/scripts/rnnlm/get_num_splits.sh b/scripts/rnnlm/get_num_splits.sh
@@ -13,29 +13,37 @@ if [ $# != 3 ]; then
   (
     echo "Usage: rnnlm/get_num_splits.sh <target-words-per-split> <data-dir> <weights-file>"
     echo "e.g.: rnnlm/get_num_splits.sh 200000 data/text exp/rnnlm/data_weights.txt"
-    echo "This works out how many pieces to split a data directory into; it"
-    echo "echoes a number such that the average words per split does not exceed"
-    echo "<target-words-per-split>.  It works out the number of words of training data from"
-    echo "<data-dir>/*.counts; they are scaled by the data-multiplicities given as"
-    echo "the second field of <weights-file> for each data source."
+    echo "This works out how many pieces to split a data directory into, and"
+    echo "(if just one piece) how many times that piece should be repeated to"
+    echo "get the target words-per-split.  A number is printed to the standard"
+    echo "output.  If no repeats are necessary it will be the number of splits,"
+    echo "a positive number.  If repeats are necessary, then a negative number,"
+    echo "interpretable as the negative of the number of times we should repeat"
+    echo "the data, is echoed, and the number of splits should be taken to be 1."
+    echo "To compute the number of words of training data"
+    echo "this script uses <data-dir>/*.counts; they are scaled by the data-multiplicities"
+    echo "given as the second field of <weights-file> for each data source."
   ) 1>&2
   exit 1
 fi
 
 
 words_per_split=$1
-data=$2
+text=$2
 weights_file=$3
 
 ! [ $words_per_split -eq $words_per_split ] && \
   echo "$0: first arg must be an integer" 1>&2 && exit 1;
 
-[ ! -d $data ] && \
-  echo "$0: no such directory $data" 1>&2 && exit 1;
+[ ! -d $text ] && \
+  echo "$0: no such directory $text" 1>&2 && exit 1;
 
 [ ! -f "$weights_file" ] && \
   echo "$0: expected weights file in $weights_file" 1>&2 && exit 1;  # <weights-file> is the third argument.
 
+rnnlm/ensure_counts_present.sh $text
+
+
 set -e -o pipefail -u
 
 export LC_ALL=C
@@ -55,8 +63,8 @@ tot_orig=0
 tot_with_multiplicities=0
 
 
-for f in $data/*.counts; do
-  if [ "$f" != "$data/dev.counts" ]; then
+for f in $text/*.counts; do
+  if [ "$f" != "$text/dev.counts" ]; then
     this_tot=$(cat $f | awk '{tot += $2} END{print tot}')
     if ! [ $this_tot -gt 0 ]; then
       echo "$0: there were no counts in counts file $f" 1>&2
@@ -75,11 +83,11 @@ for f in $data/*.counts; do
 done
 
 if ! [ $tot_orig -gt 0 ]; then
-  echo "$0: there was a problem getting counts from directory $data (no counts present?)" 1>&2
+  echo "$0: there was a problem getting counts from directory $text (no counts present?)" 1>&2
   exit 1
 fi
 if ! [ $tot_with_multiplicities -gt 0 ]; then
-  echo "$0: there was a problem getting counts from directory $data (check data-weights file $weights_file)" 1>&2
+  echo "$0: there was a problem getting counts from directory $text (check data-weights file $weights_file)" 1>&2
   exit 1
 fi
 
@@ -95,7 +103,26 @@ if ! [ $num_splits -gt 0 ]; then
 fi
 
 
+num_repeats=$((words_per_split/actual_words_per_split))  # integer division.
+if ! [ $num_repeats -ge 1 ]; then
+  echo "$0: error computing the number of repeats, got $num_repeats." 1>&2
+  exit 1
+fi
+
+if [ $num_repeats -gt 1 ] && [ $num_splits -gt 1 ]; then  # repeats imply one split.
+  echo "$0: script error: both num-repeats and num-splits are over 1." 1>&2
+  exit 1
+fi
+
 echo -n "get_num_splits.sh: based on tot-words=$tot_orig (with multiplicities: $tot_with_multiplicities)" 1>&2
 echo " and target-words-per-split=$words_per_split, got $num_splits splits, actual words-per-split is $actual_words_per_split" 1>&2
+if [ $num_repeats -gt 1 ]; then
+  echo " ... and num-repeats is $num_repeats" 1>&2
+fi
 
-echo $num_splits  # this is the only thing that goes to the standard output.
+
+if [ $num_repeats -eq 1 ]; then
+  echo $num_splits
+else
+  echo -$num_repeats
+fi
diff --git a/scripts/rnnlm/get_unigram_counts.sh b/scripts/rnnlm/get_unigram_counts.sh
diff --git a/scripts/rnnlm/internal/distribute_lines.pl b/scripts/rnnlm/internal/distribute_lines.pl