Skip to content

Commit

Permalink
[src,scripts] adding more scripts and binaries
Browse files Browse the repository at this point in the history
  • Loading branch information
danpovey committed Aug 26, 2017
1 parent bc387e6 commit 9fc22b1
Show file tree
Hide file tree
Showing 19 changed files with 635 additions and 238 deletions.
4 changes: 2 additions & 2 deletions egs/ptb/s5/local/rnnlm/prepare_rnnlm_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ cp data/ptb/dev.txt data/text/


# validate data dir
rnnlm/validate_data_dir.py data/text
rnnlm/validate_text_dir.py data/text

# get unigram counts
rnnlm/get_unigram_counts.sh data/text
rnnlm/ensure_counts_present.sh data/text

# get vocab
mkdir -p data/vocab
Expand Down
28 changes: 28 additions & 0 deletions scripts/rnnlm/ensure_counts_present.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

# This script makes sure that a <text-dir>, as validated by validate_text_dir.py,
# has unigram counts present (*.counts).
#
# For every <text-dir>/X.txt it (re)creates <text-dir>/X.counts when that file
# is missing or older than X.txt.  Each line of a counts file has the form
# "<word> <count>"; one end-of-sentence symbol </s> is counted per input line.


if [ $# != 1 ]; then
  echo "Usage: $0 <text-dir>"
  echo "Makes sure unigram counts (*.counts) are present in <text-dir>,"
  echo "and if not, sets them up."
  exit 1;
fi


dir=$1

# Let the shell expand the glob directly (instead of parsing the output of
# 'ls') so that paths containing whitespace or glob characters are handled
# as single words.
for f in "$dir"/*.txt; do
  # When nothing matches, the pattern is left unexpanded; skip it.
  [ -f "$f" ] || continue

  # foo.txt -> foo.counts
  counts_file="${f%.txt}.counts"

  # Regenerate only if the counts file is absent or stale w.r.t. the text.
  if [ ! -f "$counts_file" ] || [ "$counts_file" -ot "$f" ]; then
    echo "$0: generating counts file for $f"
    # Feed the text via a redirect rather than as an awk operand, so a file
    # name containing '=' is not mistaken for an awk variable assignment.
    awk '{for(i = 1; i <= NF; i++) {print $i;} print "</s>"}' < "$f" | \
      sort | uniq -c | awk '{print $2,$1}' > "$counts_file"
  fi
done



75 changes: 75 additions & 0 deletions scripts/rnnlm/get_embedding_dim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python3

# Copyright 2017 Johns Hopkins University (author: Daniel Povey)
# License: Apache 2.0.

import os
import argparse
import subprocess
import sys
import re


def parse_embedding_dims(info_lines):
    """Extract the input and output dimensions from nnet3-info output.

    We're looking for lines like:
      input-node name=input dim=600
      output-node name=output input=output.affine dim=600

    Args:
        info_lines: iterable of text (str) lines printed by nnet3-info.

    Returns:
        A tuple (input_dim, output_dim).  Either element is -1 if no
        matching line was found; if several lines match, the last one wins.
    """
    input_dim = -1
    output_dim = -1
    for line in info_lines:
        # The patterns only capture \d+, so int() cannot fail here.
        m = re.match(r'input-node name=input dim=(\d+)', line)
        if m is not None:
            input_dim = int(m.group(1))

        m = re.match(r'output-node name=output .* dim=(\d+)', line)
        if m is not None:
            output_dim = int(m.group(1))
    return input_dim, output_dim


def main():
    """Command-line entry point: prints the embedding dimension to stdout.

    Exits with an error message (non-zero status) if the neural net file does
    not exist, if nnet3-info cannot be run or fails, if either dimension
    cannot be found, or if the input and output dimensions differ.
    """
    parser = argparse.ArgumentParser(
        description="This script works out the embedding dimension from a "
        "nnet3 neural network (e.g. 0.raw). It does this by invoking "
        "nnet3-info to print information about the neural network, and "
        "parsing it. You should make sure nnet3-info is on your path "
        "before you call this script. It is an error if the input and "
        "output dimensions of the neural network are not the same. This "
        "script prints the embedding dimension to the standard output.",
        epilog="E.g. " + sys.argv[0] + " 0.raw",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("nnet",
                        help="Path for raw neural net (e.g. 0.raw)")

    args = parser.parse_args()

    if not os.path.exists(args.nnet):
        sys.exit(sys.argv[0] + ": input neural net '{0}' does not exist.".format(args.nnet))

    try:
        # universal_newlines=True makes stdout a text stream; without it the
        # pipe yields bytes under Python 3 and the str regexes raise TypeError.
        proc = subprocess.Popen(["nnet3-info", args.nnet],
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
    except OSError as e:
        # E.g. nnet3-info is not on the PATH.
        sys.exit(sys.argv[0] + ": error running command 'nnet3-info {0}': {1}".format(
            args.nnet, e))
    out_text, _ = proc.communicate()
    if proc.returncode != 0:
        sys.exit(sys.argv[0] + ": error running command 'nnet3-info {0}'".format(args.nnet))

    input_dim, output_dim = parse_embedding_dims(out_text.splitlines())

    if input_dim == -1:
        sys.exit(sys.argv[0] + ": could not get input dim from output "
                 "of 'nnet3-info {0}'".format(args.nnet))

    if output_dim == -1:
        sys.exit(sys.argv[0] + ": could not get output dim from output "
                 "of 'nnet3-info {0}'".format(args.nnet))

    if input_dim != output_dim:
        sys.exit(sys.argv[0] + ": input and output dims differ for "
                 "nnet '{0}': {1} != {2}".format(
                     args.nnet, input_dim, output_dim))

    print(str(input_dim))


if __name__ == "__main__":
    main()

53 changes: 40 additions & 13 deletions scripts/rnnlm/get_num_splits.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,29 +13,37 @@ if [ $# != 3 ]; then
(
echo "Usage: rnnlm/get_num_splits.sh <target-words-per-split> <data-dir> <weights-file>"
echo "e.g.: rnnlm/get_num_splits.sh 200000 data/text exp/rnnlm/data_weights.txt"
echo "This works out how many pieces to split a data directory into; it"
echo "echoes a number such that the average words per split does not exceed"
echo "<target-words-per-split>. It works out the number of words of training data from"
echo "<data-dir>/*.counts; they are scaled by the data-multiplicities given as"
echo "the second field of <weights-file> for each data source."
echo "This works out how many pieces to split a data directory into, and"
echo "(if just one piece) how many times that piece should be repeated to"
echo "get the target words-per-split. A number is printed to the standard"
echo "output. If no repeats are necessary it will be the number of splits,"
echo "a positive number. If repeats are necessary, then a negative number,"
echo "interpretable as the negative of the number of times we should repeat"
echo "the data, is echoed, and the number of splits should be taken to be 1."
echo "To compute the number of words of training data"
echo "this script uses <data-dir>/*.counts; they are scaled by the data-multiplicities"
echo "given as the second field of <weights-file> for each data source."
) 1>&2
exit 1
fi


words_per_split=$1
data=$2
text=$2
weights_file=$3

! [ $words_per_split -eq $words_per_split ] && \
echo "$0: first arg must be an integer" 1>&2 && exit 1;

[ ! -d $data ] && \
echo "$0: no such directory $data" 1>&2 && exit 1;
[ ! -d $text ] && \
echo "$0: no such directory $text" 1>&2 && exit 1;

# Make sure the weights file (third argument) exists.  This must test
# $weights_file: an undefined, unquoted variable here would expand to nothing
# and turn the test into '[ ! -f ]', which is always false.
[ ! -f "$weights_file" ] && \
  echo "$0: expected weights file in $weights_file" 1>&2 && exit 1;

rnnlm/ensure_counts_present.sh $text


set -e -o pipefail -u

export LC_ALL=C
Expand All @@ -55,8 +63,8 @@ tot_orig=0
tot_with_multiplicities=0


for f in $data/*.counts; do
if [ "$f" != "$data/dev.counts" ]; then
for f in $text/*.counts; do
if [ "$f" != "$text/dev.counts" ]; then
this_tot=$(cat $f | awk '{tot += $2} END{print tot}')
if ! [ $this_tot -gt 0 ]; then
echo "$0: there were no counts in counts file $f" 1>&2
Expand All @@ -75,11 +83,11 @@ for f in $data/*.counts; do
done

if ! [ $tot_orig -gt 0 ]; then
echo "$0: there was a problem getting counts from directory $data (no counts present?)" 1>&2
echo "$0: there was a problem getting counts from directory $text (no counts present?)" 1>&2
exit 1
fi
if ! [ $tot_with_multiplicities -gt 0 ]; then
echo "$0: there was a problem getting counts from directory $data (check data-weights file $weights_file)" 1>&2
echo "$0: there was a problem getting counts from directory $text (check data-weights file $weights_file)" 1>&2
exit 1
fi

Expand All @@ -95,7 +103,26 @@ if ! [ $num_splits -gt 0 ]; then
fi


# Integer ratio of the target words-per-split to the actual words-per-split;
# a value above 1 is reported below as the number of repeats.
num_repeats=$((words_per_split / actual_words_per_split))
if ! [ "$num_repeats" -ge 1 ]; then
  echo "$0: error computing the number of repeats, got $num_repeats." 1>&2
  exit 1
fi

# Having both values above 1 is treated as a script error.
if [ "$num_repeats" -gt 1 ] && [ "$num_splits" -gt 1 ]; then
  echo "$0: script error: both num-repeats and num-splits are over 1." 1>&2
  exit 1
fi

echo -n "get_num_splits.sh: based on tot-words=$tot_orig (with multiplicities: $tot_with_multiplicities)" 1>&2
echo " and target-words-per-split=$words_per_split, got $num_splits splits, actual words-per-split is $actual_words_per_split" 1>&2
if [ $num_repeats -gt 1 ]; then
echo " ... and num-repeats is $num_repeats" 1>&2
fi

echo $num_splits # this is the only thing that goes to the standard output.

if [ $num_repeats -eq 1 ]; then
echo $num_splits
else
echo -$num_repeats
fi
31 changes: 0 additions & 31 deletions scripts/rnnlm/get_unigram_counts.sh

This file was deleted.

60 changes: 0 additions & 60 deletions scripts/rnnlm/internal/distribute_lines.pl

This file was deleted.

Loading

0 comments on commit 9fc22b1

Please sign in to comment.