[src,egs,scripts] Merging RNNLM-related changes which were in wrong branch #2092

Merged Dec 21, 2017 (123 commits)

Changes from 1 commit

Commits (123)
b51a4fc
Adding some sampling code
danpovey Jun 17, 2017
aeffb8b
[src] Adding some RNNLM-related sampling utilities.
danpovey Jun 17, 2017
0bf3bc6
[build] Modify Makefiles w.r.t. RNNLM stuff
danpovey Jun 17, 2017
87bc825
[src] Fix sampler-test.cc (fixing test failure)
danpovey Jun 17, 2017
d3d1351
[build] link cusparse lib and handle it with CuDevice (#1699)
kangshiyin Jun 20, 2017
9e1f742
[src] Add arpa-reading code for RNNLM (#1701)
keli78 Jun 26, 2017
de4faf9
[scripts] RNNLM data-preparation (#1707) (#1717)
wantee Jul 3, 2017
2210e63
make </s> case-sensitive in rnnlm (#1738)
wantee Jul 5, 2017
59521f4
[rnnlm,scripts] add a --unigram-scale option to rnnlm/choose_features…
keli78 Jul 5, 2017
3c07383
[src] Some drafts of RNNLM-related code.
danpovey Jul 3, 2017
4fe8118
Fix small formatting issues.
danpovey Jul 3, 2017
63c50a7
[src] Adding and refactoring RNNLM related code
danpovey Jul 7, 2017
dc3449c
[scripts] add some documentations for rnnlm scripts (#1743)
wantee Jul 7, 2017
570d97f
add test code for ArpaSampling; fix errors in arpa-sampling.cc and rn…
Jul 10, 2017
614dd45
Merge pull request #1753 from keli78/arpa-testing
danpovey Jul 10, 2017
06eee20
[src] remove unused declarations
danpovey Jul 15, 2017
96daad1
[egs] add RNNLM data preparation script for PTB (#1771)
keli78 Jul 20, 2017
f4b9d93
[src] some partial work towards RNNLM training.
danpovey Jul 23, 2017
e6920de
Merge remote-tracking branch 'upstream/master' into rnnlm
danpovey Jul 23, 2017
f2f43ba
[src] Fixing various compilation errors, finishing more RNNLM trainin…
danpovey Jul 23, 2017
ee461f7
Merge branch 'rnnlm' of https://github.com/kaldi-asr/kaldi into rnnlm
danpovey Jul 23, 2017
fdf87a0
[src] add default args for arpa sampling test (#1768)
keli78 Jul 23, 2017
9b94c4d
[src] Adding more declarations of needed sparse-matrix functions
danpovey Jul 24, 2017
5be79bb
[src] Add new CuMatrix::AddToRows() overload; test; minor fixes (#1775)
hhadian Jul 25, 2017
b66e23a
[src] Some fixes re CuArray + Add CuMatrixBase::AddToElements + test …
hhadian Jul 26, 2017
0b7b691
[src] change to table-reading code to make Value() non-const.
danpovey Jul 25, 2017
ab68664
[src] Further progress on RNNLM
danpovey Jul 26, 2017
e2434c2
[src] Further progress on RNNLM code
danpovey Jul 27, 2017
15553de
[src] Add rnnlm::ReadSparseWordFeatures (#1778)
hhadian Jul 27, 2017
4d171cb
[src] Add CuVectorBase::CopyElements() and VecMatVec() + tests (#1780)
hhadian Jul 29, 2017
4b1bbef
[src] Further progress on RNNLM code
danpovey Jul 29, 2017
75f3972
[src] Minor fix to test code (#1781)
hhadian Jul 30, 2017
e89d031
[src] Change CuSparseMatrix to use CSR storage format; implement more…
kangshiyin Jul 30, 2017
a65fa45
Merge remote-tracking branch 'upstream/master' into rnnlm
danpovey Aug 1, 2017
5bf3b3e
[src] Implement CuMatrixBase::AddSmat() (#1782)
kangshiyin Aug 1, 2017
f1b0298
[src] Further progress on rnnlm
danpovey Aug 1, 2017
72a8dc2
[src] Implementing some missing functions for RNNLM training
danpovey Aug 1, 2017
6d9af27
[src] Add CuMatrixBase::AddMatSmat() and unit test (#1789)
kangshiyin Aug 2, 2017
e30ea06
[src] Further progress on RNNLM code
danpovey Aug 3, 2017
bd31845
[src] clarify documentation
danpovey Aug 3, 2017
c2dd613
[src] Add CuMatrix::AddSmatMat() and unit test (#1791)
kangshiyin Aug 3, 2017
6f91119
[src] Add RnnlmEmbeddingTrainer::PrintStats() (#1792)
hhadian Aug 3, 2017
62c9273
[src] Various fixes and more progress for RNNLM
danpovey Aug 5, 2017
79a7701
[src] Fix compilation errors in test code
danpovey Aug 5, 2017
c03503b
[src] Add RnnlmExample::Read,Write + some functions in rnnlm-test-uti…
hhadian Aug 5, 2017
0cbec10
[src] Add more testing code for RNNLM
danpovey Aug 5, 2017
386eb7f
[src] Use AddSmat in GeneralMatrix (#1798)
kangshiyin Aug 6, 2017
e3fbfa0
[src] RNNLM-related script changes; code fixes
danpovey Aug 7, 2017
cd821f9
[src] fix to compile error
danpovey Aug 7, 2017
d93d0c1
[src] CUDA kernel for ApplyExpSpecial (#1801)
kangshiyin Aug 7, 2017
d661a1e
[src] Add a simple implementation for EstimateAndWriteLanguageModel (…
hhadian Aug 7, 2017
cd77241
[src,scripts,egs] Further RNNLM progress
danpovey Aug 8, 2017
ec578d3
[src] Various code fixes
danpovey Aug 8, 2017
8099228
[src] Various fixes to problems encountered while debugging RNNLM code
danpovey Aug 10, 2017
560b3db
[src] fix options-related bug in rnnlm-train.cc
danpovey Aug 10, 2017
7e67d70
[src] Various RNNLM-related fixes; add mutex for memory management code.
danpovey Aug 11, 2017
dd5125d
Merge remote-tracking branch 'upstream/master' into rnnlm
danpovey Aug 11, 2017
57b6de6
[scripts] Add 'constant' feature for RNNLM word representation, to ha…
danpovey Aug 11, 2017
079955f
[scripts] updates to RNNLM feature validation
danpovey Aug 12, 2017
27851a2
[src] various RNNLM-related fixes and optimizations.
danpovey Aug 12, 2017
d56c28b
[src] simple interpolated Kneser-Ney LM for testing purposes (#1812)
wantee Aug 12, 2017
1a4ca00
[scripts] add scales for rnnlm features (#1816)
wantee Aug 14, 2017
42c41d1
[scripts] add show_word_features.py (#1815)
wantee Aug 14, 2017
4a0c54b
[scripts,egs] remove the setting of PYTHONIOENCODING in prepare_rnnlm…
wantee Aug 14, 2017
377b786
[src] Bug-fix and test-code changes in cudamatrix
danpovey Aug 14, 2017
99113da
[src] Bug-fix and improvements to stability for RNNLM code
danpovey Aug 14, 2017
b65c12c
[scripts,egs] various bug-fixes.
danpovey Aug 14, 2017
daa12ec
[src] fix to compile error in test code
danpovey Aug 14, 2017
8a1144f
[src,build] fix Makefile; make some sampling code faster.
danpovey Aug 14, 2017
abc1c85
[src,scripts,egs] rnnlm: initialize_matrix, translate python to perl …
sas91 Aug 16, 2017
3c618c5
[src] Add inbuilt tool to estimate LM optimized for RNNLM importance …
danpovey Aug 18, 2017
0ce198d
[src,scripts,egs,build] Enable RNNLM lattice rescoring with Tensorflo…
hainan-xv Aug 11, 2017
cc0b0c4
[scripts] Documentation fix in xconfig scripts
danpovey Aug 12, 2017
8541a21
[scripts] Fix to script usage message (thanks: @yzmyyff)
danpovey Aug 14, 2017
63750ba
[build] fix compilation problem of tfrnnlm and tfrnnlmbin (#1822)
hainan-xv Aug 15, 2017
c927fc7
[scripts,src] Check that symbol '#0' is not in the vocab of the ARPA …
xiaohui-zhang Aug 15, 2017
5f9e4d9
[src] Inconsequential bug-fixes to problems found when compiling with…
danpovey Aug 15, 2017
3bd7fea
[src] Bug-fixes to backoff model for sampling
danpovey Aug 19, 2017
f6f16ab
[src] enable multi-threading for sampling for RNNLM training
danpovey Aug 20, 2017
4286e2c
[src,egs] Enable bypass of ARPA format for RNNLM sampling-language-mo…
danpovey Aug 20, 2017
bc387e6
[src] Optimizations to sparse-matrix functions: AddMatSmat, AddSmat …
sas91 Aug 23, 2017
9fc22b1
[src,scripts] adding more scripts and binaries
danpovey Aug 24, 2017
5fd3c46
[src,egs] various fixes
danpovey Aug 24, 2017
f0476c6
[scripts] modify prepare_split_data.py to include dev data
danpovey Aug 24, 2017
4437008
[src,scripts,egs] Finishing scripts and fixing bugs in RNNLM setup
danpovey Aug 26, 2017
0e1105f
[egs] Update RNNLM results
danpovey Aug 26, 2017
a4cbaa0
[scripts,egs] Changing how unigram feature is printed and how max-fea…
danpovey Aug 27, 2017
5a0c848
Merge remote-tracking branch 'upstream/master' into rnnlm
danpovey Aug 28, 2017
ce5f369
[scripts,egs] Add example for RNNLM training for WSJ (no results yet)…
danpovey Aug 29, 2017
8c67fb9
[src,scripts] Solve speed issue with RNNLM sampling.
danpovey Aug 29, 2017
e095047
[scripts] Fixes to RNNLM training script
danpovey Aug 29, 2017
1f7872c
[egs] Add objf results for WSJ training
danpovey Aug 29, 2017
2689112
[src] Changes to Classify{R,W}filename to allow some spaces. thanks:…
danpovey Aug 31, 2017
143d256
[src] Add function to get max memory of nnet3 computation; cosmetic s…
danpovey Sep 10, 2017
041fd87
Merge remote-tracking branch 'upstream/master' into rnnlm
danpovey Sep 12, 2017
1b0a32f
[scripts] add special_symbol_opts (allows RNNLM setup to use differen…
wantee Sep 17, 2017
1841b3c
[scripts] Fix get_embedding_dim.py RE left-context and right-context …
danpovey Sep 22, 2017
d8284dc
[src,scripts,egs] Fast lattice rescoring based on pruned composition …
danpovey Sep 23, 2017
60db284
Merge remote-tracking branch 'upstream/master' into rnnlm
danpovey Sep 23, 2017
817272e
[src] Use correct error messages for errors from CuSparse (#1908)
selaselah Sep 25, 2017
0ffc8b5
[egs] Add example script for RNNLM training on Swbd (#1907)
keli78 Sep 26, 2017
0c78131
[src] Make copy constructor of NnetComputer explicit
danpovey Sep 27, 2017
d0f36ca
[src] fix bug in pruned composition (thanks: @hainan-xv)
danpovey Sep 28, 2017
27aa514
[scripts] Make sure all rnnlm scripts use encoding=utf-8 with open
danpovey Sep 29, 2017
402f531
Merge remote-tracking branch 'upstream/master' into rnnlm
danpovey Oct 14, 2017
693840b
[src] Add scales and constant values to Descriptors in nnet3 (#1884)
danpovey Oct 17, 2017
e87e303
[scripts] Fix a learning rate decay bug in rnnlm setup (#1944)
keli78 Oct 17, 2017
b51874c
[src] Removed unnecessary kLinearInParameters and kLinearInInput flag…
mmaciej2 Oct 23, 2017
764483b
Merge remote-tracking branch 'upstream/master' into rnnlm
danpovey Oct 23, 2017
629b885
Merge remote-tracking branch 'upstream/master' into rnnlm
danpovey Oct 25, 2017
b858223
[src,scripts,egs] Add l2 regularization for RNNLMs; fixes RE test-mod…
keli78 Nov 6, 2017
6127c32
[egs] Update RNNLM recipes with L2 regularization (#2005)
keli78 Nov 9, 2017
3348b3e
[src] Workaround for compiler issue (thanks: @francoishernandez)
danpovey Nov 20, 2017
652050a
[src,scripts,egs] nnet3-rnnlm lattice rescoring draft (#1906)
hainan-xv Nov 23, 2017
8537641
Merge remote-tracking branch 'upstream/master' into rnnlm
danpovey Nov 23, 2017
131cdd4
[build] Update version file, first commit of kaldi 5.3.
danpovey Nov 23, 2017
edec255
[build] Update src/doc/get_version_info.sh (for building documentation)
danpovey Nov 23, 2017
347d181
[doc] Update version-related documentation.
danpovey Nov 23, 2017
dc7bed5
[src] Some fixes to testing code
danpovey Nov 23, 2017
4ac4051
[egs] Minor change to comment
danpovey Nov 23, 2017
f2d2305
[src] Fix minor bug in rnnlm-compute-state.cc RE dimension checking (…
hainan-xv Nov 29, 2017
4a24b4b
[src] Add (faster) pruned composition for RNNLM rescoring (#2059)
hainan-xv Dec 13, 2017
a4aa18e
merge rnnlm with latest master
hainan-xv Dec 20, 2017
[src,build] fix Makefile; make some sampling code faster.
danpovey committed Aug 14, 2017
commit 8a1144f17514691290622b8d3b6329ca87d3a980
71 changes: 71 additions & 0 deletions egs/ptb/s5/local/rnnlm/train_rnnlm_sampling.sh
@@ -0,0 +1,71 @@
#!/usr/bin/bash


# this will eventually be totally refactored and moved into steps/.

dir=exp/rnnlm_data_prep
vocab=data/vocab/words.txt
embedding_dim=600

# work out the number of splits.
ns=$(rnnlm/get_num_splits.sh 200000 data/text $dir/data_weights.txt)
vocab_size=$(tail -n 1 $vocab |awk '{print $NF + 1}')

# split the data into pieces that individual jobs will train on.
# rnnlm/split_data.sh data/text $ns


rnnlm/prepare_split_data.py --vocab-file=$vocab --data-weights-file=$dir/data_weights.txt \
--num-splits=$ns data/text $dir/text

. ./path.sh

# cat >$dir/config <<EOF
# input-node name=input dim=$embedding_dim
# component name=affine1 type=NaturalGradientAffineComponent input-dim=$embedding_dim output-dim=$embedding_dim
# component-node input=input name=affine1 component=affine1
# output-node input=affine1 name=output
# EOF

mkdir -p $dir/configs
cat >$dir/configs/network.xconfig <<EOF
input dim=$embedding_dim name=input
relu-renorm-layer name=tdnn1 dim=512 input=Append(0, IfDefined(-1))
relu-renorm-layer name=tdnn2 dim=512 input=Append(0, IfDefined(-2))
relu-renorm-layer name=tdnn3 dim=512 input=Append(0, IfDefined(-2))
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF

steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/


# note: this is way too slow; we need to speed it up somehow.
# I'm not sure if I want to have a dependency on numpy just for this, though.
# Maybe we can rewrite it in perl.
rnnlm/initialize_matrix.py --num-rows=$vocab_size --num-cols=$embedding_dim \
--first-column=1.0 > $dir/embedding.0.mat

nnet3-init $dir/configs/final.config - | nnet3-copy --learning-rate=0.0001 - $dir/0.rnnlm


rnnlm-train --use-gpu=no --read-rnnlm=$dir/0.rnnlm --write-rnnlm=$dir/1.rnnlm --read-embedding=$dir/embedding.0.mat \
--write-embedding=$dir/embedding.1.mat "ark:rnnlm-get-egs --vocab-size=$vocab_size $dir/text/1.txt ark,t:- |"

# or with GPU:
rnnlm-train --rnnlm.max-param-change=0.5 --embedding.max-param-change=0.5 \
--use-gpu=yes --read-rnnlm=$dir/0.rnnlm --write-rnnlm=$dir/1.rnnlm --read-embedding=$dir/embedding.0.mat \
--write-embedding=$dir/embedding.1.mat 'ark:for n in 1 2 3 4 5 6; do cat exp/rnnlm_data_prep/text/*.txt; done | rnnlm-get-egs --vocab-size=10003 - ark,t:- |'


# just a note on the unigram entropy of PTB training set:
# awk '{for (n=1;n<=NF;n++) { count[$n]++; } count["</s>"]++; } END{ tot_count=0; tot_entropy=0.0; for(k in count) tot_count += count[k]; for (k in count) { p = count[k]*1.0/tot_count; tot_entropy += p*log(p); } print "entropy is " -tot_entropy; }' <data/text/ptb.txt
# 6.52933

# .. and entropy of bigrams:
# awk '{hist="<s>"; for (n=1;n<=NF;n++) { count[hist,$n]++; hist=$n; } count[hist,"</s>"]++; } END{ tot_count=0; tot_entropy=0.0; for(k in count) tot_count += count[k]; for (k in count) { p = count[k]*1.0/tot_count; tot_entropy += p*log(p); } print "entropy is " -tot_entropy; }' <data/text/ptb.txt
# 10.7482
# In information-theory terms, H(X) = H(Y) = 6.5293 and H(X,Y) = 10.7482, so H(Y | X) = 10.7482 - 6.5293 = ***4.2189***,
# which is the entropy of the next symbol given the preceding symbol. This bounds the expected training
# objective achievable with just a single word of context.
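A quick cross-check of the arithmetic in the comment above: awk's log() is the natural logarithm, so these entropies are in nats, and the conditional entropy follows from the chain rule,

    H(Y | X) = H(X, Y) - H(X) = 10.7482 - 6.5293 ≈ 4.219 nats per word,

which is the quantity the comment quotes as the bound on the training objective.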
6 changes: 3 additions & 3 deletions src/Makefile
@@ -150,9 +150,9 @@ $(EXT_SUBDIRS) : mklibdir ext_depend
# this is necessary for correct parallel compilation
#1)The tools depend on all the libraries

bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \
bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin: \
base matrix util feat tree gmm transform sgmm2 fstext hmm \
lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2
lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm

#2)The libraries have inter-dependencies
base: base/.depend.mk
@@ -172,7 +172,7 @@ cudamatrix: base util matrix
nnet: base util hmm tree matrix cudamatrix
nnet2: base util matrix lat gmm hmm tree transform cudamatrix
nnet3: base util matrix lat gmm hmm tree transform cudamatrix chain fstext
rnnlm: base util matrix cudamatrix nnet3
rnnlm: base util matrix cudamatrix nnet3 lm hmm
chain: lat hmm tree fstext matrix cudamatrix util base
ivector: base util matrix transform tree gmm
#3)Dependencies for optional parts of Kaldi
41 changes: 37 additions & 4 deletions src/rnnlm/sampler.cc
@@ -300,6 +300,27 @@ void Sampler::SampleWords(
SampleFromIntervals(intervals, sample);
}



// This hacked version of std::priority_queue allows us to extract all elements
// of the priority queue into a supplied vector in an efficient way.  It relies
// on the fact that std::priority_queue stores the underlying container as a
// protected member 'c'.  The only way to do this using the public interface
// of std::priority_queue is to repeatedly pop() elements from the queue, but
// that is too slow, and it actually had an impact on the speed of the
// application.
template <typename T>
class hacked_priority_queue: public std::priority_queue<T> {
public:
void append_all_elements(std::vector<T> *output) const {
output->insert(output->end(), this->c.begin(), this->c.end());
}
// we have to redeclare the constructor.
template <typename InputIter> hacked_priority_queue(
InputIter begin, const InputIter end): std::priority_queue<T>(begin, end) { }
};


// static
void Sampler::NormalizeIntervals(int32 num_words_to_sample,
double total_p,
@@ -324,7 +345,7 @@ void Sampler::NormalizeIntervals(int32 num_words_to_sample,
// current_alpha = (num_words_to_sample - num_ones) / total_remaining_p.
// As we update 'num_ones' and 'total_remaining_p', we will continue
// to update current_alpha, and it will keep getting larger.
std::priority_queue<Interval> queue(intervals->begin(), intervals->end());
hacked_priority_queue<Interval> queue(intervals->begin(), intervals->end());

// clear 'intervals'; we'll use the space to store the intervals that will
// have a prob of exactly 1.0, and eventually we'll add the rest.
@@ -376,15 +397,27 @@ }
}
}
}
// it's not that efficient to use the top() function of the queue to remove
// elements, but there doesn't seem to be an efficient way to get
// all the elements at once without nasty hacks. Hopefully this won't dominate.
#if 0
// The following code is a bit slow but has the advantage of not assuming
// anything about the internals of class std::priority_queue.
while (!queue.empty()) {
Interval top = queue.top();
top.prob *= current_alpha;
queue.pop();
intervals->push_back(top);
}
#else
{ // This code is faster but relies on the fact that priority_queue
// has a protected member 'c' which is the underlying container.
size_t cur_size = intervals->size();
queue.append_all_elements(intervals);
// the next loop scales the 'prob' members of the elements we just
// added to 'intervals', by current_alpha.
std::vector<Interval>::iterator iter = intervals->begin() + cur_size,
end = intervals->end();
for (; iter != end; ++iter) iter->prob *= current_alpha;
}
#endif

if (GetVerboseLevel() >= 2) {
double tot_prob = 0.0;
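For readers skimming the diff above, here is a minimal, self-contained sketch (not part of the commit) of the trick that NormalizeIntervals() now uses: draining a std::priority_queue in one pass through its protected member 'c' instead of via repeated pop() calls. It uses plain ints rather than the Interval struct from sampler.h, and all names in it are illustrative only.

#include <cassert>
#include <iostream>
#include <queue>
#include <vector>

// Exposes the protected underlying container 'c' of std::priority_queue so that
// the remaining elements can be copied out in one O(n) pass.  Note that the
// copied-out elements come in heap order, not sorted order.
template <typename T>
class hacked_priority_queue : public std::priority_queue<T> {
 public:
  void append_all_elements(std::vector<T> *output) const {
    output->insert(output->end(), this->c.begin(), this->c.end());
  }
  // Re-declare the range constructor; constructors are not inherited by default.
  template <typename InputIter>
  hacked_priority_queue(InputIter begin, InputIter end)
      : std::priority_queue<T>(begin, end) { }
};

int main() {
  std::vector<int> v = {5, 1, 9, 3, 7};
  hacked_priority_queue<int> queue(v.begin(), v.end());

  std::vector<int> out;
  // Pop the largest elements one at a time, as NormalizeIntervals() does while
  // its scaling condition still holds...
  out.push_back(queue.top());  // 9
  queue.pop();
  out.push_back(queue.top());  // 7
  queue.pop();

  // ...then drain whatever remains in a single bulk copy (heap order, which is
  // fine when every remaining element just gets scaled by the same factor).
  queue.append_all_elements(&out);

  assert(out.size() == v.size());
  for (int x : out) std::cout << x << ' ';
  std::cout << std::endl;
  return 0;
}

The bulk copy is what makes the #else branch above cheaper than the code under #if 0: repeatedly calling pop() costs O(n log n) because the heap is restored after every removal, while appending the container 'c' directly is a single O(n) copy. That is sufficient here, since the remaining elements only need their 'prob' members scaled by current_alpha, not sorted order.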