Fix two bugs in batched-wav-nnet3-cuda binary.

1) Using the "Any" APIs before a group has been fully submitted could cause that group to be reported as finished early. This would cause output to appear repeatedly. I have not seen this occur, but an audit revealed it as an issue. The fix is to use the "Any" APIs only once full groups have been submitted. 2) GetNumberOfTasksPending() can return zero even though some groups have not yet been waited for. This API call should not be used to determine whether all groups have completed, because the number of pending tasks is independent of the number of groups remaining.
kaldi-asr · danpovey · Sep 19, 2019 · Sep 18, 2019 · Sep 18, 2019 · 45a33e19d90f0b0d2fefc90a0ec983337818d316
commit 45a33e19d90f0b0d2fefc90a0ec983337818d316
diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda.cc
@@ -203,6 +203,8 @@ int main(int argc, char *argv[]) {
 
     nvtxRangePush("Global Timer");
 
+    int num_groups_done=0;
+
     // starting timer here so we
     // can measure throughput
     // without allocation
@@ -271,27 +273,29 @@ int main(int argc, char *argv[]) {
         cuda_pipeline.OpenDecodeHandle(key, wave_data, task_group,
                                        finish_one_decode_lamba);
         num_task_submitted++;
-        std::string group_done;
-        // Non-blocking way to check if a group is done
-        // returns false if zero groups are ready
-        if (cuda_pipeline.IsAnyGroupCompleted(&group_done)) {
-          cuda_pipeline.CloseAllDecodeHandlesForGroup(group_done);
-          double total_time = timer.Elapsed();
-          int32 iter = std::atoi(group_done.c_str());
-          KALDI_LOG << "~Group " << group_done << " completed"
-                    << " Aggregate Total Time: " << total_time
-                    << " Audio: " << total_audio * (iter + 1)
-                    << " RealTimeX: " << total_audio * (iter + 1) / total_time;
-        }
 
         nvtxRangePop();
         if (num_todo != -1 && num_task_submitted >= num_todo) break;
       }  // end utterance loop
+
+      std::string group_done;
+      // Non-blocking way to check if a group is done
+      // returns false if zero groups are ready
+      while (cuda_pipeline.IsAnyGroupCompleted(&group_done)) {
+        cuda_pipeline.CloseAllDecodeHandlesForGroup(group_done);
+        double total_time = timer.Elapsed();
+        int32 iter = std::atoi(group_done.c_str());
+        KALDI_LOG << "~Group " << group_done << " completed"
+                  << " Aggregate Total Time: " << total_time
+                  << " Audio: " << total_audio * (iter + 1)
+                  << " RealTimeX: " << total_audio * (iter + 1) / total_time;
+        num_groups_done++;
+      }
     }    // end iterations loop
 
     // We've submitted all tasks. Now waiting for them to complete
     // We could also have called WaitForAllTasks and CloseAllDecodeHandles
-    while (cuda_pipeline.GetNumberOfTasksPending()) {
+    while (num_groups_done<iterations) {
       // WaitForAnyGroup is blocking. It will hold until one group is ready
       std::string group_done = cuda_pipeline.WaitForAnyGroup();
       cuda_pipeline.CloseAllDecodeHandlesForGroup(group_done);
@@ -301,6 +305,7 @@ int main(int argc, char *argv[]) {
                 << " Aggregate Total Time: " << total_time
                 << " Audio: " << total_audio * (iter + 1)
                 << " RealTimeX: " << total_audio * (iter + 1) / total_time;
+      num_groups_done++;
     }
 
     // number of seconds elapsed since the creation of timer