[src] Compile cudadecoder binaries on Windows (kaldi-asr#4506)
* Make BatchedThreadedNnet3CudaOnlinePipeline::CorrelationID
  public (fallout from kaldi-asr#4490).
* Replace Win32-specific Sleep with portable C++ std::chrono
  classes in src/base/kaldi-utils.*. Change the `Sleep(float)` argument
  type to `double`, since that is the type these classes take and
  return when converting to/from time in seconds (a minimal sketch of
  the idea follows this list).
* Use C++ constants for timers instead of sesquipedalian macros.
* Move includes under `HAVE_CUDA` conditionals in files whose code
  still makes sense without CUDA.
* Issue an #error when attempting to compile CUDA-dependent code
  with HAVE_CUDA not set (see the note on this guard after the list).
* Add, reword, and reformat some commentary and help strings in
  cudadecoderbin/batched-wav-nnet3-cuda-online.cc.
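A minimal sketch of the std::chrono-based sleep described above, assuming
only standard C++11 facilities (the name SleepSeconds is illustrative; the
committed function is kaldi::Sleep in base/kaldi-utils, shown in the diff
below):

  #include <chrono>
  #include <thread>

  // Sleep for a possibly fractional number of seconds.
  // std::chrono::duration<double> counts seconds as a double, and
  // std::this_thread::sleep_for accepts any chrono duration type.
  void SleepSeconds(double seconds) {
    std::this_thread::sleep_for(std::chrono::duration<double>(seconds));
  }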

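One note on the #error guard mentioned above: in a preprocessor #if, an
identifier that is not defined as a macro evaluates to 0, so a guard of
this shape fires both when HAVE_CUDA is undefined and when it is defined
as 0. The pattern, as added by this commit at the top of CUDA-only
sources:

  #if !HAVE_CUDA
  #error CUDA support must be configured to compile this library.
  #endif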
Accidental touch-paint fixes:
* Replace usleep() with kaldi::Sleep in util/kaldi-table-test.cc.
* Fix the invalid escape sequence '\%' where a plain '%' was intended
  in strings in src/nnet3/nnet-utils.cc.
kkm000 committed May 5, 2021
1 parent e28927f commit 5de039a
Showing 26 changed files with 357 additions and 354 deletions.
36 changes: 13 additions & 23 deletions src/base/kaldi-utils.cc
@@ -16,40 +16,30 @@
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifdef _WIN32_WINNT_WIN8
#include <Synchapi.h>
#elif defined(_WIN32) || defined(_MSC_VER) || defined(MINGW)
#include <Windows.h>
#if defined(_MSC_VER) && _MSC_VER < 1900
#define snprintf _snprintf
#endif /* _MSC_VER < 1900 */
#else
#include <unistd.h>
#endif
#include "base/kaldi-utils.h"

#include <string>
#include "base/kaldi-common.h"
#include <chrono>
#include <cstdio>
#include <thread>


namespace kaldi {

std::string CharToString(const char &c) {
char buf[20];
if (std::isprint(c))
snprintf(buf, sizeof(buf), "\'%c\'", c);
std::snprintf(buf, sizeof(buf), "\'%c\'", c);
else
snprintf(buf, sizeof(buf), "[character %d]", static_cast<int>(c));
return (std::string) buf;
std::snprintf(buf, sizeof(buf), "[character %d]", static_cast<int>(c));
return buf;
}

void Sleep(float seconds) {
#if defined(_MSC_VER) || defined(MINGW)
::Sleep(static_cast<int>(seconds * 1000.0));
#elif defined(__CYGWIN__)
sleep(static_cast<int>(seconds));
#else
usleep(static_cast<int>(seconds * 1000000.0));
#endif
void Sleep(double sec) {
// duration_cast<> rounds down, add 0.5 to compensate.
auto dur_nanos = std::chrono::duration<double, std::nano>(sec * 1E9 + 0.5);
auto dur_syshires = std::chrono::duration_cast<
typename std::chrono::high_resolution_clock::duration>(dur_nanos);
std::this_thread::sleep_for(dur_syshires);
}

} // end namespace kaldi
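An aside on the "duration_cast<> rounds down" comment in the new Sleep()
above: converting a floating-point duration to an integral one with
std::chrono::duration_cast truncates toward zero, so adding 0.5 ns before
the cast gives round-to-nearest. A small standalone check of that
behavior, not part of the commit:

  #include <cassert>
  #include <chrono>

  int main() {
    using namespace std::chrono;
    // Plain duration_cast truncates: 2.9 ns becomes 2 ns.
    auto truncated = duration_cast<nanoseconds>(duration<double, std::nano>(2.9));
    assert(truncated.count() == 2);
    // Adding 0.5 ns first yields the nearest integer: 3 ns.
    auto rounded = duration_cast<nanoseconds>(duration<double, std::nano>(2.9 + 0.5));
    assert(rounded.count() == 3);
    return 0;
  }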
14 changes: 9 additions & 5 deletions src/base/kaldi-utils.h
@@ -21,10 +21,14 @@
#ifndef KALDI_BASE_KALDI_UTILS_H_
#define KALDI_BASE_KALDI_UTILS_H_ 1

#if defined(_MSC_VER)
# define WIN32_LEAN_AND_MEAN
# define NOMINMAX
# include <windows.h>
#if _MSC_VER
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN 1
#endif
#ifndef NOMINMAX
#define NOMINMAX 1
#endif
#include <windows.h>
#endif

#ifdef _MSC_VER
@@ -88,7 +92,7 @@ inline int MachineIsLittleEndian() {
// This function kaldi::Sleep() provides a portable way
// to sleep for a possibly fractional
// number of seconds. On Windows it's only accurate to microseconds.
void Sleep(float seconds);
void Sleep(double seconds);
}

#define KALDI_SWAP8(a) do { \
39 changes: 23 additions & 16 deletions src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
@@ -15,14 +15,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if HAVE_CUDA == 1

#define KALDI_CUDA_DECODER_WAIT_FOR_CALLBACKS_US 10000
#define KALDI_CUDA_DECODER_WAIT_FOR_CPU_FEATURES_THREADS_US 1000
#define KALDI_CUDA_DECODER_WAIT_FOR_AVAILABLE_CHANNEL_US 1000
#if !HAVE_CUDA
#error CUDA support must be configured to compile this library.
#endif

#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"

#include <mutex>
#include <numeric>
#include <tuple>

#include <nvToolsExt.h>

#include "feat/feature-window.h"
@@ -31,6 +33,11 @@

namespace kaldi {
namespace cuda_decoder {

const double kSleepForCallBack = 10e-3;
const double kSleepForCpuFeatures = 1e-3;
const double kSleepForChannelAvailable = 1e-3;

void BatchedThreadedNnet3CudaOnlinePipeline::Initialize(
const fst::Fst<fst::StdArc> &decode_fst) {
ReadParametersFromModel();
@@ -115,7 +122,7 @@ void BatchedThreadedNnet3CudaOnlinePipeline::SetBestPathCallback(
}

bool BatchedThreadedNnet3CudaOnlinePipeline::TryInitCorrID(
CorrelationID corr_id, int wait_for) {
CorrelationID corr_id, int32 wait_for_us) {
bool inserted;
decltype(corr_id2channel_.end()) it;
std::tie(it, inserted) = corr_id2channel_.insert({corr_id, -1});
@@ -127,10 +134,10 @@ bool BatchedThreadedNnet3CudaOnlinePipeline::TryInitCorrID(
if (!channel_available) {
// We cannot use that corr_id
int waited_for = 0;
while (waited_for < wait_for) {
while (waited_for < wait_for_us) {
lk.unlock();
usleep(KALDI_CUDA_DECODER_WAIT_FOR_AVAILABLE_CHANNEL_US);
waited_for += KALDI_CUDA_DECODER_WAIT_FOR_AVAILABLE_CHANNEL_US;
Sleep(kSleepForChannelAvailable);
waited_for += int32(kSleepForChannelAvailable * 1e6);
lk.lock();
channel_available = (available_channels_.size() > 0);
if (channel_available) break;
@@ -162,7 +169,7 @@ bool BatchedThreadedNnet3CudaOnlinePipeline::TryInitCorrID(

channel_frame_offset_[ichannel] = 0;
return true;
} // namespace cuda_decoder
}

void BatchedThreadedNnet3CudaOnlinePipeline::CompactWavesToMatrix(
const std::vector<SubVector<BaseFloat>> &wave_samples) {
@@ -217,7 +224,7 @@ void BatchedThreadedNnet3CudaOnlinePipeline::ComputeCPUFeatureExtraction(
}

while (n_compute_features_not_done_.load(std::memory_order_acquire))
usleep(KALDI_CUDA_DECODER_WAIT_FOR_CPU_FEATURES_THREADS_US);
Sleep(kSleepForCpuFeatures);

KALDI_ASSERT(d_all_features_.NumRows() == h_all_features_.NumRows() &&
d_all_features_.NumCols() == h_all_features_.NumCols());
@@ -402,7 +409,7 @@ void BatchedThreadedNnet3CudaOnlinePipeline::ComputeOneFeature(int element) {
n_input_frames_valid_[element] = nframes;

n_compute_features_not_done_.fetch_sub(1, std::memory_order_release);
} // namespace cuda_decoder
}

void BatchedThreadedNnet3CudaOnlinePipeline::RunCallbacksAndFinalize(
const std::vector<CorrelationID> &corr_ids,
@@ -486,7 +493,7 @@ void BatchedThreadedNnet3CudaOnlinePipeline::RunCallbacksAndFinalize(
{&BatchedThreadedNnet3CudaOnlinePipeline::FinalizeDecodingWrapper, this,
ichannel, static_cast<void *>(lattice_callback)});
}
} // namespace cuda_decoder
}

void BatchedThreadedNnet3CudaOnlinePipeline::ListIChannelsInBatch(
const std::vector<CorrelationID> &corr_ids, std::vector<int> *channels) {
@@ -606,9 +613,9 @@ void BatchedThreadedNnet3CudaOnlinePipeline::FinalizeDecoding(

void BatchedThreadedNnet3CudaOnlinePipeline::WaitForLatticeCallbacks() {
while (n_lattice_callbacks_not_done_.load() != 0)
usleep(KALDI_CUDA_DECODER_WAIT_FOR_CALLBACKS_US);
Sleep(kSleepForCallBack);
}


} // namespace cuda_decoder
} // namespace kaldi

#endif // HAVE_CUDA
8 changes: 4 additions & 4 deletions src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
@@ -15,10 +15,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if HAVE_CUDA == 1
#ifndef KALDI_CUDADECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_
#define KALDI_CUDADECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_

#ifndef KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_
#define KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_
#if HAVE_CUDA

#define KALDI_CUDA_DECODER_MIN_NCHANNELS_FACTOR 2

@@ -427,5 +427,5 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
} // end namespace cuda_decoder
} // end namespace kaldi.

#endif // KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_
#endif // HAVE_CUDA
#endif // KALDI_CUDADECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_
7 changes: 4 additions & 3 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
@@ -15,9 +15,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if !HAVE_CUDA
#error CUDA support must be configured to compile this library.
#endif

#define SLEEP_BACKOFF_NS 500
#define SLEEP_BACKOFF_S ((double)SLEEP_BACKOFF_NS / 1e9)
#if HAVE_CUDA == 1

#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h"
#include <nvToolsExt.h>
@@ -965,5 +968,3 @@ void BatchedThreadedNnet3CudaPipeline::ExecuteWorker(int threadId) {

} // end namespace cuda_decoder
} // end namespace kaldi

#endif // HAVE_CUDA == 1
4 changes: 2 additions & 2 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
@@ -48,7 +48,7 @@ namespace cuda_decoder {
*/
// configuration options common to the BatchedThreadedNnet3CudaPipeline and
// BatchedThreadedNnet3CudaPipeline
struct BatchedThreadedNnet3CudaPipelineConfig {
struct [[deprecated]] BatchedThreadedNnet3CudaPipelineConfig {
BatchedThreadedNnet3CudaPipelineConfig()
: max_batch_size(200),
num_channels(-1),
@@ -136,7 +136,7 @@ struct BatchedThreadedNnet3CudaPipelineConfig {
* decoding. For examples of how to use this decoder see cudadecoder/README and
* cudadecoderbin/batched-wav-nnet3-cuda.cc
*/
class BatchedThreadedNnet3CudaPipeline {
class [[deprecated]] BatchedThreadedNnet3CudaPipeline {
public:
BatchedThreadedNnet3CudaPipeline(
const BatchedThreadedNnet3CudaPipelineConfig &config)
25 changes: 13 additions & 12 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc
@@ -15,19 +15,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <atomic>
#if HAVE_CUDA == 1
#if !HAVE_CUDA
#error CUDA support is required to compile this library.
#endif

#define KALDI_CUDA_DECODER_WAIT_FOR_TASKS_US 10000
#define KALDI_CUDA_DECODER_WAIT_FOR_NEW_TASKS_US 100
#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h"

#include <nvToolsExt.h>
#include <atomic>

#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h"
#include <nvToolsExt.h>

namespace kaldi {
namespace cuda_decoder {

const float kSleepForTaskComplete = 10e-3;
const float kSleepForNewTask = 100e-6;

BatchedThreadedNnet3CudaPipeline2::BatchedThreadedNnet3CudaPipeline2(
const BatchedThreadedNnet3CudaPipeline2Config &config,
const fst::Fst<fst::StdArc> &decode_fst, const nnet3::AmNnetSimple &am_nnet,
@@ -147,7 +150,7 @@ void BatchedThreadedNnet3CudaPipeline2::BuildBatchFromCurrentTasks() {

void BatchedThreadedNnet3CudaPipeline2::WaitForAllTasks() {
while (n_tasks_not_done_.load() != 0) {
usleep(KALDI_CUDA_DECODER_WAIT_FOR_TASKS_US);
Sleep(kSleepForTaskComplete);
}
}

@@ -180,7 +183,7 @@ void BatchedThreadedNnet3CudaPipeline2::WaitForGroup(const std::string &group) {
}

while (n_not_done->load(std::memory_order_consume) != 0)
usleep(KALDI_CUDA_DECODER_WAIT_FOR_TASKS_US);
Sleep(kSleepForTaskComplete);
}

void BatchedThreadedNnet3CudaPipeline2::CreateTask(
@@ -446,7 +449,7 @@ void BatchedThreadedNnet3CudaPipeline2::ComputeTasks() {
if (current_tasks_.size() < max_batch_size_) AcquireTasks();
if (current_tasks_.empty()) {
// If we still have nothing to do, let's sleep a bit
usleep(KALDI_CUDA_DECODER_WAIT_FOR_NEW_TASKS_US);
Sleep(kSleepForNewTask);
continue;
}
BuildBatchFromCurrentTasks();
@@ -475,6 +478,4 @@ void BatchedThreadedNnet3CudaPipeline2::SetLatticePostprocessor(
}

} // namespace cuda_decoder
} // end namespace kaldi.

#endif // HAVE_CUDA
} // namespace kaldi
6 changes: 3 additions & 3 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h
@@ -15,11 +15,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if HAVE_CUDA == 1

#ifndef KALDI_CUDA_DECODER_BATCHED_THREADED_NNET3_CUDA_PIPELINE2_H_
#define KALDI_CUDA_DECODER_BATCHED_THREADED_NNET3_CUDA_PIPELINE2_H_

#if HAVE_CUDA

#include <atomic>
#include <thread>

@@ -241,5 +241,5 @@ class BatchedThreadedNnet3CudaPipeline2 {
} // end namespace cuda_decoder
} // namespace kaldi

#endif // KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_DECODER_H_
#endif // HAVE_CUDA
#endif // KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_DECODER_H_
28 changes: 16 additions & 12 deletions src/cudadecoder/cuda-decoder.cc
@@ -15,22 +15,28 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <sstream>
#include "online2/online-endpoint.h"
#if HAVE_CUDA == 1
#if !HAVE_CUDA
#error CUDA support must be configured to compile this library.
#endif

#include "cuda-decoder-kernels.h"
#include "cuda-decoder.h"
#include "cudadecoder/cuda-decoder.h"

#include <cuda_runtime_api.h>
#include <nvToolsExt.h>
#include <util/text-utils.h>
#include <algorithm>
#include <map>
#include <sstream>
#include <tuple>

#include <cuda_runtime_api.h>
#include <nvToolsExt.h>

#include "cudadecoder/cuda-decoder-kernels.h"
#include "online2/online-endpoint.h"
#include "util/text-utils.h"


namespace kaldi {
namespace cuda_decoder {

CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
int32 nlanes, int32 nchannels)
: word_syms_(NULL),
@@ -901,13 +907,13 @@ void CudaDecoder::WaitForPartialHypotheses() {
if (!generate_partial_hypotheses_) return;
while (n_partial_traceback_threads_not_done_.load(std::memory_order_acquire) >
0)
usleep(200);
Sleep(200e-6);
}

void CudaDecoder::CheckOverflow() {
for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) {
LaneCounters *lane_counters = h_lanes_counters_.lane(ilane);
bool q_overflow = lane_counters->q_overflow;
int32_t q_overflow = lane_counters->q_overflow;
if (q_overflow != OVERFLOW_NONE) {
// An overflow was prevented in a kernel
// The algorithm can still go on but quality of the
@@ -2072,5 +2078,3 @@ void CudaDecoder::SetThreadPoolAndStartCPUWorkers(ThreadPoolLight *thread_pool,

} // namespace cuda_decoder
} // namespace kaldi

#endif // HAVE_CUDA == 1