[src] Compile cudadecoder binaries on Windows (kaldi-asr#4506)
* Make BatchedThreadedNnet3CudaOnlinePipeline::CorrelationID
  public (fallout from kaldi-asr#4490).
* Replace Win32-specific Sleep with portable C++ std::chrono
  classes in src/base/kaldi-utils.*. Change the `Sleep(float)` argument
  type to `double`, since that is the type these classes take and
  return when converting to/from time in seconds (a minimal sketch of
  the idea follows this list).
* Use C++ constants for timers instead of sesquipedalian macros.
* Move includes under `HAVE_CUDA` conditionals in files whose code
  still makes sense without CUDA.
* Issue an #error when attempting to compile CUDA-dependent code
  with HAVE_CUDA not set (see the note on this guard after the list).
* Add, reword, and reformat some commentary and help strings in
  cudadecoderbin/batched-wav-nnet3-cuda-online.cc.
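A minimal sketch of the std::chrono-based sleep described above, assuming
only standard C++11 facilities (the name SleepSeconds is illustrative; the
committed function is kaldi::Sleep in base/kaldi-utils, shown in the diff
below):

  #include <chrono>
  #include <thread>

  // Sleep for a possibly fractional number of seconds.
  // std::chrono::duration<double> counts seconds as a double, and
  // std::this_thread::sleep_for accepts any chrono duration type.
  void SleepSeconds(double seconds) {
    std::this_thread::sleep_for(std::chrono::duration<double>(seconds));
  }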

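One note on the #error guard mentioned above: in a preprocessor #if, an
identifier that is not defined as a macro evaluates to 0, so a guard of
this shape fires both when HAVE_CUDA is undefined and when it is defined
as 0. The pattern, as added by this commit at the top of CUDA-only
sources:

  #if !HAVE_CUDA
  #error CUDA support must be configured to compile this library.
  #endif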
Accidental touch-paint fixes:
* Replace usleep() with kaldi::Sleep in util/kaldi-table-test.cc.
* Fix the invalid escape sequence '\%' where a plain '%' was intended
  in strings in src/nnet3/nnet-utils.cc.
kkm000 committed May 5, 2021
1 parent e28927f commit 5de039a
Showing 26 changed files with 357 additions and 354 deletions.
36 changes: 13 additions & 23 deletions src/base/kaldi-utils.cc
@@ -16,40 +16,30 @@
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifdef _WIN32_WINNT_WIN8
#include <Synchapi.h>
#elif defined(_WIN32) || defined(_MSC_VER) || defined(MINGW)
#include <Windows.h>
#if defined(_MSC_VER) && _MSC_VER < 1900
#define snprintf _snprintf
#endif /* _MSC_VER < 1900 */
#else
#include <unistd.h>
#endif
#include "base/kaldi-utils.h"

#include <string>
#include "base/kaldi-common.h"
#include <chrono>
#include <cstdio>
#include <thread>


namespace kaldi {

std::string CharToString(const char &c) {
char buf[20];
if (std::isprint(c))
snprintf(buf, sizeof(buf), "\'%c\'", c);
std::snprintf(buf, sizeof(buf), "\'%c\'", c);
else
snprintf(buf, sizeof(buf), "[character %d]", static_cast<int>(c));
return (std::string) buf;
std::snprintf(buf, sizeof(buf), "[character %d]", static_cast<int>(c));
return buf;
}

void Sleep(float seconds) {
#if defined(_MSC_VER) || defined(MINGW)
::Sleep(static_cast<int>(seconds * 1000.0));
#elif defined(__CYGWIN__)
sleep(static_cast<int>(seconds));
#else
usleep(static_cast<int>(seconds * 1000000.0));
#endif
void Sleep(double sec) {
// duration_cast<> rounds down, add 0.5 to compensate.
auto dur_nanos = std::chrono::duration<double, std::nano>(sec * 1E9 + 0.5);
auto dur_syshires = std::chrono::duration_cast<
typename std::chrono::high_resolution_clock::duration>(dur_nanos);
std::this_thread::sleep_for(dur_syshires);
}

} // end namespace kaldi
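An aside on the "duration_cast<> rounds down" comment in the new Sleep()
above: converting a floating-point duration to an integral one with
std::chrono::duration_cast truncates toward zero, so adding 0.5 ns before
the cast gives round-to-nearest. A small standalone check of that
behavior, not part of the commit:

  #include <cassert>
  #include <chrono>

  int main() {
    using namespace std::chrono;
    // Plain duration_cast truncates: 2.9 ns becomes 2 ns.
    auto truncated = duration_cast<nanoseconds>(duration<double, std::nano>(2.9));
    assert(truncated.count() == 2);
    // Adding 0.5 ns first yields the nearest integer: 3 ns.
    auto rounded = duration_cast<nanoseconds>(duration<double, std::nano>(2.9 + 0.5));
    assert(rounded.count() == 3);
    return 0;
  }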
14 changes: 9 additions & 5 deletions src/base/kaldi-utils.h
@@ -21,10 +21,14 @@
#ifndef KALDI_BASE_KALDI_UTILS_H_
#define KALDI_BASE_KALDI_UTILS_H_ 1

#if defined(_MSC_VER)
# define WIN32_LEAN_AND_MEAN
# define NOMINMAX
# include <windows.h>
#if _MSC_VER
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN 1
#endif
#ifndef NOMINMAX
#define NOMINMAX 1
#endif
#include <windows.h>
#endif

#ifdef _MSC_VER
@@ -88,7 +92,7 @@ inline int MachineIsLittleEndian() {
// This function kaldi::Sleep() provides a portable way
// to sleep for a possibly fractional
// number of seconds. On Windows it's only accurate to microseconds.
void Sleep(float seconds);
void Sleep(double seconds);
}

#define KALDI_SWAP8(a) do { \
39 changes: 23 additions & 16 deletions src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
@@ -15,14 +15,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if HAVE_CUDA == 1

#define KALDI_CUDA_DECODER_WAIT_FOR_CALLBACKS_US 10000
#define KALDI_CUDA_DECODER_WAIT_FOR_CPU_FEATURES_THREADS_US 1000
#define KALDI_CUDA_DECODER_WAIT_FOR_AVAILABLE_CHANNEL_US 1000
#if !HAVE_CUDA
#error CUDA support must be configured to compile this library.
#endif

#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"

#include <mutex>
#include <numeric>
#include <tuple>

#include <nvToolsExt.h>

#include "feat/feature-window.h"
@@ -31,6 +33,11 @@

namespace kaldi {
namespace cuda_decoder {

const double kSleepForCallBack = 10e-3;
const double kSleepForCpuFeatures = 1e-3;
const double kSleepForChannelAvailable = 1e-3;

void BatchedThreadedNnet3CudaOnlinePipeline::Initialize(
const fst::Fst<fst::StdArc> &decode_fst) {
ReadParametersFromModel();
@@ -115,7 +122,7 @@ void BatchedThreadedNnet3CudaOnlinePipeline::SetBestPathCallback(
}

bool BatchedThreadedNnet3CudaOnlinePipeline::TryInitCorrID(
CorrelationID corr_id, int wait_for) {
CorrelationID corr_id, int32 wait_for_us) {
bool inserted;
decltype(corr_id2channel_.end()) it;
std::tie(it, inserted) = corr_id2channel_.insert({corr_id, -1});
@@ -127,10 +134,10 @@ bool BatchedThreadedNnet3CudaOnlinePipeline::TryInitCorrID(
if (!channel_available) {
// We cannot use that corr_id
int waited_for = 0;
while (waited_for < wait_for) {
while (waited_for < wait_for_us) {
lk.unlock();
usleep(KALDI_CUDA_DECODER_WAIT_FOR_AVAILABLE_CHANNEL_US);
waited_for += KALDI_CUDA_DECODER_WAIT_FOR_AVAILABLE_CHANNEL_US;
Sleep(kSleepForChannelAvailable);
waited_for += int32(kSleepForChannelAvailable * 1e6);
lk.lock();
channel_available = (available_channels_.size() > 0);
if (channel_available) break;
@@ -162,7 +169,7 @@ bool BatchedThreadedNnet3CudaOnlinePipeline::TryInitCorrID(

channel_frame_offset_[ichannel] = 0;
return true;
} // namespace cuda_decoder
}

void BatchedThreadedNnet3CudaOnlinePipeline::CompactWavesToMatrix(
const std::vector<SubVector<BaseFloat>> &wave_samples) {
@@ -217,7 +224,7 @@ void BatchedThreadedNnet3CudaOnlinePipeline::ComputeCPUFeatureExtraction(
}

while (n_compute_features_not_done_.load(std::memory_order_acquire))
usleep(KALDI_CUDA_DECODER_WAIT_FOR_CPU_FEATURES_THREADS_US);
Sleep(kSleepForCpuFeatures);

KALDI_ASSERT(d_all_features_.NumRows() == h_all_features_.NumRows() &&
d_all_features_.NumCols() == h_all_features_.NumCols());
@@ -402,7 +409,7 @@ void BatchedThreadedNnet3CudaOnlinePipeline::ComputeOneFeature(int element) {
n_input_frames_valid_[element] = nframes;

n_compute_features_not_done_.fetch_sub(1, std::memory_order_release);
} // namespace cuda_decoder
}

void BatchedThreadedNnet3CudaOnlinePipeline::RunCallbacksAndFinalize(
const std::vector<CorrelationID> &corr_ids,
@@ -486,7 +493,7 @@ void BatchedThreadedNnet3CudaOnlinePipeline::RunCallbacksAndFinalize(
{&BatchedThreadedNnet3CudaOnlinePipeline::FinalizeDecodingWrapper, this,
ichannel, static_cast<void *>(lattice_callback)});
}
} // namespace cuda_decoder
}

void BatchedThreadedNnet3CudaOnlinePipeline::ListIChannelsInBatch(
const std::vector<CorrelationID> &corr_ids, std::vector<int> *channels) {
@@ -606,9 +613,9 @@ void BatchedThreadedNnet3CudaOnlinePipeline::FinalizeDecoding(

void BatchedThreadedNnet3CudaOnlinePipeline::WaitForLatticeCallbacks() {
while (n_lattice_callbacks_not_done_.load() != 0)
usleep(KALDI_CUDA_DECODER_WAIT_FOR_CALLBACKS_US);
Sleep(kSleepForCallBack);
}


} // namespace cuda_decoder
} // namespace kaldi

#endif // HAVE_CUDA
8 changes: 4 additions & 4 deletions src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
@@ -15,10 +15,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if HAVE_CUDA == 1
#ifndef KALDI_CUDADECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_
#define KALDI_CUDADECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_

#ifndef KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_
#define KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_
#if HAVE_CUDA

#define KALDI_CUDA_DECODER_MIN_NCHANNELS_FACTOR 2

@@ -427,5 +427,5 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
} // end namespace cuda_decoder
} // end namespace kaldi.

#endif // KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_
#endif // HAVE_CUDA
#endif // KALDI_CUDADECODER_BATCHED_THREADED_CUDA_ONLINE_PIPELINE_H_
7 changes: 4 additions & 3 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
@@ -15,9 +15,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if !HAVE_CUDA
#error CUDA support must be configured to compile this library.
#endif

#define SLEEP_BACKOFF_NS 500
#define SLEEP_BACKOFF_S ((double)SLEEP_BACKOFF_NS / 1e9)
#if HAVE_CUDA == 1

#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h"
#include <nvToolsExt.h>
@@ -965,5 +968,3 @@ void BatchedThreadedNnet3CudaPipeline::ExecuteWorker(int threadId) {

} // end namespace cuda_decoder
} // end namespace kaldi

#endif // HAVE_CUDA == 1
4 changes: 2 additions & 2 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
@@ -48,7 +48,7 @@ namespace cuda_decoder {
*/
// configuration options common to the BatchedThreadedNnet3CudaPipeline and
// BatchedThreadedNnet3CudaPipeline
struct BatchedThreadedNnet3CudaPipelineConfig {
struct [[deprecated]] BatchedThreadedNnet3CudaPipelineConfig {
BatchedThreadedNnet3CudaPipelineConfig()
: max_batch_size(200),
num_channels(-1),
@@ -136,7 +136,7 @@ struct BatchedThreadedNnet3CudaPipelineConfig {
* decoding. For examples of how to use this decoder see cudadecoder/README and
* cudadecoderbin/batched-wav-nnet3-cuda.cc
*/
class BatchedThreadedNnet3CudaPipeline {
class [[deprecated]] BatchedThreadedNnet3CudaPipeline {
public:
BatchedThreadedNnet3CudaPipeline(
const BatchedThreadedNnet3CudaPipelineConfig &config)
25 changes: 13 additions & 12 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.cc
@@ -15,19 +15,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <atomic>
#if HAVE_CUDA == 1
#if !HAVE_CUDA
#error CUDA support is required to compile this library.
#endif

#define KALDI_CUDA_DECODER_WAIT_FOR_TASKS_US 10000
#define KALDI_CUDA_DECODER_WAIT_FOR_NEW_TASKS_US 100
#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h"

#include <nvToolsExt.h>
#include <atomic>

#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h"
#include <nvToolsExt.h>

namespace kaldi {
namespace cuda_decoder {

const float kSleepForTaskComplete = 10e-3;
const float kSleepForNewTask = 100e-6;

BatchedThreadedNnet3CudaPipeline2::BatchedThreadedNnet3CudaPipeline2(
const BatchedThreadedNnet3CudaPipeline2Config &config,
const fst::Fst<fst::StdArc> &decode_fst, const nnet3::AmNnetSimple &am_nnet,
@@ -147,7 +150,7 @@ void BatchedThreadedNnet3CudaPipeline2::BuildBatchFromCurrentTasks() {

void BatchedThreadedNnet3CudaPipeline2::WaitForAllTasks() {
while (n_tasks_not_done_.load() != 0) {
usleep(KALDI_CUDA_DECODER_WAIT_FOR_TASKS_US);
Sleep(kSleepForTaskComplete);
}
}

@@ -180,7 +183,7 @@ void BatchedThreadedNnet3CudaPipeline2::WaitForGroup(const std::string &group) {
}

while (n_not_done->load(std::memory_order_consume) != 0)
usleep(KALDI_CUDA_DECODER_WAIT_FOR_TASKS_US);
Sleep(kSleepForTaskComplete);
}

void BatchedThreadedNnet3CudaPipeline2::CreateTask(
@@ -446,7 +449,7 @@ void BatchedThreadedNnet3CudaPipeline2::ComputeTasks() {
if (current_tasks_.size() < max_batch_size_) AcquireTasks();
if (current_tasks_.empty()) {
// If we still have nothing to do, let's sleep a bit
usleep(KALDI_CUDA_DECODER_WAIT_FOR_NEW_TASKS_US);
Sleep(kSleepForNewTask);
continue;
}
BuildBatchFromCurrentTasks();
@@ -475,6 +478,4 @@ void BatchedThreadedNnet3CudaPipeline2::SetLatticePostprocessor(
}

} // namespace cuda_decoder
} // end namespace kaldi.

#endif // HAVE_CUDA
} // namespace kaldi
6 changes: 3 additions & 3 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h
@@ -15,11 +15,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#if HAVE_CUDA == 1

#ifndef KALDI_CUDA_DECODER_BATCHED_THREADED_NNET3_CUDA_PIPELINE2_H_
#define KALDI_CUDA_DECODER_BATCHED_THREADED_NNET3_CUDA_PIPELINE2_H_

#if HAVE_CUDA

#include <atomic>
#include <thread>

@@ -241,5 +241,5 @@ class BatchedThreadedNnet3CudaPipeline2 {
} // end namespace cuda_decoder
} // namespace kaldi

#endif // KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_DECODER_H_
#endif // HAVE_CUDA
#endif // KALDI_CUDA_DECODER_BATCHED_THREADED_CUDA_DECODER_H_
28 changes: 16 additions & 12 deletions src/cudadecoder/cuda-decoder.cc
@@ -15,22 +15,28 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <sstream>
#include "online2/online-endpoint.h"
#if HAVE_CUDA == 1
#if !HAVE_CUDA
#error CUDA support must be configured to compile this library.
#endif

#include "cuda-decoder-kernels.h"
#include "cuda-decoder.h"
#include "cudadecoder/cuda-decoder.h"

#include <cuda_runtime_api.h>
#include <nvToolsExt.h>
#include <util/text-utils.h>
#include <algorithm>
#include <map>
#include <sstream>
#include <tuple>

#include <cuda_runtime_api.h>
#include <nvToolsExt.h>

#include "cudadecoder/cuda-decoder-kernels.h"
#include "online2/online-endpoint.h"
#include "util/text-utils.h"


namespace kaldi {
namespace cuda_decoder {

CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
int32 nlanes, int32 nchannels)
: word_syms_(NULL),
@@ -901,13 +907,13 @@ void CudaDecoder::WaitForPartialHypotheses() {
if (!generate_partial_hypotheses_) return;
while (n_partial_traceback_threads_not_done_.load(std::memory_order_acquire) >
0)
usleep(200);
Sleep(200e-6);
}

void CudaDecoder::CheckOverflow() {
for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) {
LaneCounters *lane_counters = h_lanes_counters_.lane(ilane);
bool q_overflow = lane_counters->q_overflow;
int32_t q_overflow = lane_counters->q_overflow;
if (q_overflow != OVERFLOW_NONE) {
// An overflow was prevented in a kernel
// The algorithm can still go on but quality of the
@@ -2072,5 +2078,3 @@ void CudaDecoder::SetThreadPoolAndStartCPUWorkers(ThreadPoolLight *thread_pool,

} // namespace cuda_decoder
} // namespace kaldi

#endif // HAVE_CUDA == 1