kaldi-asr · danpovey · Aug 28, 2019 · Aug 28, 2019 · Aug 28, 2019
diff --git a/src/cudafeat/Makefile b/src/cudafeat/Makefile
@@ -8,7 +8,7 @@ ifeq ($(CUDA), true)
 TESTFILES = 
 
 ifeq ($(CUDA), true)
-  OBJFILES +=  feature-window-cuda.o feature-mfcc-cuda.o feature-online-cmvn-cuda.o \
+  OBJFILES +=  feature-window-cuda.o feature-spectral-cuda.o feature-online-cmvn-cuda.o \
 							 online-ivector-feature-cuda-kernels.o online-ivector-feature-cuda.o \
 							 online-cuda-feature-pipeline.o
 endif

diff --git a/src/cudafeat/feature-mfcc-cuda.cu → src/cudafeat/feature-spectral-cuda.cu b/src/cudafeat/feature-mfcc-cuda.cu → src/cudafeat/feature-spectral-cuda.cu
@@ -1,4 +1,4 @@
-// cudafeature/feature-mfcc-cuda.cu
+// cudafeature/feature-spectral-cuda.cu
 //
 // Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
 // Justin Luitjens
@@ -20,7 +20,7 @@
 #include <cub/cub.cuh>
 #endif
 
-#include "cudafeat/feature-mfcc-cuda.h"
+#include "cudafeat/feature-spectral-cuda.h"
 #include "cudamatrix/cu-rand.h"
 
 // Each thread block processes a unique frame
@@ -60,7 +60,8 @@ __global__ void apply_lifter_and_floor_energy(
 // Threads in the same block compute the row collaboratively.
 // This kernel must be called out of place (A_in!=A_out).
 __global__ void power_spectrum_kernel(int row_length, float *A_in, int32_t ldi,
-                                      float *A_out, int32_t ldo) {
+                                      float *A_out, int32_t ldo,
+                                      bool use_power) {
   int thread_id = threadIdx.x;
   int block_id = blockIdx.x;
   float *Ar = A_in + block_id * ldi;
@@ -73,7 +74,11 @@ __global__ void power_spectrum_kernel(int row_length, float *A_in, int32_t ldi,
 
     float2 val = reinterpret_cast<float2 *>(Ar)[idx];
     float ret = val.x * val.x + val.y * val.y;
-    Aw[idx] = ret;
+    if (use_power) {
+      Aw[idx] = ret;
+    } else {
+      Aw[idx] = sqrtf(ret);
+    }
   }
 
   // handle special case
@@ -84,17 +89,23 @@ __global__ void power_spectrum_kernel(int row_length, float *A_in, int32_t ldi,
     // internal implementation
     float im = Ar[row_length];
 
-    Aw[0] = real * real;
-    Aw[half_length] = im * im;
+    if (use_power) {
+      Aw[0] = real * real;
+      Aw[half_length] = im * im;
+    } else {
+      Aw[0] = fabs(real);
+      Aw[half_length] = fabs(im);
+    }
   }
 }
 
 // Expects to be called with 32x8 sized thread block.
+// LDB: Adding use_log flag
 __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor,
                                          int32 *offsets, int32 *sizes,
                                          float **vecs, const float *feats,
-                                         int32_t ldf, float *mels,
-                                         int32_t ldm) {
+                                         int32_t ldf, float *mels, int32_t ldm,
+                                         bool use_log) {
   // Specialize WarpReduce for type float
   typedef cub::WarpReduce<float> WarpReduce;
   // Allocate WarpReduce shared memory for 8 warps
@@ -125,10 +136,14 @@ __global__ void mel_banks_compute_kernel(int32_t num_frames, float energy_floor,
   // Sum in cub
   sum = WarpReduce(temp_storage[wid]).Sum(sum);
   if (tid == 0) {
-    // avoid log of zero
-    if (sum < energy_floor) sum = energy_floor;
-    float val = logf(sum);
-    mels[frame * ldm + bin] = val;
+    if (use_log) {
+      // avoid log of zero
+      if (sum < energy_floor) sum = energy_floor;
+      float val = logf(sum);
+      mels[frame * ldm + bin] = val;
+    } else {
+      mels[frame * ldm + bin] = sum;
+    }
   }
 }
 
@@ -341,11 +356,11 @@ __global__ void dot_log_kernel(int32_t num_frames, int32_t frame_length,
 
 namespace kaldi {
 
-CudaMfcc::CudaMfcc(const MfccOptions &opts)
-    : MfccComputer(opts),
+CudaSpectralFeatures::CudaSpectralFeatures(const CudaSpectralFeatureOptions &opts)
+    : MfccComputer(opts.mfcc_opts),
       cu_lifter_coeffs_(lifter_coeffs_),
       cu_dct_matrix_(dct_matrix_),
-      window_function_(opts.frame_opts) {
+      window_function_(opts.mfcc_opts.frame_opts) {
   const MelBanks *mel_banks = GetMelBanks(1.0);
   const std::vector<std::pair<int32, Vector<BaseFloat>>> &bins =
       mel_banks->GetBins();
@@ -376,8 +391,8 @@ CudaMfcc::CudaMfcc(const MfccOptions &opts)
                                cudaMemcpyHostToDevice, cudaStreamPerThread));
   CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread));
 
-  frame_length_ = opts.frame_opts.WindowSize();
-  padded_length_ = opts.frame_opts.PaddedWindowSize();
+  frame_length_ = opts.mfcc_opts.frame_opts.WindowSize();
+  padded_length_ = opts.mfcc_opts.frame_opts.PaddedWindowSize();
   fft_length_ = padded_length_ / 2;  // + 1;
   fft_size_ = 800;
 
@@ -395,12 +410,13 @@ CudaMfcc::CudaMfcc(const MfccOptions &opts)
   cufftPlanMany(&plan_, 1, &padded_length_, NULL, 1, stride_, NULL, 1,
                 tmp_stride_ / 2, CUFFT_R2C, fft_size_);
   cufftSetStream(plan_, cudaStreamPerThread);
+  cumfcc_opts_ = opts;
 }
 
 // ExtractWindow extracts a windowed frame of waveform with a power-of-two,
 // padded size.  It does mean subtraction, pre-emphasis and dithering as
 // requested.
-void CudaMfcc::ExtractWindows(int32_t num_frames, int64 sample_offset,
+void CudaSpectralFeatures::ExtractWindows(int32_t num_frames, int64 sample_offset,
                               const CuVectorBase<BaseFloat> &wave,
                               const FrameExtractionOptions &opts) {
   KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0);
@@ -415,7 +431,7 @@ void CudaMfcc::ExtractWindows(int32_t num_frames, int64 sample_offset,
   CU_SAFE_CALL(cudaGetLastError());
 }
 
-void CudaMfcc::ProcessWindows(int num_frames,
+void CudaSpectralFeatures::ProcessWindows(int num_frames,
                               const FrameExtractionOptions &opts,
                               CuVectorBase<BaseFloat> *log_energy_pre_window) {
   if (num_frames == 0) return;
@@ -433,15 +449,16 @@ void CudaMfcc::ProcessWindows(int num_frames,
   CU_SAFE_CALL(cudaGetLastError());
 }
 
-void CudaMfcc::ComputeFinalFeatures(int num_frames, BaseFloat vtln_wrap,
+void CudaSpectralFeatures::ComputeFinalFeatures(int num_frames, BaseFloat vtln_wrap,
                                     CuVector<BaseFloat> *cu_signal_log_energy,
                                     CuMatrix<BaseFloat> *cu_features) {
+  MfccOptions mfcc_opts = cumfcc_opts_.mfcc_opts;
   Vector<float> tmp;
-  assert(opts_.htk_compat == false);
+  assert(mfcc_opts.htk_compat == false);
 
   if (num_frames == 0) return;
 
-  if (opts_.use_energy && !opts_.raw_energy) {
+  if (mfcc_opts.use_energy && !mfcc_opts.raw_energy) {
     dot_log_kernel<<<num_frames, CU1DBLOCK>>>(
         num_frames, cu_windows_.NumCols(), cu_windows_.Data(),
         cu_windows_.Stride(), cu_signal_log_energy->Data());
@@ -465,7 +482,7 @@ void CudaMfcc::ComputeFinalFeatures(int num_frames, BaseFloat vtln_wrap,
 
   power_spectrum_kernel<<<num_frames, CU1DBLOCK>>>(
       padded_length_, tmp_window_.Data(), tmp_window_.Stride(),
-      power_spectrum.Data(), power_spectrum.Stride());
+      power_spectrum.Data(), power_spectrum.Stride(), cumfcc_opts_.use_power);
   CU_SAFE_CALL(cudaGetLastError());
 
   // mel banks
@@ -476,24 +493,32 @@ void CudaMfcc::ComputeFinalFeatures(int num_frames, BaseFloat vtln_wrap,
   mel_banks_compute_kernel<<<mel_blocks, mel_threads>>>(
       num_frames, std::numeric_limits<float>::epsilon(), offsets_, sizes_,
       vecs_, power_spectrum.Data(), power_spectrum.Stride(),
-      cu_mel_energies_.Data(), cu_mel_energies_.Stride());
+      cu_mel_energies_.Data(), cu_mel_energies_.Stride(),
+      cumfcc_opts_.use_log_fbank);
   CU_SAFE_CALL(cudaGetLastError());
 
   // dct transform
-  cu_features->AddMatMat(1.0, cu_mel_energies_, kNoTrans, cu_dct_matrix_,
-                         kTrans, 0.0);
-
-  apply_lifter_and_floor_energy<<<num_frames, CU1DBLOCK>>>(
-      cu_features->NumRows(), cu_features->NumCols(), opts_.cepstral_lifter,
-      opts_.use_energy, opts_.energy_floor, cu_signal_log_energy->Data(),
-      cu_lifter_coeffs_.Data(), cu_features->Data(), cu_features->Stride());
+  if (cumfcc_opts_.use_dct) {
+     cu_features->AddMatMat(1.0, cu_mel_energies_, kNoTrans, cu_dct_matrix_,
+                            kTrans, 0.0);
+
+     apply_lifter_and_floor_energy<<<num_frames, CU1DBLOCK>>>(
+         cu_features->NumRows(), cu_features->NumCols(), 
+	 mfcc_opts.cepstral_lifter, mfcc_opts.use_energy, 
+	 mfcc_opts.energy_floor, cu_signal_log_energy->Data(),
+         cu_lifter_coeffs_.Data(), cu_features->Data(), cu_features->Stride());
+  } else {
+    cudaMemcpyAsync(cu_features->Data(), cu_mel_energies_.Data(),
+               sizeof(BaseFloat) * num_frames * cu_features->Stride(),
+               cudaMemcpyDeviceToDevice, cudaStreamPerThread);
+  }
   CU_SAFE_CALL(cudaGetLastError());
 }
 
-void CudaMfcc::ComputeFeatures(const CuVectorBase<BaseFloat> &cu_wave,
+void CudaSpectralFeatures::ComputeFeatures(const CuVectorBase<BaseFloat> &cu_wave,
                                BaseFloat sample_freq, BaseFloat vtln_warp,
                                CuMatrix<BaseFloat> *cu_features) {
-  nvtxRangePushA("CudaMfcc::ComputeFeatures");
+  nvtxRangePushA("CudaSpectralFeatures::ComputeFeatures");
   const FrameExtractionOptions &frame_opts = GetFrameOptions();
   int num_frames = NumFrames(cu_wave.Dim(), frame_opts, true);
   // compute fft frames by rounding up to a multiple of fft_size_
@@ -533,7 +558,7 @@ void CudaMfcc::ComputeFeatures(const CuVectorBase<BaseFloat> &cu_wave,
 
   nvtxRangePop();
 }
-CudaMfcc::~CudaMfcc() {
+CudaSpectralFeatures::~CudaSpectralFeatures() {
   delete[] cu_vecs_;
   CuDevice::Instantiate().Free(vecs_);
   CuDevice::Instantiate().Free(offsets_);

diff --git a/src/cudafeat/feature-mfcc-cuda.h → src/cudafeat/feature-spectral-cuda.h b/src/cudafeat/feature-mfcc-cuda.h → src/cudafeat/feature-spectral-cuda.h
@@ -1,4 +1,4 @@
-// cudafeat/feature-mfcc-cuda.h
+// cudafeat/feature-spectral-cuda.h
 //
 // Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
 // Justin Luitjens
@@ -25,20 +25,65 @@
 #include "cudafeat/feature-window-cuda.h"
 #include "cudamatrix/cu-matrix.h"
 #include "cudamatrix/cu-vector.h"
+#include "feat/feature-fbank.h"
 #include "feat/feature-mfcc.h"
 
 namespace kaldi {
-// This class implements MFCC computation in CUDA.
+enum SpectralFeatureType {MFCC, FBANK};
+struct CudaSpectralFeatureOptions {
+  MfccOptions mfcc_opts;
+  bool use_log_fbank; // LDB: Adding these two to enable fbank and mfcc
+  bool use_power;     //   to use the same code path for GPU (and CPU?)
+  bool use_dct;       // LDB: Adding this so that fbank can run w/o applying dct
+  SpectralFeatureType feature_type;
+  CudaSpectralFeatureOptions(MfccOptions opts_in)
+      : mfcc_opts(opts_in),
+        use_log_fbank(true), 
+	use_power(true), 
+	use_dct(true),
+        feature_type(MFCC) {}
+  CudaSpectralFeatureOptions(FbankOptions opts){
+     mfcc_opts.frame_opts      = opts.frame_opts;
+     mfcc_opts.mel_opts        = opts.mel_opts;
+     mfcc_opts.use_energy      = opts.use_energy;
+     mfcc_opts.energy_floor    = opts.energy_floor;
+     mfcc_opts.raw_energy      = opts.raw_energy;
+     mfcc_opts.htk_compat      = opts.htk_compat;
+     mfcc_opts.cepstral_lifter = 0.0f;
+     use_log_fbank = opts.use_log_fbank;
+     use_power = opts.use_power;
+     use_dct = false;
+     feature_type = FBANK;
+  }
+  // Default is MFCC
+  CudaSpectralFeatureOptions() : use_log_fbank(true),
+                  use_power(true),
+                  use_dct(true),
+                  feature_type(MFCC)	{}
+
+};
+// This class implements MFCC and Fbank computation in CUDA.
 // It takes input from device memory and outputs to
 // device memory.  It also does no synchronization.
-class CudaMfcc : public MfccComputer {
+class CudaSpectralFeatures : public MfccComputer {
  public:
   void ComputeFeatures(const CuVectorBase<BaseFloat> &cu_wave,
                        BaseFloat sample_freq, BaseFloat vtln_warp,
                        CuMatrix<BaseFloat> *cu_features);
 
-  CudaMfcc(const MfccOptions &opts);
-  ~CudaMfcc();
+  CudaSpectralFeatures(const CudaSpectralFeatureOptions &opts);
+  ~CudaSpectralFeatures();
+  CudaSpectralFeatureOptions cumfcc_opts_;
+  int32 Dim()
+  // The dimension of the output is different for MFCC and Fbank. 
+  // This returns the appropriate value depending on the feature
+  // extraction algorithm
+  {
+    if (cumfcc_opts_.feature_type == MFCC) return MfccComputer::Dim();
+    //If we're running fbank, we need to set the dimension right
+    else return cumfcc_opts_.mfcc_opts.mel_opts.num_bins + 
+	        (cumfcc_opts_.mfcc_opts.use_energy ? 1 : 0);
+  }
 
  private:
   void ExtractWindows(int32 num_frames, int64 sample_offset,

diff --git a/src/cudafeat/online-cuda-feature-pipeline.cc b/src/cudafeat/online-cuda-feature-pipeline.cc
@@ -23,9 +23,12 @@ namespace kaldi {
 
 OnlineCudaFeaturePipeline::OnlineCudaFeaturePipeline(
     const OnlineNnet2FeaturePipelineConfig &config)
-    : info_(config), mfcc(NULL), ivector(NULL) {
+    : info_(config), spectral_feat(NULL), ivector(NULL) {
   if (info_.feature_type == "mfcc") {
-    mfcc = new CudaMfcc(info_.mfcc_opts);
+    spectral_feat = new CudaSpectralFeatures(info_.mfcc_opts);
+  }
+  if (info_.feature_type == "fbank") {
+    spectral_feat = new CudaSpectralFeatures(info_.fbank_opts);
   }
 
   if (info_.use_ivectors) {
@@ -43,27 +46,26 @@ OnlineCudaFeaturePipeline::OnlineCudaFeaturePipeline(
 }
 
 OnlineCudaFeaturePipeline::~OnlineCudaFeaturePipeline() {
-  if (mfcc != NULL) delete mfcc;
+  if (spectral_feat = NULL) delete spectral_feat;
   if (ivector != NULL) delete ivector;
 }
 
 void OnlineCudaFeaturePipeline::ComputeFeatures(
     const CuVectorBase<BaseFloat> &cu_wave, BaseFloat sample_freq,
     CuMatrix<BaseFloat> *input_features,
     CuVector<BaseFloat> *ivector_features) {
-  if (info_.feature_type == "mfcc") {
+  if (info_.feature_type == "mfcc" || info_.feature_type == "fbank") {
+    // Fbank called via the MFCC codepath
     // MFCC
     float vtln_warp = 1.0;
-    mfcc->ComputeFeatures(cu_wave, sample_freq, vtln_warp, input_features);
+    spectral_feat->ComputeFeatures(cu_wave, sample_freq, vtln_warp, input_features);
   } else {
     KALDI_ASSERT(false);
   }
 
   // Ivector
   if (info_.use_ivectors && ivector_features != NULL) {
     ivector->GetIvector(*input_features, ivector_features);
-  } else {
-    KALDI_ASSERT(false);
   }
 }
 

diff --git a/src/cudafeat/online-cuda-feature-pipeline.h b/src/cudafeat/online-cuda-feature-pipeline.h
@@ -25,7 +25,7 @@
 #include <vector>
 
 #include "base/kaldi-error.h"
-#include "cudafeat/feature-mfcc-cuda.h"
+#include "cudafeat/feature-spectral-cuda.h"
 #include "cudafeat/online-ivector-feature-cuda.h"
 #include "matrix/matrix-lib.h"
 #include "online2/online-nnet2-feature-pipeline.h"
@@ -47,7 +47,7 @@ class OnlineCudaFeaturePipeline {
 
  private:
   OnlineNnet2FeaturePipelineInfo info_;
-  CudaMfcc *mfcc;
+  CudaSpectralFeatures *spectral_feat;
   IvectorExtractorFastCuda *ivector;
 };
 }  // namespace kaldi

diff --git a/src/cudafeatbin/Makefile b/src/cudafeatbin/Makefile
@@ -9,7 +9,7 @@ LDLIBS += $(CUDA_LDLIBS)
 BINFILES =
 
 ifeq ($(CUDA), true)
-  BINFILES += compute-mfcc-feats-cuda apply-cmvn-online-cuda compute-online-feats-cuda
+  BINFILES += compute-mfcc-feats-cuda apply-cmvn-online-cuda compute-online-feats-cuda compute-fbank-feats-cuda
 endif
 
 OBJFILES =