add plp

sp-nitech · Jan 8, 2024 · 97438b4 · 97438b4
1 parent 629fe76
commit 97438b4
Show file tree

Hide file tree

Showing 11 changed files with 895 additions and 10 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -80,6 +80,7 @@ set(CC_SOURCES
   ${SOURCE_DIR}/analysis/mel_filter_bank_analysis.cc
   ${SOURCE_DIR}/analysis/mel_frequency_cepstral_coefficients_analysis.cc
   ${SOURCE_DIR}/analysis/mel_generalized_cepstral_analysis.cc
+  ${SOURCE_DIR}/analysis/perceptual_linear_predictive_coefficients_analysis.cc
   ${SOURCE_DIR}/analysis/pitch_extraction.cc
   ${SOURCE_DIR}/analysis/pitch_extraction_by_dio.cc
   ${SOURCE_DIR}/analysis/pitch_extraction_by_harvest.cc
@@ -177,7 +178,9 @@ set(CC_SOURCES
   ${SOURCE_DIR}/math/gaussian_mixture_modeling.cc
   ${SOURCE_DIR}/math/histogram_calculation.cc
   ${SOURCE_DIR}/math/inverse_discrete_cosine_transform.cc
+  ${SOURCE_DIR}/math/inverse_discrete_fourier_transform.cc
   ${SOURCE_DIR}/math/inverse_fast_fourier_transform.cc
+  ${SOURCE_DIR}/math/inverse_fourier_transform.cc
   ${SOURCE_DIR}/math/levinson_durbin_recursion.cc
   ${SOURCE_DIR}/math/matrix.cc
   ${SOURCE_DIR}/math/matrix2d.cc
@@ -352,6 +355,7 @@ set(MAIN_SOURCES
   ${SOURCE_DIR}/main/pca.cc
   ${SOURCE_DIR}/main/pcas.cc
   ${SOURCE_DIR}/main/phase.cc
+  ${SOURCE_DIR}/main/plp.cc
   ${SOURCE_DIR}/main/pitch.cc
   ${SOURCE_DIR}/main/pitch2sin.cc
   ${SOURCE_DIR}/main/pitch_mark.cc

diff --git a/doc/main/mfcc.rst b/doc/main/mfcc.rst
@@ -5,7 +5,7 @@ mfcc
 
 .. doxygenfile:: mfcc.cc
 
-.. seealso:: :ref:`fbank`
+.. seealso:: :ref:`fbank`  :ref:`plp`
 
 .. doxygenclass:: sptk::MelFrequencyCepstralCoefficientsAnalysis
    :members:
diff --git a/doc/main/plp.rst b/doc/main/plp.rst
@@ -0,0 +1,11 @@
+.. _plp:
+
+plp
+===
+
+.. doxygenfile:: plp.cc
+
+.. seealso:: :ref:`fbank`  :ref:`mfcc`
+
+.. doxygenclass:: sptk::PerceptualLinearPredictiveCoefficientsAnalysis
+   :members:
diff --git a/include/SPTK/analysis/mel_filter_bank_analysis.h b/include/SPTK/analysis/mel_filter_bank_analysis.h
@@ -104,6 +104,11 @@ class MelFilterBankAnalysis {
     return is_valid_;
   }
 
+  /**
+   * @param[out] center_frequencies Center frequencies in Hz.
+   * @return True on success, false on failure.
+   */
+  bool GetCenterFrequencies(std::vector<double>* center_frequencies) const;
+
   /**
    * @param[in] power_spectrum @f$(N/2+1)@f$-length power spectrum.
    * @param[out] filter_bank_output @f$C@f$-channel filter-bank outputs.
@@ -123,6 +128,7 @@ class MelFilterBankAnalysis {
 
   int lower_bin_index_;
   int upper_bin_index_;
+  std::vector<double> center_frequencies_;
   std::vector<int> channel_indices_;
   std::vector<double> channel_weights_;
 

diff --git a/include/SPTK/analysis/perceptual_linear_predictive_coefficients_analysis.h b/include/SPTK/analysis/perceptual_linear_predictive_coefficients_analysis.h
@@ -0,0 +1,173 @@
+// ------------------------------------------------------------------------ //
+// Copyright 2021 SPTK Working Group                                        //
+//                                                                          //
+// Licensed under the Apache License, Version 2.0 (the "License");          //
+// you may not use this file except in compliance with the License.         //
+// You may obtain a copy of the License at                                  //
+//                                                                          //
+//     http://www.apache.org/licenses/LICENSE-2.0                           //
+//                                                                          //
+// Unless required by applicable law or agreed to in writing, software      //
+// distributed under the License is distributed on an "AS IS" BASIS,        //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and      //
+// limitations under the License.                                           //
+// ------------------------------------------------------------------------ //
+
+#ifndef SPTK_ANALYSIS_PERCEPTUAL_LINEAR_PREDICTIVE_COEFFICIENTS_ANALYSIS_H_
+#define SPTK_ANALYSIS_PERCEPTUAL_LINEAR_PREDICTIVE_COEFFICIENTS_ANALYSIS_H_
+
+#include <vector>  // std::vector
+
+#include "SPTK/analysis/mel_filter_bank_analysis.h"
+#include "SPTK/conversion/linear_predictive_coefficients_to_cepstrum.h"
+#include "SPTK/math/inverse_fourier_transform.h"
+#include "SPTK/math/levinson_durbin_recursion.h"
+#include "SPTK/utils/sptk_utils.h"
+
+namespace sptk {
+
+/**
+ * Perform perceptual linear predictive (PLP) coefficients analysis.
+ *
+ * The input is the half part of power spectrum:
+ * @f[
+ *   \begin{array}{cccc}
+ *     |X(0)|^2, & |X(1)|^2, & \ldots, & |X(N/2)|^2,
+ *   \end{array}
+ * @f]
+ * where @f$N@f$ is the FFT length. The outputs are the @f$M@f$-th order PLP
+ * features with the zeroth cepstral parameter:
+ * @f[
+ *   \begin{array}{ccccc}
+ *     c(0), & \bar{c}(1), & \bar{c}(2), & \ldots, & \bar{c}(M)
+ *   \end{array}
+ * @f]
+ * and the log-signal energy @f$E@f$.
+ *
+ * [1] S. Young et al., &quot;The HTK book,&quot; Cambridge University
+ *     Engineering Department, 2006.
+ */
+class PerceptualLinearPredictiveCoefficientsAnalysis {
+ public:
+  /**
+   * Buffer for PerceptualLinearPredictiveCoefficientsAnalysis class.
+   */
+  class Buffer {
+   public:
+    Buffer() {
+    }
+
+    virtual ~Buffer() {
+    }
+
+   private:
+    std::vector<double> filter_bank_output_;
+    std::vector<double> spectrum_;
+    std::vector<double> cepstrum_;
+
+    std::vector<double> real_part_input_;
+    std::vector<double> real_part_output_;
+    std::vector<double> imag_part_input_;
+    std::vector<double> imag_part_output_;
+
+    LevinsonDurbinRecursion::Buffer buffer_for_levinson_durbin_recursion_;
+
+    friend class PerceptualLinearPredictiveCoefficientsAnalysis;
+    DISALLOW_COPY_AND_ASSIGN(Buffer);
+  };
+
+  /**
+   * @param[in] fft_length Number of FFT bins, @f$N@f$.
+   * @param[in] num_channel Number of channels, @f$C@f$.
+   * @param[in] num_order Order of cepstral coefficients, @f$M@f$.
+   * @param[in] liftering_coefficient A parameter of liftering, @f$L@f$.
+   * @param[in] sampling_rate Sampling rate in Hz.
+   * @param[in] lowest_frequency Lowest frequency in Hz.
+   * @param[in] highest_frequency Highest frequency in Hz.
+   * @param[in] floor Floor value of raw filter-bank output.
+   * @param[in] compression_factor Amplitude compression factor.
+   */
+  PerceptualLinearPredictiveCoefficientsAnalysis(
+      int fft_length, int num_channel, int num_order, int liftering_coefficient,
+      double sampling_rate, double lowest_frequency, double highest_frequency,
+      double floor, double compression_factor);
+
+  virtual ~PerceptualLinearPredictiveCoefficientsAnalysis() {
+  }
+
+  /**
+   * @return FFT size.
+   */
+  int GetFftLength() const {
+    return mel_filter_bank_analysis_.GetFftLength();
+  }
+
+  /**
+   * @return Number of channels.
+   */
+  int GetNumChannel() const {
+    return mel_filter_bank_analysis_.GetNumChannel();
+  }
+
+  /**
+   * @return Order of cepstral coefficients.
+   */
+  int GetNumOrder() const {
+    return levinson_durbin_recursion_.GetNumOrder();
+  }
+
+  /**
+   * @return Liftering coefficient.
+   */
+  int GetLifteringCoefficient() const {
+    return liftering_coefficient_;
+  }
+
+  /**
+   * @return Compression factor.
+   */
+  double GetCompressionFactor() const {
+    return compression_factor_;
+  }
+
+  /**
+   * @return True if this object is valid.
+   */
+  bool IsValid() const {
+    return is_valid_;
+  }
+
+  /**
+   * @param[in] power_spectrum @f$(N/2+1)@f$-length power spectrum.
+   * @param[out] plp @f$M@f$-th order PLP features.
+   * @param[out] energy Log-signal energy @f$E@f$ (optional).
+   * @param[out] buffer Buffer.
+   * @return True on success, false on failure.
+   */
+  bool Run(
+      const std::vector<double>& power_spectrum, std::vector<double>* plp,
+      double* energy,
+      PerceptualLinearPredictiveCoefficientsAnalysis::Buffer* buffer) const;
+
+ private:
+  const int liftering_coefficient_;
+  const double compression_factor_;
+
+  const MelFilterBankAnalysis mel_filter_bank_analysis_;
+  const InverseFourierTransform inverse_fourier_transform_;
+  const LevinsonDurbinRecursion levinson_durbin_recursion_;
+  const LinearPredictiveCoefficientsToCepstrum
+      linear_predictive_coefficients_to_cepstrum_;
+
+  bool is_valid_;
+
+  std::vector<double> equal_loudness_curve_;
+  std::vector<double> cepstal_weights_;
+
+  DISALLOW_COPY_AND_ASSIGN(PerceptualLinearPredictiveCoefficientsAnalysis);
+};
+
+}  // namespace sptk
+
+#endif  // SPTK_ANALYSIS_PERCEPTUAL_LINEAR_PREDICTIVE_COEFFICIENTS_ANALYSIS_H_
diff --git a/src/analysis/mel_filter_bank_analysis.cc b/src/analysis/mel_filter_bank_analysis.cc
@@ -17,16 +17,19 @@
 #include "SPTK/analysis/mel_filter_bank_analysis.h"
 
 #include <algorithm>  // std::fill, std::max, std::min
-#include <cmath>      // std::log, std::sqrt
+#include <cmath>      // std::exp, std::log, std::sqrt
 #include <cstddef>    // std::size_t
 #include <numeric>    // std::accumulate
 
 namespace {
 
 // Note that HTK uses 1127 instead of 1127.01048.
 double HzToMel(double hz) {
-  // return 1127.01048 * std::log(hz / 700.0 + 1.0);
-  return 1127 * std::log(hz / 700.0 + 1.0);
+  return 1127.0 * std::log(hz / 700.0 + 1.0);
+}
+
+double MelToHz(double mel) {
+  return 700.0 * (std::exp(mel / 1127.0) - 1.0);
 }
 
 double SampleMel(int index, int fft_length, double sampling_rate) {
@@ -74,8 +77,8 @@ MelFilterBankAnalysis::MelFilterBankAnalysis(int fft_length, int num_channel,
   const double mel_high(HzToMel(highest_frequency));
 
   // Create vector of filter-bank center frequencies.
-  std::vector<double> center_frequencies(num_channel_ + 1);
-  double* cf(&(center_frequencies[0]));
+  center_frequencies_.resize(num_channel_ + 1);
+  double* cf(&(center_frequencies_[0]));
   {
     const double diff(mel_high - mel_low);
     for (int m(0); m <= num_channel_; ++m) {
@@ -108,6 +111,24 @@ MelFilterBankAnalysis::MelFilterBankAnalysis(int fft_length, int num_channel,
   }
 }
 
+bool MelFilterBankAnalysis::GetCenterFrequencies(
+    std::vector<double>* center_frequencies) const {
+  if (!is_valid_ || NULL == center_frequencies) {
+    return false;
+  }
+
+  if (center_frequencies->size() !=
+      static_cast<std::size_t>(num_channel_ + 1)) {
+    center_frequencies->resize(num_channel_ + 1);
+  }
+
+  for (int m(0); m <= num_channel_; ++m) {
+    (*center_frequencies)[m] = MelToHz(center_frequencies_[m]);
+  }
+
+  return true;
+}
+
 bool MelFilterBankAnalysis::Run(const std::vector<double>& power_spectrum,
                                 std::vector<double>* filter_bank_output,
                                 double* energy) const {