diff --git a/README.md b/README.md
index 3c9dfd3..47b4232 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ diffsptk.write("voiced.wav", x_voiced, sr)
 diffsptk.write("unvoiced.wav", x_unvoiced, sr)
 ```
 
-### Mel-spectrogram and MFCC extraction
+### Mel-spectrogram, MFCC, and PLP extraction
 
 ```python
 import diffsptk
@@ -127,6 +127,16 @@ mfcc = diffsptk.MFCC(
 )
 Y = mfcc(X)
 print(Y.shape)
+
+# Extract PLP.
+plp = diffsptk.PLP(
+    plp_order=M,
+    n_channel=n_channel,
+    fft_length=n_fft,
+    sample_rate=sr,
+)
+Y = plp(X)
+print(Y.shape)
 ```
 
 ### Subband decomposition
diff --git a/diffsptk/core/__init__.py b/diffsptk/core/__init__.py
index 46f510a..1e713d3 100644
--- a/diffsptk/core/__init__.py
+++ b/diffsptk/core/__init__.py
@@ -78,6 +78,8 @@
 from .pca import PrincipalComponentAnalysis as PCA
 from .phase import Phase
 from .pitch import Pitch
+from .plp import PerceptualLinearPredictiveCoefficientsAnalysis
+from .plp import PerceptualLinearPredictiveCoefficientsAnalysis as PLP
 from .pol_root import RootsToPolynomial
 from .poledf import AllPoleDigitalFilter
 from .pqmf import PseudoQuadratureMirrorFilterBanks
diff --git a/diffsptk/core/c2acr.py b/diffsptk/core/c2acr.py
index 3a4d82b..715c9b6 100644
--- a/diffsptk/core/c2acr.py
+++ b/diffsptk/core/c2acr.py
@@ -68,6 +68,5 @@ def forward(self, c):
         """
         x = torch.fft.rfft(c, n=self.fft_length).real
         x = torch.exp(2 * x)
-        r = torch.fft.hfft(x)[..., : self.acr_order + 1]
-        r = r / self.fft_length
+        r = torch.fft.hfft(x, norm="forward")[..., : self.acr_order + 1]
         return r
diff --git a/diffsptk/core/fbank.py b/diffsptk/core/fbank.py
index c84db51..8c0574e 100644
--- a/diffsptk/core/fbank.py
+++ b/diffsptk/core/fbank.py
@@ -59,11 +59,13 @@ def __init__(
         f_min=0,
         f_max=None,
         floor=1e-5,
+        use_power=False,
         out_format="y",
     ):
         super(MelFilterBankAnalysis, self).__init__()
 
         self.floor = floor
+        self.use_power = use_power
         self.out_format = out_format
 
         if f_max is None:
@@ -87,6 +89,9 @@ def hz_to_mel(x):
             return 1127 * np.log(x / 700 + 1)
 
+        def mel_to_hz(x):
+            return 700 * (np.exp(x / 1127) - 1)
+
         lower_bin_index = max(1, int(f_min / sample_rate * fft_length + 1.5))
         upper_bin_index = min(
             fft_length // 2, int(f_max / sample_rate * fft_length + 0.5)
         )
@@ -96,17 +101,18 @@ def hz_to_mel(x):
         mel_min = hz_to_mel(f_min)
         mel_max = hz_to_mel(f_max)
         seed = np.arange(1, n_channel + 2)
-        freq = (mel_max - mel_min) / (n_channel + 1) * seed + mel_min
+        center_frequencies = (mel_max - mel_min) / (n_channel + 1) * seed + mel_min
+        self.center_frequencies = mel_to_hz(center_frequencies)
 
         seed = np.arange(lower_bin_index, upper_bin_index)
         mel = hz_to_mel(sample_rate * seed / fft_length)
-        lower_channel_map = [np.argmax(0 < (m <= freq)) for m in mel]
+        lower_channel_map = [np.argmax(0 < (m <= center_frequencies)) for m in mel]
 
-        diff = freq - np.insert(freq[:-1], 0, mel_min)
+        diff = center_frequencies - np.insert(center_frequencies[:-1], 0, mel_min)
         weights = np.zeros((fft_length // 2 + 1, n_channel))
         for i, k in enumerate(seed):
             m = lower_channel_map[i]
-            w = (freq[max(0, m)] - mel[i]) / diff[max(0, m)]
+            w = (center_frequencies[max(0, m)] - mel[i]) / diff[max(0, m)]
             if 0 < m:
                 weights[k, m - 1] += w
             if m < n_channel:
@@ -141,7 +147,8 @@ def forward(self, x):
                 [3.3640, 3.4518, 2.7717, 0.5088]])
 
         """
-        y = torch.matmul(torch.sqrt(x), self.H)
+        y = x if self.use_power else torch.sqrt(x)
+        y = torch.matmul(y, self.H)
         y = torch.log(torch.clip(y, min=self.floor))
         E = (2 * x[..., 1:-1]).sum(-1) + x[..., 0] + x[..., -1]
         E = torch.log(E / (2 * (x.size(-1) - 1))).unsqueeze(-1)
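A note on the `fbank.py` change above: the new `center_frequencies` attribute records the filter-bank centers in Hz, obtained by spacing the channels uniformly on the mel scale and mapping them back with the added `mel_to_hz`; the PLP module below consumes it for its equal-loudness weighting. Here is a standalone NumPy sketch of that computation (the sample rate and channel count are illustrative values, not defaults taken from the module):

```python
import numpy as np


def hz_to_mel(x):
    # Same mel scale as used in fbank.py.
    return 1127 * np.log(x / 700 + 1)


def mel_to_hz(x):
    # Inverse mapping added by this patch.
    return 700 * (np.exp(x / 1127) - 1)


# Illustrative analysis settings (not the module's defaults).
sample_rate, n_channel = 16000, 24
f_min, f_max = 0, sample_rate / 2

mel_min, mel_max = hz_to_mel(f_min), hz_to_mel(f_max)
seed = np.arange(1, n_channel + 2)

# Centers are evenly spaced on the mel scale; mapping them back to Hz gives
# what `center_frequencies` exposes. The last entry is the upper band edge,
# which is why plp.py drops it with `[:-1]` before building the
# equal-loudness curve.
centers_hz = mel_to_hz((mel_max - mel_min) / (n_channel + 1) * seed + mel_min)
print(np.round(centers_hz[:3], 1))
```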
diff --git a/diffsptk/core/levdur.py b/diffsptk/core/levdur.py
index 1094b9c..1c79453 100644
--- a/diffsptk/core/levdur.py
+++ b/diffsptk/core/levdur.py
@@ -23,7 +23,7 @@
 
 class LevinsonDurbin(nn.Module):
     """See `this page <https://sp-nitech.github.io/sptk/latest/main/levdur.html>`_
-    for details.
+    for details. The implementation is based on a simple matrix inversion.
 
     Parameters
     ----------
diff --git a/diffsptk/core/mfcc.py b/diffsptk/core/mfcc.py
index 8b7ca07..c0a32db 100644
--- a/diffsptk/core/mfcc.py
+++ b/diffsptk/core/mfcc.py
@@ -73,6 +73,7 @@ def __init__(
 
         self.mfcc_order = mfcc_order
         assert 1 <= self.mfcc_order < n_channel
+        assert 1 <= lifter
 
         if out_format == 0 or out_format == "y":
             self.format_func = lambda y, c, E: y
@@ -90,11 +91,10 @@ def __init__(
         )
         self.dct = DiscreteCosineTransform(n_channel)
 
-        m = np.arange(1, self.mfcc_order + 1)
-        liftering_vector = 1 + (lifter / 2) * np.sin((np.pi / lifter) * m)
-        self.register_buffer("liftering_vector", numpy_to_torch(liftering_vector))
-
-        self.const = np.sqrt(2)
+        m = np.arange(self.mfcc_order + 1)
+        v = 1 + (lifter / 2) * np.sin((np.pi / lifter) * m)
+        v[0] = np.sqrt(2)
+        self.register_buffer("liftering_vector", numpy_to_torch(v))
 
     def forward(self, x):
         """Compute MFCC.
@@ -128,6 +128,6 @@ def forward(self, x):
         """
         y, E = self.fbank(x)
         y = self.dct(y)
-        c = y[..., :1] * self.const
-        y = y[..., 1 : self.mfcc_order + 1] * self.liftering_vector
+        y = y[..., : self.mfcc_order + 1] * self.liftering_vector
+        c, y = torch.split(y, [1, self.mfcc_order], dim=-1)
         return self.format_func(y, c, E)
diff --git a/diffsptk/core/plp.py b/diffsptk/core/plp.py
new file mode 100644
index 0000000..1198e9d
--- /dev/null
+++ b/diffsptk/core/plp.py
@@ -0,0 +1,165 @@
+# ------------------------------------------------------------------------ #
+# Copyright 2022 SPTK Working Group                                         #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.          #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#     http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software       #
+# distributed under the License is distributed on an "AS IS" BASIS,         #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+# See the License for the specific language governing permissions and       #
+# limitations under the License.                                            #
+# ------------------------------------------------------------------------ #
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ..misc.utils import numpy_to_torch
+from .fbank import MelFilterBankAnalysis
+from .levdur import LevinsonDurbin
+from .mgc2mgc import MelGeneralizedCepstrumToMelGeneralizedCepstrum
+
+
+class PerceptualLinearPredictiveCoefficientsAnalysis(nn.Module):
+    """See `this page <https://sp-nitech.github.io/sptk/latest/main/plp.html>`_
+    for details.
+
+    Parameters
+    ----------
+    plp_order : int >= 1 [scalar]
+        Order of PLP, :math:`M`.
+
+    n_channel : int >= 1 [scalar]
+        Number of mel-filter banks, :math:`C`.
+
+    fft_length : int >= 2 [scalar]
+        Number of FFT bins, :math:`L`.
+
+    sample_rate : int >= 1 [scalar]
+        Sample rate in Hz.
+
+    lifter : int >= 1 [scalar]
+        Liftering coefficient.
+
+    compression_factor : float > 0 [scalar]
+        Amplitude compression factor.
+
+    f_min : float >= 0 [scalar]
+        Minimum frequency in Hz.
+
+    f_max : float <= sample_rate // 2 [scalar]
+        Maximum frequency in Hz.
+
+    floor : float > 0 [scalar]
+        Minimum mel-filter bank output in linear scale.
+
+    out_format : ['y', 'yE', 'yc', 'ycE']
+        `y` is PLP, `c` is C0, and `E` is energy.
+
+    n_fft : int >> :math:`M` [scalar]
+        Number of FFT bins. Accurate conversion requires a large value.
+
+    """
+
+    def __init__(
+        self,
+        plp_order,
+        n_channel,
+        fft_length,
+        sample_rate,
+        lifter=1,
+        compression_factor=0.33,
+        out_format="y",
+        n_fft=512,
+        **fbank_kwargs,
+    ):
+        super(PerceptualLinearPredictiveCoefficientsAnalysis, self).__init__()
+
+        self.plp_order = plp_order
+        self.compression_factor = compression_factor
+
+        assert 1 <= self.plp_order < n_channel
+        assert 1 <= lifter
+        assert 0 < self.compression_factor
+
+        if out_format == 0 or out_format == "y":
+            self.format_func = lambda y, c, E: y
+        elif out_format == 1 or out_format == "yE":
+            self.format_func = lambda y, c, E: torch.cat((y, E), dim=-1)
+        elif out_format == 2 or out_format == "yc":
+            self.format_func = lambda y, c, E: torch.cat((y, c), dim=-1)
+        elif out_format == 3 or out_format == "ycE":
+            self.format_func = lambda y, c, E: torch.cat((y, c, E), dim=-1)
+        else:
+            raise ValueError(f"out_format {out_format} is not supported")
+
+        self.fbank = MelFilterBankAnalysis(
+            n_channel,
+            fft_length,
+            sample_rate,
+            use_power=True,
+            out_format="y,E",
+            **fbank_kwargs,
+        )
+        self.levdur = LevinsonDurbin(self.plp_order)
+        self.lpc2c = MelGeneralizedCepstrumToMelGeneralizedCepstrum(
+            self.plp_order,
+            self.plp_order,
+            in_gamma=-1,
+            in_norm=True,
+            in_mul=True,
+            n_fft=n_fft,
+        )
+
+        f = self.fbank.center_frequencies[:-1] ** 2
+        e = (f / (f + 1.6e5)) ** 2 * (f + 1.44e6) / (f + 9.61e6)
+        self.register_buffer("equal_loudness_curve", numpy_to_torch(e))
+
+        m = np.arange(self.plp_order + 1)
+        v = 1 + (lifter / 2) * np.sin((np.pi / lifter) * m)
+        v[0] = 2
+        self.register_buffer("liftering_vector", numpy_to_torch(v))
+
+    def forward(self, x):
+        """Compute PLP.
+
+        Parameters
+        ----------
+        x : Tensor [shape=(..., L/2+1)]
+            Power spectrum.
+
+        Returns
+        -------
+        y : Tensor [shape=(..., M)]
+            PLP without C0.
+
+        E : Tensor [shape=(..., 1)]
+            Energy.
+
+        c : Tensor [shape=(..., 1)]
+            C0.
+
+        Examples
+        --------
+        >>> x = diffsptk.ramp(19)
+        >>> stft = diffsptk.STFT(frame_length=10, frame_period=10, fft_length=32)
+        >>> plp = diffsptk.PLP(4, 8, 32, 8000)
+        >>> y = plp(stft(x))
+        >>> y
+        tensor([[-0.2896, -0.2356, -0.0586, -0.0387],
+                [ 0.4468, -0.5820,  0.0104, -0.0505]])
+
+        """
+        y, E = self.fbank(x)
+        y = (torch.exp(y) * self.equal_loudness_curve) ** self.compression_factor
+        y = torch.cat((y[..., :1], y, y[..., -1:]), dim=-1)
+        y = torch.fft.hfft(y, norm="forward")[..., : self.plp_order + 1].real
+        y = self.levdur(y)
+        y = self.lpc2c(y)
+        y *= self.liftering_vector
+        c, y = torch.split(y, [1, self.plp_order], dim=-1)
+        return self.format_func(y, c, E)
diff --git a/docs/core/mfcc.rst b/docs/core/mfcc.rst
index b2a56c0..a1f2080 100644
--- a/docs/core/mfcc.rst
+++ b/docs/core/mfcc.rst
@@ -8,4 +8,4 @@ mfcc
 .. autoclass:: diffsptk.MelFrequencyCepstralCoefficientsAnalysis
    :members:
 
-.. seealso:: :ref:`stft` :ref:`fbank`
+.. seealso:: :ref:`stft` :ref:`fbank` :ref:`plp`
diff --git a/docs/core/plp.rst b/docs/core/plp.rst
new file mode 100644
index 0000000..e031181
--- /dev/null
+++ b/docs/core/plp.rst
@@ -0,0 +1,11 @@
+.. _plp:
+
+plp
+---
+
+.. autoclass:: diffsptk.PLP
+
+.. autoclass:: diffsptk.PerceptualLinearPredictiveCoefficientsAnalysis
+   :members:
+
+.. seealso:: :ref:`stft` :ref:`fbank` :ref:`mfcc`
diff --git a/tests/test_plp.py b/tests/test_plp.py
new file mode 100644
index 0000000..a5a8569
--- /dev/null
+++ b/tests/test_plp.py
@@ -0,0 +1,45 @@
+# ------------------------------------------------------------------------ #
+# Copyright 2022 SPTK Working Group                                         #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.          #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#     http://www.apache.org/licenses/LICENSE-2.0                            #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software       #
+# distributed under the License is distributed on an "AS IS" BASIS,         #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+# See the License for the specific language governing permissions and       #
+# limitations under the License.                                            #
+# ------------------------------------------------------------------------ #
+
+import pytest
+
+import diffsptk
+import tests.utils as U
+
+
+@pytest.mark.parametrize("device", ["cpu", "cuda"])
+@pytest.mark.parametrize("o", [0, 1, 2, 3])
+def test_compatibility(
+    device, o, M=4, C=10, L=32, sr=8000, lifter=20, f_min=300, factor=0.3, B=2
+):
+    spec = diffsptk.Spectrum(L, eps=0)
+    plp = diffsptk.PLP(
+        M, C, L, sr, lifter=lifter, f_min=f_min, compression_factor=factor, out_format=o
+    )
+
+    s = sr // 1000
+    U.check_compatibility(
+        device,
+        [plp, spec],
+        [],
+        f"nrand -l {B*L}",
+        f"plp -m {M} -n {C} -l {L} -s {s} -c {lifter} -L {f_min} -f {factor} -o {o}",
+        [],
+        dx=L,
+        dy=M + (o if o <= 1 else o - 1),
+    )
+
+    U.check_differentiable(device, [plp, spec], [B, L])
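Beyond the compatibility test above, a quick interactive sanity check of the new module can mirror the class docstring example; the point of interest is how `out_format` changes the width of the output, matching the `dy = M + (o if o <= 1 else o - 1)` arithmetic in the test. The input signal and analysis settings below are illustrative only:

```python
import diffsptk

M, C, L, sr = 4, 8, 32, 8000  # PLP order, filter-bank channels, FFT length, sample rate

# Two frames of a toy ramp signal, as in the docstring example.
x = diffsptk.ramp(19)
stft = diffsptk.STFT(frame_length=10, frame_period=10, fft_length=L)
X = stft(x)

# "y" keeps only the M coefficients; "c" appends C0 and "E" appends log energy.
for fmt, width in [("y", M), ("yE", M + 1), ("yc", M + 1), ("ycE", M + 2)]:
    plp = diffsptk.PLP(M, C, L, sr, out_format=fmt)
    Y = plp(X)
    assert Y.shape == (2, width)
    print(fmt, tuple(Y.shape))
```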