We use cookies on this website to distinguish you from other users.
We use this data to improve our content experience and for targeted advertising.
By continuing to use this website you consent to our use of cookies.
For more information, please see our Cookie Policy.
Source code for spafe.features.ngcc
"""
- Description : Normalized Gammachirp Cepstral Coefficients (NGCCs) extraction algorithm implementation.
- Copyright (c) 2019-2024 Ayoub Malek.
This source code is licensed under the terms of the BSD 3-Clause License.
For a copy, see <https://github.com/SuperKogito/spafe/blob/master/LICENSE>.
"""
from typing import Optional
import numpy as np
from scipy.fftpack import dct
from ..fbanks.gammatone_fbanks import gammatone_filter_banks
from ..utils.cepstral import normalize_ceps, lifter_ceps, NormalizationType
from ..utils.converters import ErbConversionApproach
from ..utils.exceptions import ParameterError, ErrorMsgs
from ..utils.filters import ScaleType
from ..utils.preprocessing import (
pre_emphasis,
framing,
windowing,
zero_handling,
SlidingWindow,
)
def ngcc(
    sig: np.ndarray,
    fs: int = 16000,
    num_ceps: int = 13,
    pre_emph: bool = True,
    pre_emph_coeff: float = 0.97,
    window: Optional[SlidingWindow] = None,
    nfilts: int = 24,
    nfft: int = 512,
    low_freq: Optional[float] = 0,
    high_freq: Optional[float] = None,
    scale: ScaleType = "constant",
    dct_type: int = 2,
    use_energy: bool = False,
    lifter: Optional[int] = None,
    normalize: Optional[NormalizationType] = None,
    fbanks: Optional[np.ndarray] = None,
    conversion_approach: ErbConversionApproach = "Glasberg",
) -> np.ndarray:
    """
    Compute the normalized gammachirp cepstral coefficients (NGCC features) from
    an audio signal according to [Zouhir]_.

    The processing chain is: optional pre-emphasis -> framing -> windowing ->
    power spectrum -> filter bank projection -> log -> DCT -> optional energy
    replacement, liftering and normalization.

    Args:
        sig            (numpy.ndarray) : input mono audio signal (Nx1).
        fs                       (int) : signal sampling frequency.
                                         (Default is 16000).
        num_ceps                 (int) : number of cepstra to return.
                                         (Default is 13).
        pre_emph                (bool) : apply pre-emphasis if True.
                                         (Default is True).
        pre_emph_coeff         (float) : pre-emphasis filter coefficient.
                                         (Default is 0.97).
        window         (SlidingWindow) : sliding window object; when None, a
                                         default-constructed SlidingWindow is used.
                                         (Default is None).
        nfilts                   (int) : the number of filters in the filter bank.
                                         (Default is 24).
        nfft                     (int) : number of FFT points.
                                         (Default is 512).
        low_freq               (float) : lowest band edge of the gammatone filters (Hz).
                                         (Default is 0).
        high_freq              (float) : highest band edge of the gammatone filters (Hz).
                                         (Default is samplerate / 2).
        scale                    (str) : monotonicity behavior of the filter banks.
                                         (Default is "constant").
        dct_type                 (int) : type of DCT used.
                                         (Default is 2).
        use_energy              (bool) : overwrite C0 with true log energy
                                         (Default is False).
        lifter                   (int) : apply liftering if specified.
                                         (Default is None).
        normalize                (str) : apply normalization if specified.
                                         (Default is None).
        fbanks         (numpy.ndarray) : filter bank matrix. When provided, no filter
                                         bank is computed and :code:`low_freq`,
                                         :code:`high_freq`, :code:`scale` and
                                         :code:`conversion_approach` are not used.
                                         (Default is None).
        conversion_approach      (str) : erb scale conversion approach.
                                         (Default is "Glasberg").

    Returns:
        (numpy.ndarray) : 2d array of NGCC features (num_frames x num_ceps)

    Raises:
        ParameterError : if :code:`nfilts` is smaller than :code:`num_ceps`.

    Tip:
        - :code:`scale` : can take the following options ["constant", "ascendant", "descendant"].
        - :code:`dct_type` : can take the following options [1, 2, 3, 4].
        - :code:`normalize` : can take the following options ["mvn", "ms", "vn", "mn"].
        - :code:`conversion_approach` : can take the following options ["Glasberg"].
          Note that the use of different options than the default can lead to unexpected behavior/issues.

    Note:
        .. figure:: ../_static/architectures/ngccs.png

           Architecture of normalized gammachirp cepstral coefficients extraction algorithm.

    Examples:
        .. plot::

            from scipy.io.wavfile import read
            from spafe.features.ngcc import ngcc
            from spafe.utils.preprocessing import SlidingWindow
            from spafe.utils.vis import show_features

            # read audio
            fpath = "../../../tests/data/test.wav"
            fs, sig = read(fpath)

            # compute ngccs
            ngccs = ngcc(sig,
                         fs=fs,
                         pre_emph=1,
                         pre_emph_coeff=0.97,
                         window=SlidingWindow(0.03, 0.015, "hamming"),
                         nfilts=128,
                         nfft=2048,
                         low_freq=0,
                         high_freq=8000,
                         normalize="mvn")

            # visualize features
            show_features(ngccs, "Normalized Gammachirp Cepstral Coefficients", "NGCC Index", "Frame Index")

    References:
        .. [Zouhir] : Zouhir, Y., & Ouni, K. (2016).
                      Feature Extraction Method for Improving Speech Recognition in Noisy Environments.
                      J. Comput. Sci., 12, 56-61.
    """
    # run checks: the DCT output is truncated to num_ceps columns, so the
    # filter bank must provide at least that many bands
    if nfilts < num_ceps:
        raise ParameterError(ErrorMsgs["nfilts"])

    # get fbanks: only compute a gammatone bank if the caller did not supply one
    if fbanks is None:
        fbanks, _ = gammatone_filter_banks(
            nfilts=nfilts,
            nfft=nfft,
            fs=fs,
            low_freq=low_freq,
            high_freq=high_freq,
            scale=scale,
            conversion_approach=conversion_approach,
        )

    # pre-emphasis
    if pre_emph:
        sig = pre_emphasis(sig=sig, pre_emph_coeff=pre_emph_coeff)

    # init window
    if window is None:
        window = SlidingWindow()

    # -> framing
    frames, frame_length = framing(
        sig=sig, fs=fs, win_len=window.win_len, win_hop=window.win_hop
    )

    # -> windowing
    windows = windowing(frames=frames, frame_len=frame_length, win_type=window.win_type)

    # -> FFT -> |.|**2 (power spectrum of every frame, scaled by 1 / nfft)
    magnitude_spectrum = np.absolute(np.fft.rfft(windows, nfft))
    power_spectrum = (1.0 / nfft) * np.square(magnitude_spectrum)

    # -> x Gammatone fbanks
    features = np.dot(power_spectrum, fbanks.T)

    # -> log(.)
    # handle zeros: if feat is zero, we get problems with log
    features_no_zero = zero_handling(x=features)
    log_features = np.log(features_no_zero)

    # -> DCT(.) along the filter axis, keeping the first num_ceps coefficients
    ngccs = dct(x=log_features, type=dct_type, axis=1, norm="ortho")[:, :num_ceps]

    # use energy for 1st features column
    if use_energy:
        # total energy of each frame; the power spectrum computed above is
        # reused instead of being derived a second time from the magnitudes
        frame_energies = np.sum(power_spectrum, 1)

        # handle zero energies before taking the log
        energy = zero_handling(frame_energies)
        ngccs[:, 0] = np.log(energy)

    # liftering
    if lifter:
        ngccs = lifter_ceps(ngccs, lifter)

    # normalization
    if normalize:
        ngccs = normalize_ceps(ngccs, normalize)

    return ngccs