"""
speechDescriptorsUtils
======================
.. autosummary::
   :toctree: generated/

   compute_correlation_dimension
   compute_dfa
   compute_emd_features
   compute_mfcc
   compute_ppe
   compute_recurrence_period_and_rpde
   compute_spectral_spread
   compute_tqwt_features
   compute_wt_features
   feature_spectral_flux
   compute_vot
   compute_bark_band_energies
   compute_dpi
   compute_avqi
"""
import parselmouth
import librosa
import json
import pywt
import pandas as pd
import numpy as np
from numpy.fft import fft, ifft, fftshift, ifftshift
from scipy.signal import find_peaks
# Public API of this module (mirrors the autosummary list in the module
# docstring); `from <module> import *` exports exactly these names.
__all__ = [
"compute_correlation_dimension",
"compute_dfa",
"compute_emd_features",
"compute_mfcc",
"compute_ppe",
"compute_recurrence_period_and_rpde",
"compute_spectral_spread",
"compute_tqwt_features",
"compute_wt_features",
"feature_spectral_flux",
"compute_vot",
"compute_bark_band_energies",
"compute_dpi",
"compute_avqi"
]
def feature_spectral_flux(X, f_s):
    """
    Calculate spectral flux per frame of a magnitude spectrogram.

    Parameters
    ----------
    X : ndarray
        magnitude spectrogram (freq_bins x frames), or a single spectrum (1D)
    f_s : float
        sample rate of audio data (unused; kept for API consistency)

    Returns
    -------
    vsf : float or ndarray
        spectral flux per frame; a scalar when a single spectrum was given
    """
    is_spectrum = X.ndim == 1
    if is_spectrum:
        X = np.expand_dims(X, axis=1)
    # Prepend a copy of the first frame so the first flux value is zero
    X = np.c_[X[:, 0], X]
    delta = np.diff(X, 1, axis=1)
    # L2 norm of the frame-to-frame difference, normalized by bin count
    vsf = np.sqrt((delta ** 2).sum(axis=0)) / X.shape[0]
    return np.squeeze(vsf) if is_spectrum else vsf
[docs]
def compute_ppe(pitch_values_hz):
"""
Compute Pitch Period Entropy (PPE) from a 1D array of pitch values in Hz.
Parameters
----------
pitch_values_hz : ndarray
array of pitch values
Returns
-------
entropy : float
Pitch Period Entropy (PPE)
"""
voiced = pitch_values_hz[pitch_values_hz > 0]
if len(voiced) < 2:
return np.nan
# Convert to semitones relative to median
f0_median = np.median(voiced)
semitones = 12 * np.log2(voiced / f0_median)
# Kernel density estimate over semitone deviations
hist, edges = np.histogram(semitones, bins=50, density=True)
probs = hist / np.sum(hist)
probs = probs[probs > 0]
# Shannon entropy
entropy = -np.sum(probs * np.log2(probs))
return float(entropy)
def compute_vot(seg, pitch, sr):
    """
    Compute Voice Onset Time (VOT): the delay between the release burst
    and the onset of voicing.

    Parameters
    ----------
    seg : Praat Sound Object
        segment/slice of Snd object between onset and offset
    pitch : Pitch (CC) object
        pitch object from Praat
    sr : float
        sample rate of audio

    Returns
    -------
    vot_ms : float
        voice onset time in milliseconds, or np.nan if no release burst or
        no voicing onset was found
    """
    y = seg.values[0]
    frame_length = int(0.02 * sr)   # 20 ms window
    hop = int(0.005 * sr)           # 5 ms hop
    energy = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop)[0]
    times = librosa.frames_to_time(np.arange(len(energy)), sr=sr, hop_length=hop)
    # Release burst = first frame whose energy jumps above mean + 2*std.
    # np.argmax returns 0 both when frame 0 qualifies and when NO frame
    # qualifies, so check explicitly that a burst exists instead of relying
    # on `release_idx > 0` (which also discarded a legitimate burst at 0).
    burst = energy > np.mean(energy) + 2 * np.std(energy)
    release_idx = int(np.argmax(burst)) if np.any(burst) else None
    t_release = times[release_idx] if release_idx is not None else None
    t_voicing = None
    if t_release is not None:
        # First frame at/after the burst with a defined, positive F0
        for t in times[release_idx:]:
            f0 = pitch.get_value_at_time(t)
            if f0 and f0 > 0:
                t_voicing = t
                break
    # Explicit None checks: a valid time of 0.0 is falsy and must not be lost
    if t_release is not None and t_voicing is not None:
        return (t_voicing - t_release) * 1000.0  # ms
    return np.nan
# Bark band energies
def bark_scale_freqs(n_bands=24, fmax=16000):
    """
    Return Bark band edge frequencies up to fmax.

    Parameters
    ----------
    n_bands : int
        number of bark bands
    fmax : float
        maximum frequency of analysis

    Returns
    -------
    freqs : array
        n_bands + 1 band-edge frequencies in Hz, clipped to [0, fmax]
    """
    edges_bark = np.linspace(0.0, 24.0, n_bands + 1)
    # Inverse of Traunmueller's Bark formula: f = 1960(z + 0.53)/(26.28 - z)
    edges_hz = 1960.0 * (edges_bark + 0.53) / (26.28 - edges_bark)
    return np.clip(edges_hz, 0, fmax)
def compute_bark_band_energies(S, sr, n_bands=24, mean_energies=True):
    """
    Compute Bark band energies using bark_scale_freqs band edges.

    Parameters
    ----------
    S : ndarray
        magnitude spectrogram (freq_bins x frames)
    sr : float
        sample rate of audio file
    n_bands : int
        number of bark bands; default = 24
    mean_energies : bool
        default (True) returns the mean of band energies as a float,
        False returns the (n_bands, frames) array of band energies

    Returns
    -------
    band_energies : float or ndarray
        mean of band energies as float, or array of band energies
    """
    # Linear frequency axis matching the spectrogram rows
    freqs = np.linspace(0, sr / 2, S.shape[0])
    edges = bark_scale_freqs(n_bands, sr / 2)
    band_energies = []
    for b in range(n_bands):
        # Half-open band [edges[b], edges[b+1]); empty bands contribute zeros
        mask = (freqs >= edges[b]) & (freqs < edges[b + 1])
        if not np.any(mask):
            band_energies.append(np.zeros(S.shape[1]))
            continue
        band_energies.append(S[mask, :].sum(axis=0))
    band_energies = np.stack(band_energies, axis=0)  # (bands x frames)
    if mean_energies:
        return float(np.mean(band_energies))
    return band_energies  # shape (n_bands, frames)
def compute_correlation_dimension(y, m=3, tau=2, r_frac=0.1):
    """
    Estimate signal complexity via the correlation sum at a fixed radius.

    NOTE(review): despite the name, this returns the fraction of phase-space
    point pairs closer than radius r (the correlation sum C(r)), not a
    log-log slope over multiple radii; callers treat it as a complexity proxy.

    Parameters
    ----------
    y : array
        time series values
    m : int
        embedding dimension for phase-space reconstruction; default = 3
    tau : int
        time delay (in samples); default = 2
    r_frac : float
        fraction of the pairwise-distance standard deviation used as the
        radius threshold; default = 0.1

    Returns
    -------
    cd : float
        correlation sum at radius r, or np.nan if insufficient data
    """
    y = np.asarray(y, float)
    y = y[np.isfinite(y)]
    if y.size < 200:
        return np.nan
    y = y - np.mean(y)
    N = len(y) - (m - 1) * tau
    if N < 50:
        return np.nan
    # Delay-embedded phase-space reconstruction: N points in R^m
    X = np.array([y[i:i + m * tau:tau] for i in range(N)])
    # Full pairwise distance matrix; O(N^2) memory, fine for segment-length data
    dists = np.linalg.norm(X[:, None] - X[None, :], axis=2)
    r = r_frac * np.std(dists)
    # Fraction of pairs within radius r (self-pairs at distance 0 included)
    return float(np.mean(dists < r))
# Duration of Pause Intervals (DPI)
# NOTE(review): originally flagged "NOT WORKING" -- likely because Praat's
# get_value_at_time / get_value return NaN (not None) for undefined frames,
# and NaN comparisons are always False, so no frame was ever marked silent.
# NaN is now treated as silence; confirm against real Praat objects.
def compute_dpi(snd, pitch, intensity, intensity_thresh, time_step=0.01,
                min_pause_dur=0.15):
    """
    Compute Duration of Pause Intervals (DPI).

    Parameters
    ----------
    snd : Praat Sound Object
        Parselmouth/Praat Sound Object
    pitch : Praat Pitch Object
        Parselmouth/Praat Pitch Object
    intensity : Praat Intensity Object
        Parselmouth/Praat Intensity Object
    intensity_thresh : int
        dB SPL threshold for silence
    time_step : float
        analysis step in seconds; default = 0.01
    min_pause_dur : float
        minimum pause duration to count (sec); default = 0.15

    Returns
    -------
    pause_durations : list
        list of pause durations (s)
    mean_dpi : float
        average pause duration, np.nan if no qualifying pauses were found
    """
    # Segment bounds (absolute time)
    t0 = snd.xmin
    t1 = snd.xmax
    times = np.arange(t0, t1, time_step)
    silent = []
    for t in times:
        f0 = pitch.get_value_at_time(t)
        inten = intensity.get_value(t)
        # None/NaN means "undefined at t" -> treat as silent
        f0_silent = f0 is None or np.isnan(f0) or f0 <= 0
        inten_silent = inten is None or np.isnan(inten) or inten < intensity_thresh
        silent.append(f0_silent or inten_silent)
    # Collapse the boolean silence track into pause intervals
    pause_durations = []
    in_pause = False
    pause_start = None
    for t, s in zip(times, silent):
        if s and not in_pause:
            in_pause = True
            pause_start = t
        elif not s and in_pause:
            dur = t - pause_start
            if dur >= min_pause_dur:
                pause_durations.append(dur)
            in_pause = False
    # Handle a pause still open at the end of the segment
    if in_pause and pause_start is not None:
        dur = t1 - pause_start
        if dur >= min_pause_dur:
            pause_durations.append(dur)
    mean_dpi = np.mean(pause_durations) if pause_durations else np.nan
    return pause_durations, mean_dpi
def compute_dfa(y, min_window=4, max_window=None, n_windows=20):
    """
    Compute the DFA (Detrended Fluctuation Analysis) scaling exponent
    (alpha) for a 1D signal y.

    Parameters
    ----------
    y : array
        speech segment waveform
    min_window : int
        smallest window size (samples); default = 4
    max_window : int
        largest window size (samples); default = len(y) // 4
    n_windows : int
        number of window sizes to test; default = 20

    Returns
    -------
    alpha : float
        DFA scaling exponent, or np.nan if the signal is too short
    """
    y = np.array(y, dtype=float)
    y -= np.mean(y)
    N = len(y)
    if N < min_window * 4:
        return np.nan
    # Integrated (cumulative-sum) profile
    y_int = np.cumsum(y)
    # Log-spaced candidate window sizes
    if max_window is None:
        max_window = N // 4
    window_sizes = np.unique(np.logspace(np.log10(min_window),
                                         np.log10(max_window),
                                         n_windows, dtype=int))
    # Record the window sizes actually used so the log-log fit stays aligned
    # with the fluctuations even when some candidates are skipped (the old
    # `window_sizes[:len(flucts)]` assumed skips only happen at the end).
    used_sizes = []
    flucts = []
    for w in window_sizes:
        if w < 2:
            continue
        n_segments = N // w
        if n_segments < 2:
            continue
        segments = np.reshape(y_int[:n_segments * w], (n_segments, w))
        # RMS of each linearly detrended segment
        t = np.arange(w)
        rms = []
        for seg in segments:
            trend = np.polyval(np.polyfit(t, seg, 1), t)
            rms.append(np.sqrt(np.mean((seg - trend) ** 2)))
        used_sizes.append(w)
        flucts.append(np.mean(rms))
    if len(flucts) < 2:
        return np.nan
    # alpha = slope of log F(w) vs log w
    slope, _ = np.polyfit(np.log(used_sizes), np.log(np.asarray(flucts)), 1)
    return float(slope)
# Empirical Mode Decomposition (EMD)
def compute_emd_features(y, max_imfs=6, max_sift=10):
    """
    Empirical Mode Decomposition (EMD) features.
    Returns scalar complexity measures, not raw IMFs.

    Parameters
    ----------
    y : 1D numpy array
        input signal (e.g. gneVals or waveform)
    max_imfs : int
        maximum IMFs to extract; default = 6
    max_sift : int
        max sifting iterations per IMF; default = 10

    Returns
    -------
    emd_n_imfs : int
        number of IMFs extracted
    emd_energy_ratio : float
        high-frequency IMF energy / total energy
    emd_entropy : float
        Shannon entropy of the IMF energy distribution
    """
    y = np.asarray(y, float)
    y = y[np.isfinite(y)]
    if y.size < 100:
        return np.nan, np.nan, np.nan
    y = y - np.mean(y)
    residue = y.copy()
    imfs = []
    for _ in range(max_imfs):
        h = residue.copy()
        for _ in range(max_sift):
            peaks, _ = find_peaks(h)
            troughs, _ = find_peaks(-h)
            # Sifting needs at least two extrema on each side to interpolate
            if len(peaks) < 2 or len(troughs) < 2:
                break
            # Linear envelope interpolation (classic EMD uses cubic splines;
            # np.interp keeps this dependency-free)
            t = np.arange(len(h))
            upper = np.interp(t, peaks, h[peaks])
            lower = np.interp(t, troughs, h[troughs])
            mean_env = 0.5 * (upper + lower)
            h = h - mean_env
        imfs.append(h)
        residue = residue - h
        # Stop when the residue is essentially flat
        if np.std(residue) < 1e-6:
            break
    if len(imfs) == 0:
        return np.nan, np.nan, np.nan
    # Energy per IMF
    energies = np.array([np.sum(imf ** 2) for imf in imfs])
    total_energy = np.sum(energies)
    if total_energy <= 0:
        return np.nan, np.nan, np.nan
    emd_n_imfs = len(imfs)
    # High-frequency = first half of IMFs (earlier IMFs capture the faster
    # oscillations peeled off first)
    hf_energy = np.sum(energies[: max(1, emd_n_imfs // 2)])
    emd_energy_ratio = hf_energy / total_energy
    probs = energies / total_energy
    emd_entropy = -np.sum(probs * np.log2(probs + 1e-12))
    return int(emd_n_imfs), float(emd_energy_ratio), float(emd_entropy)
# Mel Frequency Cepstral Coefficients (MFCCs) -- returned as a JSON string
def compute_mfcc(seg, sr, n_mfcc=13, n_fft=2048, hop_length=512):
    """
    Compute Mel Frequency Cepstral Coefficients (MFCCs).

    Parameters
    ----------
    seg : Praat Sound Object
        segment of Sound cut around onset and offset
    sr : float
        sample rate of the audio
    n_mfcc : int
        number of coefficients to return; default = 13
    n_fft : int
        FFT window size; default = 2048
    hop_length : int
        number of samples the analysis advances between consecutive frames;
        default = 512

    Returns
    -------
    mfcc : str
        JSON-serialized (n_mfcc x frames) list of MFCCs
    """
    y = seg.values[0].astype(float)
    mfcc = librosa.feature.mfcc(
        y=y,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length,
        htk=False  # Slaney-style mel filterbank (librosa default)
    )
    # float32 keeps the JSON payload compact
    return json.dumps(mfcc.astype(np.float32).tolist())
def compute_recurrence_period_and_rpde(pp, onset, offset, n_bins=50):
    """
    Compute recurrence period (RP) and recurrence period density entropy (RPDE).

    RP is the mean inter-pulse interval within a segment.
    RPDE is the normalized entropy of the distribution of recurrence periods:
    low = highly periodic/stable signal, high = irregular, noisy, chaotic signal.

    Parameters
    ----------
    pp : Praat object answering PointProcess queries
        pulses of the segment; presumably a PointProcess from
        "To PointProcess (periodic, cc)" -- confirm against callers
    onset : float
        segment start time in seconds
    offset : float
        segment end time in seconds
    n_bins : int
        bin count for histogram calculation; default = 50

    Returns
    -------
    rp_ms : float
        recurrence period in ms
    rpde : float
        normalized entropy of the recurrence-period distribution;
        0.0 when all periods fall into a single histogram bin
    """
    n = parselmouth.praat.call(pp, "Get number of points")
    times = []
    for k in range(1, n + 1):
        t = parselmouth.praat.call(pp, "Get time from index", k)
        if onset <= t <= offset:
            times.append(t)
    times = np.asarray(times)
    if times.size < 3:
        return np.nan, np.nan
    # Inter-pulse periods (seconds)
    periods = np.diff(times)
    periods = periods[(periods > 0) & np.isfinite(periods)]
    if periods.size < 3:
        return np.nan, np.nan
    # RP (mean period)
    rp_ms = float(np.mean(periods) * 1000.0)
    # RPDE: entropy normalized by log of the number of occupied bins
    hist, _ = np.histogram(periods, bins=n_bins, density=True)
    probs = hist / np.sum(hist)
    probs = probs[probs > 0]
    if probs.size < 2:
        # All mass in one bin -> perfectly periodic; avoids dividing by log(1) == 0
        return rp_ms, 0.0
    rpde = float(-np.sum(probs * np.log(probs)) / np.log(len(probs)))
    return rp_ms, rpde
def compute_spectral_spread(S, freqs):
    """
    Compute spectral spread (standard deviation of the spectrum around its
    centroid), averaged over frames with non-zero energy.

    Parameters
    ----------
    S : ndarray
        magnitude spectrogram (freq_bins x frames)
    freqs : array
        frequency axis in Hz (freq_bins,)

    Returns
    -------
    spread : float
        mean spectral spread in Hz over non-silent frames,
        or np.nan if every frame has zero energy
    """
    S_sum = np.sum(S, axis=0)
    valid = S_sum > 0
    if not np.any(valid):
        return np.nan
    # Restrict to non-silent frames up front to avoid divide-by-zero warnings
    Sv = S[:, valid]
    sv = S_sum[valid]
    # Spectral centroid per frame
    centroid = np.sum(freqs[:, None] * Sv, axis=0) / sv
    # Spectral spread per frame (energy-weighted std around the centroid)
    spread = np.sqrt(
        np.sum(((freqs[:, None] - centroid) ** 2) * Sv, axis=0) / sv
    )
    return float(np.mean(spread))
# Tuneable Q-Wavelet Transform (TQWT)
def _tqwt_theta(w):
# Eq. (17) in Selesnick 2011: theta(ω) = 0.5(1+cos ω)*sqrt(2 - cos ω), |ω|<=π
w = np.asarray(w, dtype=float)
return 0.5 * (1.0 + np.cos(w)) * np.sqrt(np.maximum(2.0 - np.cos(w), 0.0))
def _tqwt_analysis_filters(N, alpha, beta):
# Eqs. (11)-(16) in Selesnick 2011; even-symmetric magnitude response on DFT grid
k = np.arange(N)
w = 2.0 * np.pi * k / N
w = (w + np.pi) % (2.0 * np.pi) - np.pi # [-π, π)
aw = np.abs(w)
H0 = np.zeros(N, dtype=float)
H1 = np.zeros(N, dtype=float)
wP = (1.0 - beta) * np.pi
wS = alpha * np.pi
denom = (alpha + beta - 1.0)
# Pass/stop
P = aw <= wP
S = aw >= wS
H0[P] = 1.0
H1[S] = 1.0
# Transition: (1-β)π < |ω| < απ
T = (~P) & (~S)
wt = aw[T]
u0 = (wt + (beta - 1.0) * np.pi) / denom
u1 = (alpha * np.pi - wt) / denom
H0[T] = _tqwt_theta(u0)
H1[T] = _tqwt_theta(u1)
return H0.astype(np.complex128), H1.astype(np.complex128)
def _tqwt_lps(x, alpha):
# Low-pass scaling (α<1): keep center αN bins (FFT-domain truncation)
x = np.asarray(x, dtype=float)
N = x.size
M = int(np.ceil(alpha * N))
if M < 2:
return np.zeros(2, dtype=float)
Xs = fftshift(fft(x))
start = (N - M) // 2
Ys = Xs[start:start + M]
y = np.real(ifft(ifftshift(Ys))) * np.sqrt(alpha)
return y
def _tqwt_hps(x, beta):
# High-pass scaling (β<1): keep outer βN bins (FFT-domain) and pack into length βN
x = np.asarray(x, dtype=float)
N = x.size
M = int(np.ceil(beta * N))
if M < 2:
return np.zeros(2, dtype=float)
Xs = fftshift(fft(x))
left_len = M // 2
right_len = M - left_len
left = Xs[:left_len]
right = Xs[-right_len:] if right_len > 0 else np.array([], dtype=Xs.dtype)
Ys = np.concatenate([left, right])
y = np.real(ifft(ifftshift(Ys))) * np.sqrt(beta)
return y
def tqwt_forward(x, Q=1.0, r=3.0, J=8):
    """
    Forward TQWT (DFT-domain, finite-length).

    Parameters
    ----------
    x : array
        input signal (non-finite samples are dropped)
    Q : float
        Q-factor; default = 1.0
    r : float
        redundancy factor; default = 3.0
    J : int
        number of decomposition levels; default = 8

    Returns
    -------
    coeffs : list of J arrays
        high-pass subband coefficients, one per level
    residual : array
        final low-pass residue
    (alpha, beta) : tuple of float
        scaling parameters from Eq. (26): beta = 2/(Q+1), alpha = 1 - beta/r
    """
    x = np.asarray(x, dtype=float)
    x = x[np.isfinite(x)]
    J = int(J)
    if x.size < 16:
        # Too short to filter: empty subbands, empty residue, undefined params
        empty = [np.array([], dtype=float) for _ in range(J)]
        return empty, np.array([], dtype=float), (np.nan, np.nan)
    beta = 2.0 / (float(Q) + 1.0)
    alpha = 1.0 - beta / float(r)
    lowpass = x
    coeffs = []
    for _ in range(J):
        n = lowpass.size
        if n < 16:
            # Signal became too short to split further at this level
            coeffs.append(np.array([], dtype=float))
            continue
        H0, H1 = _tqwt_analysis_filters(n, alpha, beta)
        spectrum = fft(lowpass)
        low_branch = np.real(ifft(spectrum * H0))
        high_branch = np.real(ifft(spectrum * H1))
        coeffs.append(_tqwt_hps(high_branch, beta))
        lowpass = _tqwt_lps(low_branch, alpha)
    return coeffs, lowpass, (alpha, beta)
def compute_tqwt_features(y, sr, Q=1.0, r=3.0, J=8, n_keep=5):
    """
    Compute a fixed-width feature vector from the Tunable Q-factor Wavelet
    Transform (TQWT) of a 1-D signal, derived from subband energies.

    Parameters
    ----------
    y : 1D numpy array
        input signal
    sr : float
        sample rate of the signal in Hz. (Currently unused; included for API consistency.)
    Q : float
        Q-factor controlling oscillatory behavior of the wavelets; default = 1.0
    r : float
        redundancy factor of the TQWT; default = 3.0
    J : int
        number of TQWT decomposition levels; default = 8
    n_keep : int
        number of lowest-index subband log-energies to return; default = 5

    Returns
    -------
    features : list of float
        Feature vector containing, in order:
        - tqwt_e1 .. tqwt_e{n_keep} : log10 subband energies of the first
          `n_keep` TQWT bands
        - tqwt_entropy : normalized Shannon entropy of the subband energy
          distribution
        - tqwt_centroid : energy-weighted centroid of subband indices
        - tqwt_low_high_ratio : ratio of summed low-band energy to high-band
          energy
        All features are NaN when insufficient valid data are available.
    """
    y = np.asarray(y, dtype=float)
    y = y[np.isfinite(y)]
    if y.size < 16:
        # Too short for even one filterbank stage
        return [np.nan] * n_keep + [np.nan, np.nan, np.nan]
    coeffs, _resid, _ = tqwt_forward(y, Q=Q, r=r, J=J)
    eps = 1e-12
    # Mean power per subband; empty subbands contribute zero energy
    energies = np.array([float(np.mean(c * c)) if c.size else 0.0 for c in coeffs],
                        dtype=float)
    energies = np.maximum(energies, 0.0)
    # Log-energies of the first n_keep bands (NaN-padded if fewer bands exist)
    loge = np.log10(energies + eps)
    e_keep = [float(loge[i]) if i < loge.size else np.nan for i in range(n_keep)]
    tot = float(np.sum(energies))
    if tot <= eps:
        # Effectively zero-energy decomposition: distribution stats undefined
        tqwt_entropy = np.nan
        tqwt_centroid = np.nan
        tqwt_low_high_ratio = np.nan
    else:
        # Normalized Shannon entropy of the subband energy distribution
        p = energies / tot
        p = p[p > 0]
        tqwt_entropy = float(-np.sum(p * np.log(p)) / np.log(max(len(p), 2)))
        # Energy-weighted centroid over 1-based band indices
        idx = np.arange(1, energies.size + 1, dtype=float)
        tqwt_centroid = float(np.sum(idx * energies) / (tot + eps))
        # Low half of the bands vs high half (eps guards both zero sums)
        low = float(np.sum(energies[: max(1, energies.size // 2)]))
        high = float(np.sum(energies[max(1, energies.size // 2):]))
        tqwt_low_high_ratio = float((low + eps) / (high + eps))
    return e_keep + [tqwt_entropy, tqwt_centroid, tqwt_low_high_ratio]
def compute_wt_features(y, wavelet="db4", level=5):
    """
    Compute wavelet-based features from a 1-D signal using the discrete
    wavelet transform (DWT), derived from the normalized energy distribution
    of the wavelet subbands.

    Parameters
    ----------
    y : 1D numpy array
        input signal
    wavelet : str
        Wavelet family used for decomposition; default = "db4"
    level : int
        Decomposition level; default = 5

    Returns
    -------
    wt_entropy : float
        Shannon entropy (base 2) of the normalized wavelet subband energies.
    wt_low_high_ratio : float
        Ratio of detail (higher-frequency) subband energy to
        approximation/coarse (lower-frequency) subband energy.
        NaN for an all-zero signal.
    """
    coeffs = pywt.wavedec(y, wavelet, level=level)
    energies = np.array([np.sum(c ** 2) for c in coeffs])
    total = np.sum(energies)
    if total <= 0:
        # All-zero signal: the energy distribution is undefined
        return np.nan, np.nan
    energies /= total
    wt_entropy = -np.sum(energies * np.log2(energies + 1e-12))
    # wavedec orders coefficients coarse -> fine, so the upper half of the
    # list holds the higher-frequency detail bands; despite its name, this
    # ratio is high-band energy over low-band energy (matches the docstring)
    wt_low_high_ratio = np.sum(energies[level // 2:]) / np.sum(energies[:level // 2])
    return wt_entropy, wt_low_high_ratio
def compute_avqi(shimmer_local, hnr_mean_db, cpps_vowel, slope_vowel,
                 cpps_speech, slope_speech):
    """
    Compute an Acoustic Voice Quality Index (AVQI)-style linear combination.

    Parameters
    ----------
    shimmer_local : float
        local shimmer of the sustained vowel
    hnr_mean_db : float
        mean harmonics-to-noise ratio (dB)
    cpps_vowel : float
        smoothed cepstral peak prominence (dB) of the vowel
    slope_vowel : float
        spectral slope of the vowel
    cpps_speech : float
        smoothed cepstral peak prominence (dB) of connected speech
    slope_speech : float
        spectral slope of connected speech

    Returns
    -------
    avqi : float
        weighted linear combination of the six inputs
    """
    # NOTE(review): these weights differ from the published AVQI v02.06
    # regression implemented in avqi_v0206 -- confirm their source.
    avqi = (
        3.41
        + (0.33 * shimmer_local)
        - (0.09 * hnr_mean_db)
        - (0.88 * cpps_vowel)
        + (0.12 * slope_vowel)
        - (1.18 * cpps_speech)
        + (0.46 * slope_speech)
    )
    return avqi
# NOTE(review): the original claims this yields the proper 0-10 AVQI score;
# confirm against a reference implementation before relying on the scale.
def avqi_v0206(cpps_vowel,
               hnr_speech,
               shimmer_local_percent,
               shimmer_local_db,
               ltas_slope_speech,
               ltas_tilt_speech):
    """
    Compute the Acoustic Voice Quality Index (AVQI) version 02.06
    as defined by Maryn, Weenink, Roy, et al.

    Parameters
    ----------
    cpps_vowel : float
        Smoothed CPP (dB) from sustained vowel /a/ segment.
    hnr_speech : float
        Harmonics-to-noise ratio (dB) from connected speech.
    shimmer_local_percent : float
        Shimmer local (percent) from sustained vowel.
    shimmer_local_db : float
        Shimmer local (dB) from sustained vowel.
    ltas_slope_speech : float
        LTAS spectral slope (dB/decade) from connected speech.
    ltas_tilt_speech : float
        LTAS spectral tilt (dB) from connected speech.

    Returns
    -------
    float
        Clinical AVQI v02.06 score.
        Range roughly: 0-10 (normal voices ~1-3, dysphonic >=4).
    """
    # Unscaled regression over the six acoustic measures
    raw = (3.295
           - 0.111 * cpps_vowel
           - 0.073 * hnr_speech
           - 0.213 * shimmer_local_percent
           + 2.789 * shimmer_local_db
           - 0.032 * ltas_slope_speech
           + 0.077 * ltas_tilt_speech)
    # Scale and offset onto the clinical 0-10 AVQI axis
    return raw * 2.208 + 1.797
# # Johanna's code...
# def vocal_presense(y, sr, nfft=1024, hop_len=256):
# # Features
# rmse = librosa.feature.rms(y=y, frame_length=nfft, hop_length=hop_len, center=False)[0]
# zcr = librosa.feature.zero_crossing_rate(y=y, frame_length=nfft, hop_length=hop_len, center=False)[0]
# # Clip-adaptive thresholds (percentile-based) + sanity clamps
# # Energy (RMS)
# ste_on = max(np.percentile(rmse, 65), 0.02) # don't go below 0.02 on normalized audio
# ste_off = max(np.percentile(rmse, 45), 0.015)
# # ZCR (lower is more voiced) – invert logic with hysteresis
# zcr_on = np.clip(np.percentile(zcr, 35), 0.08, 0.18)
# zcr_off = np.clip(np.percentile(zcr, 65), 0.18, 0.30)
# # Hysteresis state machine
# is_vocal = np.zeros_like(rmse, dtype=bool)
# state = False
# for i, (e, z) in enumerate(zip(rmse, zcr)):
# if not state:
# state = (e >= ste_on) and (z <= zcr_on)
# else:
# state = not ((e <= ste_off) or (z >= zcr_off))
# is_vocal[i] = state
# # Debounce flicker (median filter ~30–50 ms)
# k = int(round(0.04 * sr / hop_len)) | 1 # odd kernel size, ~40 ms
# is_vocal = sps.medfilt(is_vocal.astype(float), kernel_size=max(k, 3)) > 0.5
# return is_vocal