Source code for pyampact.performance

"""
performance
==============


.. autosummary::
    :toctree: generated/

    estimate_perceptual_parameters
    calculate_vibrato
    perceived_pitch
"""

import numpy as np
import librosa
import warnings

__all__ = [
    "estimate_perceptual_parameters",
    "calculate_vibrato",
    "perceived_pitch"
]

np.seterr(all='ignore')
warnings.filterwarnings(
    "ignore", message="Mean of empty slice.", category=RuntimeWarning)


[docs] def estimate_perceptual_parameters(f0_vals, pwr_vals, M, SR, hop, gt_flag): """ Estimates a range of performance parameters from the inputted fundamental frequency (f0_vales), power (pwr_vals), and spectrum estimates (M) Parameters ---------- f0_vals : np.ndarray A vector of fundamental frequency estimates. pwr_vals : np.ndarray A vector of power values corresponding to the fundamental frequency estimates. M : np.ndarray or dict Either: - dict of precomputed spectral feature vectors (FAST PATH), or - spectrogram slice [freq x frames] (SLOW / LEGACY PATH) SR : float The sample rate used to generate the fundamental frequency estimates. hop : float The hop size used in the analysis, expressed in seconds. gt_flag : bool A flag indicating whether to use ground truth information for the estimation. Returns ------- res : dict Dictionary of per-note vectors and summary statistics. """ # Perceived pitch res_ppitch = perceived_pitch(f0_vals, SR) # Work with clean (non-NaN) f0 values for voiced-frame descriptors f0_clean = f0_vals[~np.isnan(f0_vals)] # Jitter (mean absolute frame-to-frame f0 change, voiced frames only) if len(f0_clean) > 1: tmp_jitter = np.abs(np.diff(f0_clean)) res_jitter = float(np.nanmean(tmp_jitter)) else: res_jitter = np.nan # Vibrato rate and depth (voiced frames only) if len(f0_clean) > 2: mean_f0_clean = np.mean(f0_clean) detrended_f0 = f0_clean - mean_f0_clean frame_rate = SR / hop res_vibrato_depth, res_vibrato_rate = calculate_vibrato(detrended_f0, frame_rate) else: res_vibrato_depth, res_vibrato_rate = np.nan, np.nan # Shimmer – work with positive power values pwr_safe = np.where(pwr_vals > 0, pwr_vals, np.nan) res_pwr_vals = 10 * np.log10(np.where(pwr_vals > 0, pwr_vals, 1e-20)) if np.sum(~np.isnan(pwr_safe)) > 1: pwr0 = np.nanmean(pwr_safe) # use mean as reference to avoid division-by-first-frame issue tmp_shimmer = np.abs(10 * np.log10(pwr_safe[1:] / pwr0)) res_shimmer = float(np.nanmean(tmp_shimmer)) else: res_shimmer = np.nan res_f0_vals = f0_vals if isinstance(M, dict): # FAST PATH: precomputed spectral feature slices passed in from dataCompilation _sc = np.asarray(M["spec_centroid"]).flatten() _sb = np.asarray(M["spec_bandwidth"]).flatten() _sco = np.asarray(M["spec_contrast"]) # (bands, frames) or (frames,) _sf = np.asarray(M["spec_flatness"]).flatten() _sr = np.asarray(M["spec_rolloff"]).flatten() res_spec_centroid = _sc.reshape(1, -1) res_mean_spec_centroid = float(np.nanmean(_sc)) res_spec_bandwidth = _sb.reshape(1, -1) res_mean_spec_bandwidth = float(np.nanmean(_sb)) res_spec_contrast = _sco res_mean_spec_contrast = float(np.nanmean(_sco)) res_spec_flatness = _sf.reshape(1, -1) res_mean_spec_flatness = float(np.nanmean(_sf)) res_spec_rolloff = _sr.reshape(1, -1) res_mean_spec_rolloff = float(np.nanmean(_sr)) else: # LEGACY PATH: raw spectrogram slice [freq x frames] if gt_flag: M = np.abs(M) ** 2 # Spectral bandwidth res_spec_bandwidth = librosa.feature.spectral_bandwidth(S=M) res_mean_spec_bandwidth = np.mean(res_spec_bandwidth) # Spectral centroid res_spec_centroid = librosa.feature.spectral_centroid(S=M) res_mean_spec_centroid = np.mean(res_spec_centroid) # Spectral contrast res_spec_contrast = librosa.feature.spectral_contrast(S=M) res_mean_spec_contrast = np.mean(res_spec_contrast) # Spectral flatness res_spec_flatness = librosa.feature.spectral_flatness(S=M) res_mean_spec_flatness = np.mean(res_spec_flatness) # Spectral rolloff res_spec_rolloff = librosa.feature.spectral_rolloff(S=M) res_mean_spec_rolloff = np.mean(res_spec_rolloff) res = { "ppitch": res_ppitch, "jitter": res_jitter, "vibrato_depth": res_vibrato_depth, "vibrato_rate": res_vibrato_rate, "shimmer": res_shimmer, "pwr_vals": res_pwr_vals, "f0_vals": res_f0_vals, "spec_centroid": res_spec_centroid, "mean_spec_centroid": res_mean_spec_centroid, "spec_bandwidth": res_spec_bandwidth, "mean_spec_bandwidth": res_mean_spec_bandwidth, "spec_contrast": res_spec_contrast, "mean_spec_contrast": res_mean_spec_contrast, "spec_flatness": res_spec_flatness, "mean_spec_flatness": res_mean_spec_flatness, "spec_rolloff": res_spec_rolloff, "mean_spec_rolloff": res_mean_spec_rolloff, } return res
[docs] def calculate_vibrato(note_vals, sr): """ Calculate the vibrato depth and rate from a note's frequency signal. Parameters ---------- note_vals : np.ndarray The time-domain signal values of the note, typically a 1D array representing the amplitude of the sound wave over time. sr : int The sampling rate of the signal in Hertz (samples per second). Returns ------- vibrato_depth : float The depth of the vibrato, calculated as twice the amplitude of the dominant frequency component. vibrato_rate : float The rate of the vibrato, in Hertz (Hz), derived from the position of the dominant frequency in the Fast Fourier Transform (FFT). """ L = len(note_vals) # Length of signal Y = np.fft.fft(note_vals) / L # Run FFT on normalized note vals w = np.arange(0, L) * sr / L # Set FFT frequency grid vibrato_depth_tmp, noteVibratoPos = max(abs(Y)), np.argmax(abs(Y)) vibrato_depth = vibrato_depth_tmp * 2 # Multiply by 2 (above and below zero) vibrato_rate = w[noteVibratoPos] # Index into FFT freq grid → Hz return vibrato_depth, vibrato_rate
[docs] def perceived_pitch(f0s, sr, gamma=100000): """ Calculate the perceived pitch of a note based on Gockel, H., B.J.C. Moore, and R.P. Carlyon. 2001. Influence of rate of change of frequency on the overall pitch of frequency-modulated Tones. Journal of the Acoustical Society of America. 109(2):701-12. Parameters ---------- f0s : np.ndarray Vector of fundamental frequency estimates, typically a 1D array representing frequency values over time. sr : int The sampling rate of the f0 estimates in Hertz (Hz). gamma : float, optional A parameter that sets the relative weighting of quickly changing versus slowly changing portions of notes. A high gamma value (e.g., 1000000) gives more weight to slowly changing portions. Default is 100000. Returns ------- pp1 : float The perceived pitch using the entire vector of f0 estimates. pp2 : float The perceived pitch using the central 80% of f0 estimates. """ # Remove all NaNs in the f0 vector f0s = f0s[~np.isnan(f0s)] # Need at least 2 voiced frames for meaningful pitch estimates if len(f0s) < 2: if len(f0s) == 1: return float(f0s[0]), float(f0s[0]) return np.nan, np.nan # Calculate the rate of change (derivative) deriv = np.diff(f0s) * sr deriv = np.append(deriv, deriv[-1]) # Extend to match original length # Weights based on inverse of rate of change weights = 1.0 / np.clip(np.abs(deriv), 1e-6, None) weights /= np.sum(weights) # Normalize weights # pp1: weighted average of all f0s pp1 = float(np.dot(f0s, weights)) # pp2: weighted average of central 80% of f0 estimates ord_ = np.argsort(f0s) ind = ord_[int(np.ceil(len(f0s) * 0.1)) : int(np.floor(len(f0s) * 0.9))] if len(ind) == 0: pp2 = pp1 else: w_sum = np.sum(weights[ind]) pp2 = float(np.dot(f0s[ind], weights[ind]) / w_sum) if w_sum > 0 else pp1 return pp1, pp2