Source code for pyampact.performance

"""
performance
==============


.. autosummary::
    :toctree: generated/

    estimate_perceptual_parameters
    calculate_vibrato
    perceived_pitch    
"""

import numpy as np
import librosa
import warnings

__all__ = [
    "estimate_perceptual_parameters",
    "calculate_vibrato",
    "perceived_pitch"
]

np.seterr(all='ignore')
warnings.filterwarnings(
    "ignore", message="Mean of empty slice.", category=RuntimeWarning)



[docs]
def estimate_perceptual_parameters(f0_vals, pwr_vals, M, SR, hop, gt_flag):
    """
    Estimates a range of performance parameters from the inputted fundamental
    frequency (f0_vales), power (pwr_vals), and spectrum estimates (M)

    Parameters
    ----------
    f0_vals : np.ndarray
        A vector of fundamental frequency estimates.

    pwr_vals : np.ndarray
        A vector of power values corresponding to the fundamental frequency estimates.

    M : np.ndarray
        The spectral estimates from which the perceptual parameters will be derived.

    SR : float
        The sample rate used to generate the fundamental frequency estimates.

    hop : float
        The hop size used in the analysis, expressed in seconds.

    gt_flag : bool
        A flag indicating whether to use ground truth information for the estimation.

    Returns
    -------
    pp1 : float
        The perceived pitch calculated using the entire vector of f0 estimates.
    pp2 : float
        The perceived pitch calculated using the central 80% of f0 estimates.

    """

    # Perceived pitch
    res_ppitch = perceived_pitch(f0_vals, SR)
    # Jitter
    tmp_jitter = np.abs(np.diff(f0_vals))
    res_jitter = np.mean(tmp_jitter)

    # Vibrato rate and depth
    mean_f0_vals = np.mean(f0_vals)
    detrended_f0_vals = f0_vals - mean_f0_vals
    res_vibrato_depth, res_vibrato_rate = calculate_vibrato(
        detrended_f0_vals, SR / hop)

    # Shimmer
    tmp_shimmer = 10 * np.log10(pwr_vals[1:] / pwr_vals[0])
    res_shimmer = np.mean(np.abs(tmp_shimmer))
    res_pwr_vals = 10 * np.log10(pwr_vals)
    res_f0_vals = f0_vals

    if gt_flag:
        M = np.abs(M) ** 2

    # spectral bandwidth
    res_spec_bandwidth = librosa.feature.spectral_bandwidth(S=M)
    res_mean_spec_bandwidth = np.mean(res_spec_bandwidth)

    # Spectral Centroid
    # S, phase = librosa.magphase(librosa.stft(y=y))
    res_spec_centroid = librosa.feature.spectral_centroid(S=M)
    res_mean_spec_centroid = np.mean(res_spec_centroid)

    # spectral contrast
    res_spec_contrast = librosa.feature.spectral_contrast(S=M)
    res_mean_spec_contrast = np.mean(res_spec_contrast)

    # spectral flatness
    res_spec_flatness = librosa.feature.spectral_flatness(S=M)
    res_mean_spec_flatness = np.mean(res_spec_flatness)

    # spectral rolloff
    res_spec_rolloff = librosa.feature.spectral_rolloff(S=M)
    res_mean_spec_rolloff = np.mean(res_spec_rolloff)

    # Spectral Flatness
    XLog = np.log(M + 1e-20)
    res_spec_flat = np.exp(np.mean(XLog, axis=0)) / np.mean(M, axis=0)
    res_spec_flat[np.sum(M, axis=0) == 0] = 0
    res_mean_spec_flat = np.mean(res_spec_flat)

    res = {
        "ppitch": res_ppitch,
        "jitter": res_jitter,
        "vibrato_depth": res_vibrato_depth,
        "vibrato_rate": res_vibrato_rate,
        "shimmer": res_shimmer,
        "pwr_vals": res_pwr_vals,
        "f0_vals": res_f0_vals,
        "spec_centroid": res_spec_centroid,
        "mean_spec_centroid": res_mean_spec_centroid,
        "spec_bandwidth": res_spec_bandwidth,
        "mean_spec_bandwidth": res_mean_spec_bandwidth,
        "spec_contrast": res_spec_contrast,
        "mean_spec_contrast": res_mean_spec_contrast,
        "spec_flatness": res_spec_flatness,
        "mean_spec_flatness": res_mean_spec_flatness,
        "spec_rolloff": res_spec_rolloff,
        "mean_spec_rolloff": res_mean_spec_rolloff,
    }

    return res




[docs]
def calculate_vibrato(note_vals, sr):
    """
    Calculate the vibrato depth and rate from a note's frequency signal.

    Parameters
    ----------
    note_vals : np.ndarray
        The time-domain signal values of the note, typically a 1D array representing 
        the amplitude of the sound wave over time.

    sr : int
        The sampling rate of the signal in Hertz (samples per second).

    Returns
    -------        
    vibrato_depth : float
        The depth of the vibrato, calculated as twice the amplitude of the 
        dominant frequency component.
    vibrato_rate : float
        The rate of the vibrato, in Hertz (Hz), derived from the position of 
        the dominant frequency in the Fast Fourier Transform (FFT).

    """
    L = len(note_vals)  # Length of signal
    Y = np.fft.fft(note_vals) / L  # Run FFT on normalized note vals
    w = np.arange(0, L) * sr / L  # Set FFT frequency grid

    vibrato_depth_tmp, noteVibratoPos = max(abs(Y)), np.argmax(
        abs(Y))  # Find the max value and its position
    # Multiply the max by 2 to find depth (above and below zero)
    vibrato_depth = vibrato_depth_tmp * 2
    # Index into FFT frequency grid to find position in Hz
    vibrato_rate = w[noteVibratoPos]

    return vibrato_depth, vibrato_rate




[docs]
def perceived_pitch(f0s, sr, gamma=100000):
    """
    Calculate the perceived pitch of a note based on 
    Gockel, H., B.J.C. Moore,and R.P. Carlyon. 2001. 
    Influence of rate of change of frequency on the overall 
    pitch of frequency-modulated Tones. Journal of the 
    Acoustical Society of America. 109(2):701?12.

    Parameters
    ----------
    f0s : np.ndarray
        Vector of fundamental frequency estimates, typically a 1D array 
        representing frequency values over time.

    sr : int
        The sampling rate of the f0 estimates in Hertz (Hz).

    gamma : float, optional
        A parameter that sets the relative weighting of quickly changing 
        versus slowly changing portions of notes. A high gamma value (e.g., 
        1000000) gives more weight to slowly changing portions. Default is 100000.

    Returns
    -------
    pp1 : float
        The perceived pitch using the entire vector of f0 estimates.
    pp2 : float
        The perceived pitch using the central 80% of f0 estimates.

    """

    # Remove all NaNs in the f0 vector
    f0s = f0s[~np.isnan(f0s)]

    # Calculate the rate of change (derivative)
    deriv = np.diff(f0s) * sr
    deriv = np.append(deriv, deriv[-1])  # Extend to match original length

    # Weights based on inverse of rate of change
    # Using np.clip to avoid division by zero for small values
    weights = 1.0 / np.clip(np.abs(deriv), 1e-6, None)
    weights /= np.sum(weights)  # Normalize weights

    # Calculate pp1: Weighted average of f0s based on smooth weights
    pp1 = np.dot(f0s, weights)

    # Calculate central 80% of the f0 vector
    ord = np.argsort(f0s)
    ind = ord[int(np.ceil(len(f0s) * 0.1)):int(np.floor(len(f0s) * 0.9))]

    # pp2: Weighted average of central 80% of f0 estimates
    pp2 = np.dot(f0s[ind], weights[ind]) / np.sum(weights[ind])

    return pp1, pp2