Source code for calpit.utils

from scipy.stats import binom
from matplotlib import pyplot as plt
import numpy as np
import torch


[docs]
def normalize(
    cde_estimates: np.ndarray, y_grid: np.ndarray, tol: float = 1e-6, max_iter: int = 200
) -> np.ndarray:
    """
    Normalizes conditional density estimates to be non-negative and integrate to one.

    Args:
        cde_estimates (numpy.ndarray): A numpy array or matrix of conditional density estimates.
        x_grid (numpy.ndarray): The array of grid points.
        tol (float): The tolerance to accept for abs(area - 1).
        max_iter (int): The maximal number of search iterations.

    Returns:
        numpy.ndarray: The normalized conditional density estimates.

    """
    if cde_estimates.ndim == 1:
        normalized_cde = _normalize(cde_estimates, y_grid, tol, max_iter)
    else:
        normalized_cde = np.apply_along_axis(_normalize, 1, cde_estimates, y_grid, tol=tol, max_iter=max_iter)
    return normalized_cde




[docs]
def _normalize(density, y_grid, tol=1e-6, max_iter=500):
    # TODO: Use an alternate root finding method to vectorize this
    hi = np.max(density)
    lo = 0.0

    area = np.trapz(np.maximum(density, 0.0), y_grid)
    if area == 0.0:
        # replace with uniform if all negative density
        density[:] = 1 / (y_grid.max() - y_grid.min())
    elif area < 1:
        density /= area
        density[density < 0.0] = 0.0
        return density

    for _ in range(max_iter):
        mid = (hi + lo) / 2
        area = np.trapz(np.maximum(density - mid, 0.0), y_grid)
        if abs(1.0 - area) <= tol:
            break
        if area < 1.0:
            hi = mid
        else:
            lo = mid

    # update in place
    density -= mid
    density[density < 0.0] = 0.0

    return density




[docs]
def trapz_grid(y: np.ndarray, x: np.ndarray) -> np.ndarray:
    """
    Does trapezoid integration between the same limits as the grid.

    Args:
        y (np.ndarray): The array of values to integrate.
        x (np.ndarray): The array of grid points.

    Returns:
        np.ndarray: The integrated values.

    """
    dx = np.diff(x)
    trapz_area = dx * (y[:, 1:] + y[:, :-1]) / 2
    integral = np.cumsum(trapz_area, axis=-1)
    return np.hstack((np.zeros(len(integral))[:, None], integral))



[docs]
def trapz_grid_torch(y: torch.tensor, x: torch.tensor) -> torch.tensor:
    """
    Same as trapz_grid but implemented in Pytorch
    """
    dx = torch.diff(x)
    trapz_area = dx * (y[:,1:] + y[:,:-1]) / 2
    integral = torch.cumsum(trapz_area, axis=-1)
    return torch.hstack((torch.zeros(len(integral), device=x.device)[:,None], integral))


    

[docs]
def plot_pit(pit_values, ci_level, n_bins=30, y_true=None, ax=None, **fig_kw):
    """
    Plots the PIT/HPD histogram and calculates the confidence interval for the bin values,
    were the PIT/HPD values follow an uniform distribution

    @param values: a numpy array with PIT/HPD values
    @param ci_level: a float between 0 and 1 indicating the size of the confidence level
    @param x_label: a string, populates the x_label of the plot
    @param n_bins: an integer, the number of bins in the histogram
    @param figsize: a tuple, the plot size (width, height)
    @param ylim: a list of two elements, including the lower and upper limit for the y axis
    @returns The matplotlib figure object with the histogram of the PIT/HPD values
    and the CI for the uniform distribution
    """

    # Extract the number of CDEs
    n = pit_values.shape[0]

    # Creating upper and lower limit for selected uniform band
    ci_quantity = (1 - ci_level) / 2
    low_lim = binom.ppf(q=ci_quantity, n=n, p=1 / n_bins)
    upp_lim = binom.ppf(q=ci_level + ci_quantity, n=n, p=1 / n_bins)

    # Creating figure

    if ax is None:
        fig, ax = plt.subplots(1, 2, **fig_kw)

    # plot PIT histogram
    ax[0].hist(pit_values, bins=n_bins)
    ax[0].axhline(y=low_lim, color="grey")
    ax[0].axhline(y=upp_lim, color="grey")
    ax[0].axhline(y=n / n_bins, label="Uniform Average", color="red")
    ax[0].fill_between(
        x=np.linspace(0, 1, 100),
        y1=np.repeat(low_lim, 100),
        y2=np.repeat(upp_lim, 100),
        color="grey",
        alpha=0.2,
    )
    ax[0].set_xlabel("PIT Values")
    ax[0].legend(loc="best")

    # plot P-P plot
    prob_theory = np.linspace(0.01, 0.99, 100)
    prob_data = [np.sum(pit_values < i) / len(pit_values) for i in prob_theory]
    # # plot Q-Q
    # quants = np.linspace(0, 100, 100)
    # quant_theory = quants/100.
    # quant_data = np.percentile(pit_values,quants)

    ax[1].scatter(prob_theory, prob_data, marker=".")
    ax[1].plot(prob_theory, prob_theory, c="k", ls="--")
    ax[1].set_xlim(0, 1)
    ax[1].set_ylim(0, 1)
    ax[1].set_xlabel("Expected Cumulative Probability")
    ax[1].set_ylabel("Empirical Cumulative Probability")
    xlabels = np.linspace(0, 1, 6)[1:]
    ax[1].set_xticks(xlabels)
    ax[1].set_aspect("equal")
    if y_true is not None:
        ks = kolmogorov_smirnov_statistic(prob_data, prob_theory)
        ad = anderson_darling_statistic(prob_data, prob_theory, len(y_true))
        cvm = cramer_von_mises(prob_data, prob_theory)
        ax[1].text(0.05, 0.9, f"KS:  ${ks:.3f} $", fontsize=15)
        ax[1].text(0.05, 0.84, f"CvM:  ${cvm:.3f} $", fontsize=15)
        ax[1].text(0.05, 0.78, f"AD:  ${ad:.2f} $", fontsize=15)

    return fig, ax