# Source code for calpit.metrics

import numpy as np


def cde_loss(cde_estimates: np.ndarray, y_grid: np.ndarray, y_test: np.ndarray) -> tuple:
    """
    Calculates conditional density estimation loss on holdout data.

    For every observation the loss is the trapezoidal integral of the squared
    density over ``y_grid`` minus twice the density read off at the grid point
    nearest to the observed value. The reported loss is the mean over
    observations.

    Args:
        cde_estimates (numpy.array): An array where each row is a density estimate on y_grid.
        y_grid (numpy.array): An array of the grid points at which cde_estimates is evaluated.
        y_test (numpy.array): An array of the true y values corresponding to the rows of cde_estimates.

    Returns:
        tuple: A tuple containing the loss and the standard error of the loss.

    Raises:
        ValueError: If the dimensions of the input tensors are not compatible.
    """
    # Promote 1-D inputs to column vectors so both carry an explicit feature axis.
    if y_test.ndim == 1:
        y_test = y_test.reshape(-1, 1)
    if y_grid.ndim == 1:
        y_grid = y_grid.reshape(-1, 1)

    n_obs, n_grid = cde_estimates.shape
    n_samples, feats_samples = y_test.shape
    n_grid_points, feats_grid = y_grid.shape

    if n_obs != n_samples:
        raise ValueError(
            f"Number of samples in CDEs should be the same as in y_test. Currently {n_obs} and {n_samples}."
        )
    if n_grid != n_grid_points:
        raise ValueError(
            f"Number of grid points in CDEs should be the same as in y_grid. Currently {n_grid} and {n_grid_points}."
        )
    if feats_samples != feats_grid:
        raise ValueError(
            f"Dimensionality of test points and grid points need to coincide. Currently {feats_samples} and {feats_grid}."
        )

    # NOTE(review): the integration and nearest-grid lookup below appear to
    # assume a single feature column -- confirm before passing multi-dimensional y.

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use
    # whichever one this NumPy provides.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    integrals = trapezoid(cde_estimates**2, np.squeeze(y_grid), axis=1)

    # Index of the grid point closest to each observed value:
    # (n_grid, 1) - (1, n_samples) broadcasts to (n_grid, n_samples).
    nn_ids = np.argmin(np.abs(y_grid - y_test.T), axis=0)
    # Density of each observation evaluated at its nearest grid point.
    likeli = cde_estimates[np.arange(n_samples), nn_ids]

    losses = integrals - 2 * likeli
    loss = np.mean(losses)
    se_error = np.std(losses, axis=0) / (n_obs**0.5)

    return loss, se_error
def kolmogorov_smirnov_statistic(cdf_test: np.ndarray, cdf_ref: np.ndarray) -> np.ndarray:
    """
    Calculate the Kolmogorov-Smirnov statistic between two cumulative distribution functions (CDFs).

    The statistic is the largest absolute gap between the two CDFs, taken
    along the last axis.

    Args:
        cdf_test (np.ndarray): CDF of the test distribution.
        cdf_ref (np.ndarray): CDF of the reference distribution on the same grid.

    Returns:
        np.ndarray: The Kolmogorov-Smirnov statistic.
    """
    signed_gap = cdf_test - cdf_ref
    abs_gap = np.absolute(signed_gap)
    # Reduce over the grid axis only, so leading (batch) axes are preserved.
    return np.max(abs_gap, axis=-1)
def cramer_von_mises_statistic(cdf_test: np.ndarray, cdf_ref: np.ndarray) -> np.ndarray:
    """
    Calculates the Cramer-von Mises statistic between two cumulative distribution functions (CDFs).

    The squared difference of the CDFs is integrated with the trapezoidal rule
    against ``cdf_ref`` along the last axis, and the square root of that
    integral is returned.

    Args:
        cdf_test (np.ndarray): CDF of the test distribution.
        cdf_ref (np.ndarray): CDF of the reference distribution on the same grid.

    Returns:
        np.ndarray: The Cramer-von Mises statistic.
    """
    diff = (cdf_test - cdf_ref) ** 2

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use
    # whichever one this NumPy provides so no DeprecationWarning is raised.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    cvm2 = trapezoid(diff, cdf_ref, axis=-1)

    return np.sqrt(cvm2)
def anderson_darling_statistic(cdf_test: np.ndarray, cdf_ref: np.ndarray, n_tot: int = 1) -> np.ndarray:
    """
    Calculates the Anderson-Darling statistic between two cumulative distribution functions (CDFs).

    The squared difference of the CDFs, weighted by
    ``1 / (cdf_ref * (1 - cdf_ref))``, is integrated with the trapezoidal rule
    against ``cdf_ref`` along the last axis, scaled by ``n_tot``, and the
    square root is returned.

    Grid points where ``cdf_ref`` is exactly 0 or 1 make the weight undefined.
    Where the two CDFs also agree there (a 0/0 integrand) the contribution is
    taken to be 0 instead of NaN; a non-zero difference at such a point still
    yields an infinite integrand.

    Args:
        cdf_test (np.ndarray): CDF of the test distribution (1D array).
        cdf_ref (np.ndarray): CDF of the reference distribution on the same grid (1D array).
        n_tot (int): Scaling factor equal to the number of PDFs used to construct ECDF.

    Returns:
        np.ndarray: The Anderson-Darling statistic.
    """
    num = (cdf_test - cdf_ref) ** 2
    den = cdf_ref * (1 - cdf_ref)

    # Division by zero is handled explicitly below, so silence the warnings here.
    with np.errstate(divide="ignore", invalid="ignore"):
        integrand = np.true_divide(num, den)
    # Resolve 0/0 (CDFs coincide where cdf_ref is 0 or 1) to 0 rather than NaN.
    integrand = np.where((den == 0) & (num == 0), 0.0, integrand)

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use
    # whichever one this NumPy provides.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    ad2 = n_tot * trapezoid(integrand, cdf_ref, axis=-1)

    return np.sqrt(ad2)
def probability_integral_transform(cde: np.ndarray, y_grid: np.ndarray, y_test: np.ndarray) -> np.ndarray:
    """
    Calculates the Probability Integral Transform (PIT) based on Conditional Density Estimates (CDE).

    For each observation the density is integrated with the trapezoidal rule
    over those grid intervals whose two end points both do not exceed the
    observed value. For an ascending grid this is the integral up to the
    largest grid point not greater than ``y_test``.

    Args:
        cde (np.ndarray): A numpy array of conditional density estimates.
            Each row corresponds to an observation, each column corresponds to a grid point.
        y_grid (np.ndarray): A numpy array of the grid points at which cde is evaluated.
        y_test (np.ndarray): A numpy array of the true y values corresponding to the rows of cde.

    Returns:
        np.ndarray: A numpy array of PIT values.

    Raises:
        ValueError: If the number of samples in cde is not the same as in y_test,
            or if the number of grid points in cde is not the same as in y_grid.
    """
    # flatten the input arrays to 1D
    y_grid = np.ravel(y_grid)
    y_test = np.ravel(y_test)

    # Sanity checks
    nrow_cde, ncol_cde = cde.shape
    n_samples = y_test.shape[0]
    n_grid_points = y_grid.shape[0]

    if nrow_cde != n_samples:
        raise ValueError(
            f"Number of samples in CDEs should be the same as in y_test. Currently {nrow_cde} and {n_samples}."
        )
    if ncol_cde != n_grid_points:
        raise ValueError(
            f"Number of grid points in CDEs should be the same as in y_grid. Currently {ncol_cde} and {n_grid_points}."
        )

    # Area of every trapezoid between neighbouring grid points, per observation.
    widths = np.diff(y_grid)
    areas = widths * (cde[:, 1:] + cde[:, :-1]) / 2.0

    # A trapezoid counts only if neither of its end points lies above y_test.
    above = y_grid > y_test[:, np.newaxis]
    keep = ~(above[:, 1:] | above[:, :-1])

    return np.where(keep, areas, 0.0).sum(axis=1)