import numpy as np
[docs]
def cde_loss(cde_estimates: np.ndarray, y_grid: np.ndarray, y_test: np.ndarray) -> tuple:
    """
    Calculates conditional density estimation loss on holdout data.

    For every observation the loss is the trapezoidal integral of the squared
    density estimate over ``y_grid`` minus twice the estimate evaluated at the
    grid point closest to the observed value. The reported loss is the mean of
    these per-observation values.

    Args:
        cde_estimates (numpy.array): A 2D array where each row is a density estimate on y_grid.
        y_grid (numpy.array): An array of the grid points at which cde_estimates is evaluated.
            Either 1D or a single-column 2D array.
        y_test (numpy.array): An array of the true y values corresponding to the rows of
            cde_estimates. Either 1D or a single-column 2D array.

    Returns:
        tuple: A tuple containing the loss and the standard error of the loss.

    Raises:
        ValueError: If the dimensions of the inputs are not compatible, or if the
            targets are not one-dimensional.
    """
    # Work with column vectors so that the shape checks below are uniform.
    if y_test.ndim == 1:
        y_test = y_test.reshape(-1, 1)
    if y_grid.ndim == 1:
        y_grid = y_grid.reshape(-1, 1)

    n_obs, n_grid = cde_estimates.shape
    n_samples, feats_samples = y_test.shape
    n_grid_points, feats_grid = y_grid.shape

    if n_obs != n_samples:
        raise ValueError(
            f"Number of samples in CDEs should be the same as in y_test. Currently {n_obs} and {n_samples}."
        )
    if n_grid != n_grid_points:
        raise ValueError(
            f"Number of grid points in CDEs should be the same as in y_grid. Currently {n_grid} and {n_grid_points}."
        )
    if feats_samples != feats_grid:
        raise ValueError(
            f"Dimensionality of test points and grid points need to coincide. Currently {feats_samples} and {feats_grid}."
        )
    if feats_grid != 1:
        # The 1D trapezoidal integral and the nearest-grid-point lookup below are
        # only meaningful for scalar targets; fail early with a clear message.
        raise ValueError(
            f"Only one-dimensional targets are supported. Currently {feats_grid}."
        )

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    # Take the single column explicitly: np.squeeze would collapse a one-point
    # grid to a 0-d array, which the integration routine cannot handle.
    grid_1d = y_grid[:, 0]
    integrals = integrate(cde_estimates**2, grid_1d, axis=1)

    # (n_grid, 1) - (1, n_samples) broadcasts to (n_grid, n_samples); the argmin
    # over axis 0 is the index of the grid point nearest to each test value.
    nn_ids = np.argmin(np.abs(y_grid - y_test.T), axis=0)
    likeli = cde_estimates[np.arange(n_samples), nn_ids]

    losses = integrals - 2 * likeli
    loss = np.mean(losses)
    se_error = np.std(losses, axis=0) / (n_obs**0.5)
    return loss, se_error
[docs]
def kolmogorov_smirnov_statistic(cdf_test: np.ndarray, cdf_ref: np.ndarray) -> np.ndarray:
    """
    Compute the Kolmogorov-Smirnov distance between two CDFs given on a common grid.

    The statistic is the largest absolute pointwise gap between the two
    curves, taken along the last axis of the inputs.

    Args:
        cdf_test (np.ndarray): CDF of the test distribution.
        cdf_ref (np.ndarray): CDF of the reference distribution on the same grid.

    Returns:
        np.ndarray: The Kolmogorov-Smirnov statistic.
    """
    pointwise_gap = cdf_test - cdf_ref
    return np.max(np.absolute(pointwise_gap), axis=-1)
[docs]
def cramer_von_mises_statistic(cdf_test: np.ndarray, cdf_ref: np.ndarray) -> np.ndarray:
    """
    Calculates the Cramer-von Mises statistic between two cumulative distribution functions (CDFs).

    The squared difference of the two CDFs is integrated with the trapezoidal
    rule along the last axis, using the reference CDF values as the integration
    variable, and the square root of that integral is returned.

    Args:
        cdf_test (np.ndarray): CDF of the test distribution.
        cdf_ref (np.ndarray): CDF of the reference distribution on the same grid.

    Returns:
        np.ndarray: The Cramer-von Mises statistic.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    diff = (cdf_test - cdf_ref) ** 2
    cvm2 = integrate(diff, cdf_ref, axis=-1)
    return np.sqrt(cvm2)
[docs]
def anderson_darling_statistic(cdf_test: np.ndarray, cdf_ref: np.ndarray, n_tot: int = 1) -> np.ndarray:
    """
    Calculates the Anderson-Darling statistic between two cumulative distribution functions (CDFs).

    The squared difference of the two CDFs, weighted by
    ``1 / (cdf_ref * (1 - cdf_ref))``, is integrated with the trapezoidal rule
    along the last axis using the reference CDF values as the integration
    variable. The integral is scaled by ``n_tot`` and its square root returned.

    Grid points where the reference CDF is exactly 0 or 1 *and* both CDFs agree
    would evaluate to 0/0; they are treated as contributing nothing to the
    integral instead of turning the whole statistic into NaN.

    Args:
        cdf_test (np.ndarray): CDF of the test distribution.
        cdf_ref (np.ndarray): CDF of the reference distribution on the same grid.
        n_tot (int): Scaling factor equal to the number of PDFs used to construct ECDF.

    Returns:
        np.ndarray: The Anderson-Darling statistic.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    num = (cdf_test - cdf_ref) ** 2
    den = cdf_ref * (1 - cdf_ref)

    # Where numerator and weight vanish together (typically the grid end points,
    # where both CDFs are 0 or 1) divide by 1 instead so the integrand is 0.
    # Any other zero-weight point is left untouched and still yields inf.
    both_zero = (num == 0) & (den == 0)
    integrand = num / np.where(both_zero, 1.0, den)

    ad2 = n_tot * integrate(integrand, cdf_ref, axis=-1)
    return np.sqrt(ad2)