# Source code for dodiscover.ci.kernel_utils

from typing import Optional, Tuple

import numpy as np
from numpy.typing import ArrayLike
from scipy.linalg import logm
from scipy.optimize import minimize_scalar
from scipy.stats import gaussian_kde
from sklearn.metrics import pairwise_distances, pairwise_kernels

def von_neumann_divergence(A: ArrayLike, B: ArrayLike) -> float:
    """Compute the Von Neumann divergence between two PSD matrices.

    Parameters
    ----------
    A : ArrayLike of shape (n_features, n_features)
        The first PSD matrix.
    B : ArrayLike of shape (n_features, n_features)
        The second PSD matrix.

    Returns
    -------
    div : float
        The divergence value.

    Notes
    -----
    The Von Neumann divergence, also known as the Bregman divergence
    in :footcite:`Yu2020Bregman`, is computed as
    :math:`D(A || B) = Tr(A (log(A) - log(B)) - A + B)`.
    """
    div = np.trace(A.dot(logm(A) - logm(B)) - A + B)
    return div
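As a quick sanity check, here is a minimal usage sketch (assuming the module is importable as `dodiscover.ci.kernel_utils`, per the title above; the matrices below are arbitrary small PSD examples). The divergence vanishes when the two matrices coincide and is positive when they differ.

```python
# Illustrative sketch only; A and B are arbitrary small PSD matrices.
import numpy as np

from dodiscover.ci.kernel_utils import von_neumann_divergence

A = np.array([[2.0, 0.5], [0.5, 1.0]])
B = np.eye(2)

print(von_neumann_divergence(A, A))  # ~0.0: log(A) - log(A) vanishes
print(von_neumann_divergence(A, B))  # > 0: A and B differ
```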
def f_divergence_score(y_stat_q: ArrayLike, y_stat_p: ArrayLike) -> float:
    r"""Compute the f-divergence upper bound on KL-divergence.

    See definition 4 in :footcite:`Mukherjee2020ccmi`, where the function
    is reversed to give an upper bound for the sake of gradient descent.
    The f-divergence bound gives an upper bound on KL-divergence:

    .. math::
        D_{KL}(p || q) \le \sup_f E_{x \sim q}[\exp(f(x) - 1)] - E_{x \sim p}[f(x)]

    Parameters
    ----------
    y_stat_q : ArrayLike of shape (n_samples_q,)
        Samples from the distribution Q, the variational class.
    y_stat_p : ArrayLike of shape (n_samples_p,)
        Samples from the distribution P, the joint distribution.

    Returns
    -------
    f_div : float
        The f-divergence score.
    """
    f_div = np.mean(np.exp(y_stat_q - 1)) - np.mean(y_stat_p)
    return f_div
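A hedged sketch of the call on synthetic statistics: in practice `y_stat_p` and `y_stat_q` would be the outputs of a learned function f evaluated on samples from P and Q, but random draws suffice to illustrate the interface.

```python
# Illustrative sketch only; the statistics are synthetic stand-ins for the
# outputs of a learned function f evaluated on samples from P and Q.
import numpy as np

from dodiscover.ci.kernel_utils import f_divergence_score

rng = np.random.default_rng(0)
y_stat_p = rng.normal(loc=1.0, scale=0.1, size=500)  # f(x) for x ~ P
y_stat_q = rng.normal(loc=0.5, scale=0.1, size=500)  # f(x) for x ~ Q

# E_q[exp(f(x) - 1)] - E_p[f(x)], an upper bound on D_KL(p || q)
print(f_divergence_score(y_stat_q, y_stat_p))
```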
def kl_divergence_score(y_stat_q: ArrayLike, y_stat_p: ArrayLike, eps: float) -> float:
    r"""Compute an estimate of the KL-divergence.

    See definition 4 in :footcite:`Mukherjee2020ccmi`, where the function
    is reversed to give an upper bound for the sake of gradient descent.
    The KL-divergence can be estimated with the following formula:

    .. math::
        \hat{D}_{KL}(p || q) = \frac{1}{n} \sum_{i=1}^n \log L(Y_i^p) -
        \log \left( \frac{1}{m} \sum_{j=1}^m L(Y_j^q) \right)

    Parameters
    ----------
    y_stat_q : ArrayLike of shape (n_samples_q,)
        Samples from the distribution Q, the variational class.
        This corresponds to the :math:`Y_j^q` samples.
    y_stat_p : ArrayLike of shape (n_samples_p,)
        Samples from the distribution P, the joint distribution.
        This corresponds to the :math:`Y_i^p` samples.
    eps : float
        A small constant added for numerical stability when forming
        the likelihood ratios.

    Returns
    -------
    kldiv : float
        The KL-divergence score.
    """
    # compute the point-wise likelihood ratio for each prediction
    p_l_ratio = (y_stat_p + eps) / np.abs(1 - y_stat_p - eps)
    q_l_ratio = (y_stat_q + eps) / np.abs(1 - y_stat_q - eps)

    # now compute the KL-divergence estimate
    kldiv = np.mean(np.log(p_l_ratio)) - np.log(np.mean(q_l_ratio))
    return kldiv
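Another hedged sketch: given the likelihood-ratio form L(y) = y / (1 - y) used in the body, the inputs are presumably classifier-style probabilities in (0, 1); the values below are synthetic.

```python
# Illustrative sketch only; the inputs are synthetic classifier-style
# probabilities in (0, 1), matching the likelihood ratio L(y) = y / (1 - y).
import numpy as np

from dodiscover.ci.kernel_utils import kl_divergence_score

rng = np.random.default_rng(0)
y_stat_p = rng.uniform(0.6, 0.9, size=500)  # predictions on samples from P
y_stat_q = rng.uniform(0.1, 0.4, size=500)  # predictions on samples from Q

print(kl_divergence_score(y_stat_q, y_stat_p, eps=1e-10))
```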
def corrent_matrix(
    data: ArrayLike,
    metric: str = "rbf",
    kwidth: Optional[float] = None,
    distance_metric: str = "euclidean",
    n_jobs: Optional[int] = None,
) -> ArrayLike:
    r"""Compute the centered correntropy matrix of a data matrix.

    Parameters
    ----------
    data : ArrayLike of shape (n_samples, n_features)
        The data.
    metric : str, optional
        The kernel metric, by default 'rbf'.
    kwidth : float, optional
        The kernel width, by default None, in which case it is estimated
        from the data.
    distance_metric : str, optional
        The distance metric used to infer the kernel width, by default
        'euclidean'.
    n_jobs : int, optional
        The number of jobs to run computations in parallel, by default None.

    Returns
    -------
    corren_arr : ArrayLike of shape (n_features, n_features)
        A symmetric centered correntropy matrix of the data.

    Notes
    -----
    The estimator of the correntropy matrix is given by the formula
    :math:`\frac{1}{N} \sum_{i=1}^N k(x_i, y_i) -
    \frac{1}{N^2} \sum_{i=1}^N \sum_{j=1}^N k(x_i, y_j)`.
    The first term is the sample estimate and the second term is its
    bias; subtracting the bias yields an unbiased estimate.
    """
    n_samples, n_features = data.shape
    corren_arr = np.zeros(shape=(n_features, n_features))

    # compute the (n_samples, n_samples) kernel between each pair of features;
    # note the kernel width estimated on the first pair is reused for all
    # subsequent pairs
    for idx in range(n_features):
        for jdx in range(idx + 1):
            K, kwidth = compute_kernel(
                data[:, idx][:, np.newaxis],
                data[:, jdx][:, np.newaxis],
                metric=metric,
                distance_metric=distance_metric,
                kwidth=kwidth,
                centered=False,
                n_jobs=n_jobs,
            )

            # compute the bias due to finite samples
            bias = np.sum(K) / n_samples**2

            # compute the sample centered correntropy
            corren = (1.0 / n_samples) * np.trace(K) - bias
            corren_arr[idx, jdx] = corren_arr[jdx, idx] = corren
    return corren_arr
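A minimal usage sketch on random data (the shapes and seed are arbitrary), showing that the result is a symmetric (n_features, n_features) matrix:

```python
# Illustrative sketch only; the data matrix is random.
import numpy as np

from dodiscover.ci.kernel_utils import corrent_matrix

rng = np.random.default_rng(0)
data = rng.standard_normal((100, 3))  # 100 samples, 3 features

C = corrent_matrix(data, metric="rbf")
print(C.shape)              # (3, 3)
print(np.allclose(C, C.T))  # True: symmetric by construction
```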
def compute_kernel(
    X: ArrayLike,
    Y: Optional[ArrayLike] = None,
    metric: str = "rbf",
    distance_metric: str = "euclidean",
    kwidth: Optional[float] = None,
    centered: bool = True,
    n_jobs: Optional[int] = None,
) -> Tuple[ArrayLike, float]:
    """Compute a kernel matrix and the corresponding width.

    Optionally estimates the kernel width parameter if it is not given.

    Parameters
    ----------
    X : ArrayLike of shape (n_samples_X, n_features_X)
        The X array.
    Y : ArrayLike of shape (n_samples_Y, n_features_Y), optional
        The Y array, by default None.
    metric : str, optional
        The metric to compute the kernel function, by default 'rbf'.
        Can be any string as defined in
        :func:`sklearn.metrics.pairwise.pairwise_kernels`. Note that
        'rbf' and 'gaussian' are the same metric.
    distance_metric : str, optional
        The distance metric to compute distances among samples within
        each data matrix, by default 'euclidean'. Can be any valid string
        as defined in :func:`sklearn.metrics.pairwise_distances`.
    kwidth : float, optional
        The kernel width, by default None, in which case it is estimated
        as the median pairwise distance among the samples of X.
    centered : bool, optional
        Whether to center the kernel matrix, by default True.
    n_jobs : int, optional
        The number of jobs to run computations in parallel, by default None.

    Returns
    -------
    kernel : ArrayLike of shape (n_samples_X, n_samples_X) or (n_samples_X, n_samples_Y)
        The kernel matrix.
    med : float
        The estimated kernel width.
    """
    # if the kernel width is not set, use the median trick to set it
    # based on the data X
    if kwidth is None:
        med = _estimate_kwidth(X, method="median", distance_metric=distance_metric, n_jobs=n_jobs)
    else:
        med = kwidth

    extra_kwargs = dict()
    if metric == "rbf":
        # convert the kernel width into the gamma parameter of the Gaussian kernel
        gamma = 1.0 / (2 * (med**2))
        extra_kwargs["gamma"] = gamma
    elif metric == "polynomial":
        degree = 2
        extra_kwargs["degree"] = degree

    # compute the kernel, pairwise between X and Y if Y is given
    kernel = pairwise_kernels(X, Y=Y, metric=metric, n_jobs=n_jobs, **extra_kwargs)

    if centered:
        kernel = _center_kernel(kernel)

    return kernel, med
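A minimal usage sketch (random data, default settings), showing the median-heuristic width being estimated and returned alongside the centered kernel:

```python
# Illustrative sketch only; X is random.
import numpy as np

from dodiscover.ci.kernel_utils import compute_kernel

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 2))

K, kwidth = compute_kernel(X, metric="rbf", centered=True)
print(K.shape)  # (50, 50)
print(kwidth)   # the median pairwise euclidean distance among rows of X
```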
def _estimate_kwidth(
    X: ArrayLike,
    method: str = "scott",
    distance_metric: Optional[str] = None,
    n_jobs: Optional[int] = None,
) -> float:
    """Estimate the kernel width.

    Parameters
    ----------
    X : ArrayLike of shape (n_samples, n_features)
        The data.
    method : str, optional
        Method to use, by default "scott". One of 'scott', 'silverman'
        or 'median'.
    distance_metric : str, optional
        The distance metric to compute distances among samples within
        each data matrix, by default None. Can be any valid string as
        defined in :func:`sklearn.metrics.pairwise_distances`.
    n_jobs : int, optional
        The number of jobs to run computations in parallel, by default None.

    Returns
    -------
    kwidth : float
        The estimated kernel width for X.
    """
    if method == "scott":
        kde = gaussian_kde(X)
        kwidth = kde.scotts_factor()
    elif method == "silverman":
        kde = gaussian_kde(X)
        kwidth = kde.silverman_factor()
    elif method == "median":
        # Note: sigma = 1 / np.sqrt(kwidth)
        # compute the N x N pairwise distance matrix
        dists = pairwise_distances(X, metric=distance_metric, n_jobs=n_jobs)

        # compute the median of the off-diagonal elements
        med = np.median(dists[dists > 0])

        # prevents division by zero when used on label vectors
        kwidth = med if med else 1
    return kwidth


def _center_kernel(K: ArrayLike):
    """Center a kernel matrix.

    Applies the transformation H @ K @ H, where
    :math:`H = I - \\frac{1}{n} 1 1^T` is the centering matrix.

    Parameters
    ----------
    K : ArrayLike of shape (n_samples, n_samples)
        The kernel matrix.

    Returns
    -------
    K : ArrayLike of shape (n_samples, n_samples)
        The centered kernel matrix.
    """
    n = K.shape[0]
    # broadcasting subtracts 1/n from every entry of the identity,
    # yielding H = I - (1/n) * ones((n, n))
    H = np.eye(n) - 1.0 / n
    return H.dot(K).dot(H)


def _default_regularization(K: ArrayLike) -> float:
    """Compute a default regularization for kernel logistic regression.

    Parameters
    ----------
    K : ArrayLike of shape (n_samples, n_samples)
        The kernel matrix.

    Returns
    -------
    x : float
        The default L2 regularization term.
    """
    n_samples = K.shape[0]
    svals = np.linalg.svd(K, compute_uv=False, hermitian=True)
    res = minimize_scalar(
        lambda reg: np.sum(svals**2 / (svals + reg) ** 2) / n_samples + reg,
        bounds=(0.0001, 1000),
        method="bounded",
    )
    return res.x
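To see the effect of the centering matrix H = I - (1/n) 11^T described above, one can check that a centered kernel has rows and columns summing to zero, since H annihilates the all-ones vector. A minimal sketch via the public `compute_kernel`:

```python
# Illustrative sketch only; verifies the H @ K @ H centering property.
import numpy as np

from dodiscover.ci.kernel_utils import compute_kernel

rng = np.random.default_rng(0)
X = rng.standard_normal((20, 2))

K, _ = compute_kernel(X, centered=True)
print(np.allclose(K.sum(axis=0), 0))  # True: column sums vanish
print(np.allclose(K.sum(axis=1), 0))  # True: row sums vanish
```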