Source code for dodiscover.ci.cmi_test

from typing import Optional, Set, Tuple

import numpy as np
import pandas as pd
import scipy
import scipy.spatial
import scipy.special
from numpy.typing import ArrayLike
from sklearn.preprocessing import StandardScaler

from dodiscover.typing import Column

from .base import BaseConditionalIndependenceTest, CMIMixin


class CMITest(BaseConditionalIndependenceTest, CMIMixin):
    r"""Conditional mutual information independence test.

    Implements the conditional independence test using conditional
    mutual information proposed in :footcite:`Runge2018cmi`.

    Parameters
    ----------
    k : float, optional
        Number of nearest-neighbors for each sample point. If the number is
        smaller than 1, it is computed as a fraction of the number of
        samples, by default 0.2.
    transform : str, optional
        How to transform the data before estimation; one of 'rank' (the
        default, which converts each column to ranks), 'uniform', or
        'standardize'.
    n_jobs : int, optional
        The number of CPUs to use, by default -1, which corresponds to
        using all CPUs available.
    n_shuffle_nbrs : int, optional
        Number of nearest-neighbors within the Z covariates for shuffling,
        by default 5.
    n_shuffle : int
        The number of times to shuffle the dataset to generate the null
        distribution. By default, 100.
    random_seed : int, optional
        The random seed used to seed ``np.random.default_rng``.

    Notes
    -----
    Conditional mutual information (CMI) is defined as:

    .. math::

        I(X;Y|Z) = \iiint p(z) p(x,y|z)
        \log \frac{p(x,y|z)}{p(x|z) \cdot p(y|z)} \,dx \,dy \,dz

    It can be seen that when :math:`X \perp Y | Z`, CMI is equal to 0.
    Hence, CMI is a general measure of conditional dependence. The estimator
    of CMI proposed in :footcite:`Runge2018cmi` is a k-nearest-neighbor
    based estimator:

    .. math::

        \widehat{I}(X;Y|Z) = \psi(k) + \frac{1}{T} \sum_{t=1}^T
        \left(\psi(k_{Z,t}) - \psi(k_{XZ,t}) - \psi(k_{YZ,t})\right)

    where :math:`\psi` is the Digamma function (see
    :func:`scipy.special.digamma`). :math:`k` determines the size of the
    hyper-cube around each (high-dimensional) sample point, and
    :math:`k_{Z,t}, k_{XZ,t}, k_{YZ,t}` are the numbers of neighbors in the
    respective subspaces. :math:`k` can be viewed as a density-smoothing
    parameter (although it is data-adaptive, unlike fixed-bandwidth
    estimators). For large :math:`k`, the underlying dependencies are more
    smoothed, so CMI has a larger bias but a lower variance, which is more
    important for significance testing. Note that the estimated CMI values
    can be slightly negative even though CMI is a non-negative quantity.

    The estimator implemented here assumes the data is continuous.

    References
    ----------
    .. footbibliography::
    """

    random_state: np.random.Generator

    def __init__(
        self,
        k: float = 0.2,
        transform: str = "rank",
        n_jobs: int = -1,
        n_shuffle_nbrs: int = 5,
        n_shuffle: int = 100,
        random_seed: Optional[int] = None,
    ) -> None:
        self.k = k
        self.n_shuffle_nbrs = n_shuffle_nbrs
        self.transform = transform
        self.n_jobs = n_jobs
        self.n_shuffle = n_shuffle
        self.random_seed = random_seed
        self.random_state = np.random.default_rng(self.random_seed)
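    # Note on ``k``: values below 1 are interpreted as a fraction of the
    # sample size. For example, with 1,000 samples and the default k=0.2,
    # the estimator uses max(1, int(0.2 * 1000)) = 200 neighbors per point;
    # see ``_compute_cmi`` and ``_get_knn`` below.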
    def test(
        self,
        df: pd.DataFrame,
        x_vars: Set[Column],
        y_vars: Set[Column],
        z_covariates: Optional[Set] = None,
    ) -> Tuple[float, float]:
        """Test whether ``x_vars`` are independent of ``y_vars`` given ``z_covariates``.

        Returns the estimated CMI test statistic and the permutation pvalue.
        """
        if z_covariates is None:
            z_covariates = set()

        self._check_test_input(df, x_vars, y_vars, z_covariates)

        # preprocess and transform the data; called here only once
        df = self._preprocess_data(df)

        # compute the estimate of the CMI
        val = self._compute_cmi(df, x_vars, y_vars, z_covariates)

        # compute the significance of the CMI value
        null_dist = self._estimate_null_dist(
            df,
            x_vars,
            y_vars,
            z_covariates,
            n_shuffle_nbrs=self.n_shuffle_nbrs,
            n_shuffle=self.n_shuffle,
        )

        # compute pvalue
        pvalue = (null_dist >= val).mean()

        self.stat_ = val
        self.pvalue_ = pvalue
        self.null_dist_ = null_dist

        return val, pvalue
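    # A minimal sketch of the one-sided permutation pvalue computed above,
    # using illustrative numbers (not from any real dataset):
    #
    #   >>> null_dist = np.array([0.01, 0.02, 0.05, 0.10])
    #   >>> val = 0.04
    #   >>> (null_dist >= val).mean()
    #   0.5
    #
    # The pvalue is the fraction of CMI estimates on shuffled data that are
    # at least as large as the estimate on the original data.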
    def _compute_cmi(self, df, x_vars, y_vars, z_covariates: Set):
        n_samples, _ = df.shape

        if self.k < 1:
            knn_here = max(1, int(self.k * n_samples))
        else:
            knn_here = max(1, int(self.k))

        # compute the K nearest neighbors in sub-spaces
        k_xz, k_yz, k_z = self._get_knn(df, x_vars, y_vars, z_covariates)

        # compute the final CMI value
        hxyz = scipy.special.digamma(knn_here)
        hxz = scipy.special.digamma(k_xz)
        hyz = scipy.special.digamma(k_yz)
        hz = scipy.special.digamma(k_z)
        val = hxyz - (hxz + hyz - hz).mean()
        return val

    def _preprocess_data(self, data: pd.DataFrame) -> pd.DataFrame:
        n_samples, n_dims = data.shape

        # make a copy of the data to prevent changing it
        data = data.copy()

        # add minor noise to make sure there are no ties
        random_noise = self.random_state.random((n_samples, n_dims))
        data += 1e-5 * random_noise @ data.std(axis=0).to_numpy().reshape(n_dims, 1)

        if self.transform == "standardize":
            # standardize with standard scaling
            data = data.astype(np.float64)
            scaler = StandardScaler()
            data[data.columns] = scaler.fit_transform(data[data.columns])
        elif self.transform == "uniform":
            data = self._trafo2uniform(data)
        elif self.transform == "rank":
            # rank transform each column
            data = data.rank(axis=0)
        return data

    def _get_knn(
        self, data: pd.DataFrame, x_vars: Set[Column], y_vars: Set[Column], z_covariates: Set
    ) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
        """Count the nearest neighbors in the variable subspaces.

        Parameters
        ----------
        data : pd.DataFrame
            The dataset.
        x_vars : Set[Column]
            The X variable column(s).
        y_vars : Set[Column]
            The Y variable column(s).
        z_covariates : Set[Column]
            The Z variable column(s).

        Returns
        -------
        k_xz : ArrayLike of shape (n_samples,)
            Number of neighbors in the subspace of ``x_vars`` and the
            ``z_covariates``.
        k_yz : ArrayLike of shape (n_samples,)
            Number of neighbors in the subspace of ``y_vars`` and the
            ``z_covariates``.
        k_z : ArrayLike of shape (n_samples,)
            Number of neighbors in the subspace of the ``z_covariates``.
        """
        n_samples, _ = data.shape
        if self.k < 1:
            knn = max(1, int(self.k * n_samples))
        else:
            knn = max(1, int(self.k))

        xz_cols = list(x_vars) + list(z_covariates)
        yz_cols = list(y_vars) + list(z_covariates)
        z_columns = list(z_covariates)
        columns = list(set(xz_cols).union(set(yz_cols)))
        data = data[columns]

        # for each sample, the Chebyshev (p=inf) distance to its (knn+1)-th
        # nearest neighbor in the joint space; the first neighbor is the
        # point itself
        tree_xyz = scipy.spatial.KDTree(data.to_numpy())
        epsarray = tree_xyz.query(
            data.to_numpy(), k=[knn + 1], p=np.inf, eps=0.0, workers=self.n_jobs
        )[0][:, 0].astype(np.float64)

        # To search neighbors < eps
        epsarray = np.multiply(epsarray, 0.99999)

        # Find nearest neighbors in subspaces of X and Z
        xz = data[xz_cols].to_numpy()
        tree_xz = scipy.spatial.KDTree(xz)
        k_xz = tree_xz.query_ball_point(
            xz, r=epsarray, eps=0.0, p=np.inf, workers=self.n_jobs, return_length=True
        )

        # Find nearest neighbors in subspaces of Y and Z
        yz = data[yz_cols].to_numpy()
        tree_yz = scipy.spatial.KDTree(yz)
        k_yz = tree_yz.query_ball_point(
            yz, r=epsarray, eps=0.0, p=np.inf, workers=self.n_jobs, return_length=True
        )

        # Find nearest neighbors in subspaces of just the Z covariates
        if len(z_columns) > 0:
            z = data[z_columns].to_numpy()
            tree_z = scipy.spatial.KDTree(z)
            k_z = tree_z.query_ball_point(
                z, r=epsarray, eps=0.0, p=np.inf, workers=self.n_jobs, return_length=True
            )
        else:
            # Number of neighbors is n_samples when estimating just standard MI
            k_z = np.full(n_samples, n_samples, dtype=np.float64)

        return k_xz, k_yz, k_z
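    # The ``query_ball_point(..., return_length=True)`` calls above return,
    # per sample, the number of points within that sample's radius (the
    # point itself included). A tiny illustration with assumed toy data:
    #
    #   >>> pts = np.array([[0.0], [0.5], [2.0]])
    #   >>> tree = scipy.spatial.KDTree(pts)
    #   >>> tree.query_ball_point(pts, r=1.0, p=np.inf, return_length=True)
    #   array([2, 2, 1])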
    def _trafo2uniform(self, df):
        """Transform each column of the input to uniform marginals.

        Parameters
        ----------
        df : pandas.DataFrame
            The input data with (n_samples,) rows and (n_features,) columns.

        Returns
        -------
        df : pandas.DataFrame
            The data with uniform marginals.
        """

        def trafo(xi):
            # empirical CDF transform of a single feature
            xisorted = np.sort(xi)
            yi = np.linspace(1.0 / len(xi), 1, len(xi))
            return np.interp(xi, xisorted, yi)

        # apply a uniform transformation for each feature
        for idx in range(len(df.columns)):
            marginalized_feature = trafo(df.iloc[:, idx].to_numpy().squeeze())
            df.iloc[:, idx] = marginalized_feature
        return df
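# A minimal usage sketch of CMITest, assuming this module's imports resolve.
# The toy data-generating process and variable names below are illustrative
# assumptions, not part of the module above.
if __name__ == "__main__":
    rng = np.random.default_rng(12345)
    n = 500

    # Z causes both X and Y, so X and Y are marginally dependent but
    # conditionally independent given Z.
    z = rng.standard_normal((n, 1))
    x = z + 0.5 * rng.standard_normal((n, 1))
    y = z + 0.5 * rng.standard_normal((n, 1))
    df = pd.DataFrame(np.hstack([x, y, z]), columns=["x", "y", "z"])

    ci_estimator = CMITest(k=0.2, n_shuffle=100, random_seed=12345)
    stat, pvalue = ci_estimator.test(df, {"x"}, {"y"}, {"z"})

    # we would expect a large pvalue here (fail to reject conditional
    # independence), and a small one if z_covariates were omitted
    print(f"CMI estimate: {stat:.3f}, pvalue: {pvalue:.3f}")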