# Source code for dodiscover.ci.categorical_test

```# Copyright (c) 2013-2021 pgmpy
# Modified from pgmpy.

import logging
from functools import reduce
from typing import Optional, Set, Tuple

import numpy as np
import pandas as pd
from numpy.typing import ArrayLike
from scipy import stats

from dodiscover.ci.base import BaseConditionalIndependenceTest
from dodiscover.typing import Column

def power_divergence(
X: ArrayLike, Y: ArrayLike, Z: ArrayLike, data: pd.DataFrame, lambda_: str = "cressie-read"
) -> Tuple[float, float, int]:
"""
Computes the Cressie-Read power divergence statistic [1]. The null hypothesis
for the test is X is independent of Y given Z. A lot of the frequency comparison
based statistics (eg. chi-square, G-test etc) belong to power divergence family,
and are special cases of this test.

Parameters
----------
X: int, string, hashable object
A variable name contained in the data set

Y: int, string, hashable object
A variable name contained in the data set, different from X

Z: list, array-like
A list of variable names contained in the data set, different from X and Y.
This is the separating set that (potentially) makes X and Y independent.
Default: []

data: pandas.DataFrame
The dataset on which to test the independence condition.

lambda_: float or string
The lambda parameter for the power_divergence statistic. Some values of
lambda_ results in other well known tests:
"pearson"             1          "Chi-squared test"
"log-likelihood"      0          "G-test or log-likelihood"
"freeman-tukey"     -1/2        "freeman-tukey Statistic"
"mod-log-likelihood"  -1         "Modified Log-likelihood"
"neyman"              -2         "Neyman's statistic"
"cressie-read"        2/3        "The value recommended in the paper

Returns
-------
CI Test Results: tuple
Returns a tuple (chi, p_value, dof). `chi` is the
chi-squared test statistic. The `p_value` for the test, i.e. the
probability of observing the computed chi-square statistic (or an even
higher value), given the null hypothesis that X \u27C2 Y | Zs is True.
If boolean = True, returns True if the p_value of the test is greater
than `significance_level` else returns False.

--------
scipy.stats.power_divergence

References
----------
.. footbibliography::

Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> data = pd.DataFrame(np.random.randint(0, 2, size=(50000, 4)), columns=list('ABCD'))
>>> data['E'] = data['A'] + data['B'] + data['C']
>>> chi_square(X='A', Y='C', Z=[], data=data, boolean=True, significance_level=0.05)
True
>>> chi_square(X='A', Y='B', Z=['D'], data=data, boolean=True, significance_level=0.05)
True
>>> chi_square(X='A', Y='B', Z=['D', 'E'], data=data, boolean=True, significance_level=0.05)
False
"""

# Step 1: Check if the arguments are valid and type conversions.
if isinstance(Z, str):
Z = [Z]
if (X in Z) or (Y in Z):
raise ValueError(f"The variables X or Y can't be in Z. Found {X if X in Z else Y} in Z.")

# Step 2: Do a simple contingency test if there are no conditional variables.
if len(Z) == 0:
chi, p_value, dof, expected = stats.chi2_contingency(
data.groupby([X, Y]).size().unstack(Y, fill_value=0), lambda_=lambda_
)

# Step 3: If there are conditionals variables, iterate over unique states and do
#         the contingency test.
else:
chi = 0
dof = 0
for z_state, df in data.groupby(Z):
try:
# Note: The fill value is set to 1e-7 to avoid the following error:
# where there are not enough samples in the data, which results in a nan pvalue
sub_table_z = df.groupby([X, Y]).size().unstack(Y, fill_value=1e-7)
c, _, d, _ = stats.chi2_contingency(sub_table_z, lambda_=lambda_)
chi += c
dof += d
except ValueError:
# If one of the values is 0 in the 2x2 table.
if isinstance(z_state, str):
logging.info(
f"Skipping the test {X} \u27C2 {Y} | {Z[0]}={z_state}. Not enough samples"
)
else:
z_str = ", ".join([f"{var}={state}" for var, state in zip(Z, z_state)])
logging.info(f"Skipping the test {X} \u27C2 {Y} | {z_str}. Not enough samples")

if np.isnan(c):
raise RuntimeError(
f"The resulting chi square test statistic is NaN, which occurs "
f"when there are not enough samples in your data "
f"{df.shape}, {sub_table_z}."
)

p_value = 1 - stats.chi2.cdf(chi, df=dof)

# Step 4: Return the values
return chi, p_value, dof

[docs]class CategoricalCITest(BaseConditionalIndependenceTest):
"""CI test for categorical data.

Uses the power-divergence class of test statistics to test categorical data
for (conditional) independences.

Parameters
----------
lambda_ : str, optional
The lambda parameter for the power_divergence statistic, by default 'cressie-read'.
Some values of lambda_ results in other well known tests:
"pearson"             1          "Chi-squared test"
"log-likelihood"      0          "G-test or log-likelihood"
"freeman-tukey"     -1/2        "freeman-tukey Statistic"
"mod-log-likelihood"  -1         "Modified Log-likelihood"
"neyman"              -2         "Neyman's statistic"
"cressie-read"        2/3        "The value recommended in the paper

--------
scipy.stats.power_divergence

References
----------
.. footbibliography::
"""
self.lambda_ = lambda_

[docs]    def test(
self,
df: pd.DataFrame,
x_vars: Set[Column],
y_vars: Set[Column],
z_covariates: Optional[Set[Column]] = None,
) -> Tuple[float, float]:
x_vars = reduce(lambda x: x, x_vars)  # type: ignore
y_vars = reduce(lambda x: x, y_vars)  # type: ignore
stat, pvalue, dof = power_divergence(
x_vars, y_vars, z_covariates, data=df, lambda_=self.lambda_
)
self.dof_ = dof
return stat, pvalue

class CausalLearnCITest(BaseConditionalIndependenceTest):
def __init__(self, method_name="gsq") -> None:
self.method_name = method_name

def test(
self,
df: pd.DataFrame,
x_vars: Set[Column],
y_vars: Set[Column],
z_covariates: Optional[Set[Column]] = None,
) -> Tuple[float, float]:
import numpy as np
from causallearn.utils.cit import Chisq_or_Gsq

data = df.to_numpy()
x_vars = reduce(lambda x: x, x_vars)  # type: ignore
y_vars = reduce(lambda x: x, y_vars)  # type: ignore
x = np.argwhere(df.columns == x_vars)
y = np.argwhere(df.columns == y_vars)
z = []
if z_covariates is not None:
for _z in z_covariates:
z.append(np.argwhere(df.columns == _z).squeeze())
z = np.array(z)

tester = Chisq_or_Gsq(data, method_name=self.method_name)
return np.nan, tester(x, y, z)
```