from typing import Optional, Set, Tuple
import numpy as np
import pandas as pd
from numpy.typing import ArrayLike
from dodiscover.ci.kernel_utils import corrent_matrix, von_neumann_divergence
from dodiscover.typing import Column
from .base import BaseConditionalDiscrepancyTest


class BregmanCDTest(BaseConditionalDiscrepancyTest):
"""Bregman divergence conditional discrepancy test.
Tests the equality of conditional distributions using a kernel approach
to estimate Bregman divergences outlined in :footcite:`Yu2020Bregman`.
Parameters
----------
metric : str, optional
The kernel metric, by default 'rbf'.
distance_metric : str, optional
The distance metric, by default 'euclidean'.
kwidth : float, optional
The width of the kernel, by default None, which we will then estimate
using the default procedure in :func:`dodiscover.ci.kernel_utils.compute_kernel`.
null_reps : int, optional
Number of times to sample null distribution, by default 1000.
n_jobs : int, optional
Number of CPUs to use, by default None.
propensity_model : callable, optional
The propensity model to estimate propensity scores among the groups. If `None`
(default) will use :class:`sklearn.linear_model.LogisticRegression`. The
``propensity_model`` passed in must implement a ``predict_proba`` method in
order to be used. See https://scikit-learn.org/stable/glossary.html#term-predict_proba
for more information.
    propensity_est : array-like of shape (n_samples, n_groups), optional
The propensity estimates for each group. Must match the cardinality of the
``group_col`` in the data passed to ``test`` function. If `None` (default),
will build a propensity model using the argument in ``propensity_model``.
random_state : int, optional
Random seed, by default None.

    Notes
    -----
    Currently, only testing between two groups is supported. Therefore ``df[group_col]``
    must contain only binary indicators and ``propensity_est`` must contain exactly two
    columns.

    References
    ----------
    .. footbibliography::
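
    Examples
    --------
    A minimal usage sketch; the column names and synthetic data below are purely
    illustrative and not taken from the package:

    >>> import numpy as np
    >>> import pandas as pd
    >>> rng = np.random.default_rng(seed=0)
    >>> n = 200
    >>> df = pd.DataFrame(
    ...     {
    ...         "group": rng.integers(0, 2, size=n),
    ...         "x": rng.normal(size=n),
    ...         "y": rng.normal(size=n),
    ...     }
    ... )
    >>> test = BregmanCDTest(null_reps=100, random_state=0)
    >>> stat, pvalue = test.test(df, group_col={"group"}, y_vars={"y"}, x_vars={"x"})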
"""

    def __init__(
self,
metric: str = "rbf",
distance_metric: str = "euclidean",
kwidth: Optional[float] = None,
null_reps: int = 1000,
n_jobs: Optional[int] = None,
propensity_model=None,
propensity_est=None,
random_state: Optional[int] = None,
) -> None:
self.metric = metric
self.distance_metric = distance_metric
self.kwidth = kwidth
self.null_reps = null_reps
self.n_jobs = n_jobs
self.random_state = random_state
self.propensity_model = propensity_model
self.propensity_est = propensity_est

    def test(
self,
df: pd.DataFrame,
group_col: Set[Column],
y_vars: Set[Column],
x_vars: Set[Column],
) -> Tuple[float, float]:
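        """Test whether the conditional distribution of Y given X differs across groups.

        Parameters
        ----------
        df : pd.DataFrame
            Dataset containing the group indicator, X and Y columns.
        group_col : Set[Column]
            A set containing the single column name of the binary group indicator.
        y_vars : Set[Column]
            Column names of the Y variables.
        x_vars : Set[Column]
            Column names of the X (conditioning) variables.

        Returns
        -------
        stat : float
            The symmetrized conditional divergence statistic.
        pvalue : float
            The p-value computed against the sampled null distribution.
        """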
# check test input
self._check_test_input(df, y_vars, group_col, x_vars)
group_col_var: Column = list(group_col)[0]
x_cols = list(x_vars)
y_cols = list(y_vars)
group_ind = df[group_col_var].to_numpy()
        if set(np.unique(group_ind)) != {0, 1}:
            raise RuntimeError(
                f"Group indicators in the {group_col_var} column must be binary (0 or 1)."
            )
# get the X and Y dataset
X = df[x_cols].to_numpy()
Y = df[y_cols].to_numpy()
# We are interested in testing: P_1(y|x) = P_2(y|x)
# compute the conditional divergence, which is symmetric by construction
# 1/2 * (D(p_1(y|x) || p_2(y|x)) + D(p_2(y|x) || p_1(y|x)))
conditional_div = self._statistic(X, Y, group_ind)
# compute propensity scores
e_hat = self._compute_propensity_scores(group_ind, K=X)
        # estimate the null distribution by re-sampling group labels according to
        # the propensity scores and re-computing the statistic
null_dist = self.compute_null(
e_hat, X, Y, null_reps=self.null_reps, random_state=self.random_state
)
self.null_dist_ = null_dist
        # compute the p-value with the standard +1 correction, so that a finite
        # number of null samples never yields a p-value of exactly zero
        pvalue = (1.0 + np.sum(null_dist >= conditional_div)) / (1 + self.null_reps)
return conditional_div, pvalue

    def _statistic(self, X: ArrayLike, Y: ArrayLike, group_ind: ArrayLike) -> float:
first_group = group_ind == 0
second_group = group_ind == 1
X1 = X[first_group, :]
X2 = X[second_group, :]
Y1 = Y[first_group, :]
Y2 = Y[second_group, :]
        # first compute the centered correntropy matrices of the joint (X, Y)
        # samples within each group, C_{xy}^1 and C_{xy}^2
        Cx1y1 = corrent_matrix(
            np.hstack((X1, Y1)),
            metric=self.metric,
            distance_metric=self.distance_metric,
            kwidth=self.kwidth,
            n_jobs=self.n_jobs,
        )
        Cx2y2 = corrent_matrix(
            np.hstack((X2, Y2)),
            metric=self.metric,
            distance_metric=self.distance_metric,
            kwidth=self.kwidth,
            n_jobs=self.n_jobs,
        )
        # then compute the centered correntropy matrices of the marginal X
        # samples, C_x^1 and C_x^2
Cx1 = corrent_matrix(
X1,
metric=self.metric,
distance_metric=self.distance_metric,
kwidth=self.kwidth,
n_jobs=self.n_jobs,
)
Cx2 = corrent_matrix(
X2,
metric=self.metric,
distance_metric=self.distance_metric,
kwidth=self.kwidth,
n_jobs=self.n_jobs,
)
        # compute the Von Neumann divergences between the joint matrices,
        # D(C_{xy}^1 || C_{xy}^2) and D(C_{xy}^2 || C_{xy}^1), where
        # D(A || B) = tr(A log A - A log B - A + B)
        joint_div1 = von_neumann_divergence(Cx1y1, Cx2y2)
        joint_div2 = von_neumann_divergence(Cx2y2, Cx1y1)
        # and likewise between the marginal matrices over X
        x_div1 = von_neumann_divergence(Cx1, Cx2)
        x_div2 = von_neumann_divergence(Cx2, Cx1)
        # each conditional divergence is estimated as the joint divergence minus
        # the marginal divergence over X, i.e.
        # D(p_1(y|x) || p_2(y|x)) = D(C_{xy}^1 || C_{xy}^2) - D(C_x^1 || C_x^2),
        # and the statistic is symmetric by construction:
        # 1/2 * (D(p_1(y|x) || p_2(y|x)) + D(p_2(y|x) || p_1(y|x)))
        conditional_div = 1.0 / 2 * (joint_div1 - x_div1 + joint_div2 - x_div2)
return conditional_div