Source code for dowhy.utils.cit

from collections import defaultdict
from math import exp, log

import numpy as np
import pandas as pd
from scipy.stats import norm, t


def compute_ci(r=None, nx=None, ny=None, confidence=0.95):
    """Compute parametric confidence intervals around a correlation coefficient.

    See : https://online.stat.psu.edu/stat505/lesson/6/6.3

    This is done by applying Fisher's r to z transform
    z = .5[ln((1+r)/(1-r))] = arctanh(r)

    The standard error is 1/sqrt(N-3) where N is the sample size.

    The critical value of the normal distribution for the corresponding confidence
    level is calculated from stats.norm.ppf((1 - alpha)/2) for a two-tailed test.

    The lower and upper confidence intervals in z space are calculated with the formula
    z ± critical value*error

    The confidence interval is then converted back to r space.

    :param r : correlation coefficient
    :param nx : length of vector x
    :param ny : length of vector y
    :param confidence : confidence level (0.95 = 95%)

    :returns : array containing the confidence interval
    """
    assert r is not None and nx is not None
    assert isinstance(confidence, float)
    assert 0 < confidence < 1

    z = np.arctanh(r)  # Fisher transform from r to z
    se = 1 / np.sqrt(nx - 3)  # Standard error = 1/sqrt(N-3) where N is sample size
    crit = np.abs(norm.ppf((1 - confidence) / 2))  # Z-critical value
    ci_z = np.array([z - crit * se, z + crit * se])  # CI = point estimate ± critical value*error
    ci = np.tanh(ci_z)  # Back-transform to r-space
    return ci
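
# Illustrative sketch (not part of the library; the helper name is hypothetical):
# a 95% CI around a sample correlation of 0.5 computed from 100 observations.
# Because of the Fisher transform, the interval is asymmetric around r in r-space.
def _demo_compute_ci():
    ci = compute_ci(r=0.5, nx=100, ny=100)
    print(ci)  # approximately [0.337, 0.634]
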
def partial_corr(data=None, x=None, y=None, z=None, method="pearson"):
    r"""Calculate the partial correlation, which is the degree of association between
    x and y after removing the effect of z. This is done by computing the correlation
    coefficient between the residuals of two linear regressions: x \sim z and y \sim z.

    See :
    1 https://en.wikipedia.org/wiki/Partial_correlation
    2 https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-842X.2004.00360.x?casa_token=p_D3joHC8C0AAAAA:qigIZHVfcVi8vsz1j2t7uQYOorrYaF3Tm4lpQOUzqG_J9gJgtFerOyliKBnQPVG187nJxbA-wcbXU3QcOw
    3 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4681537/
    4 http://parker.ad.siu.edu/Olive/slch6.pdf

    :param data : pandas dataframe
    :param x : column name in data
    :param y : column name in data
    :param z : column name (string) or list of column names in data
    :param method : string denoting the correlation type - "pearson" or "spearman"

    :returns : a python dictionary with keys
        n: sample size
        r: partial correlation coefficient
        CI95%: 95% parametric confidence interval
        p-val: p-value
    """
    assert data.shape[0] > 2  # Check for at least 3 samples
    assert x != z  # x and z should be distinct
    assert y != z  # y and z should be distinct
    assert x != y  # x and y should be distinct
    if isinstance(z, str):
        z = [z]  # Normalize a single conditioning variable to a one-element list
    assert x not in z  # x and z should be distinct
    assert y not in z  # y and z should be distinct

    combined_variables = [x, y]  # Combine all variables - x, y and z
    for var in z:
        combined_variables.append(var)
    data = data[combined_variables].dropna()  # Drop missing values
    n = data.shape[0]  # Number of samples after dropping missing values
    k = data.shape[1] - 2  # Number of covariates
    assert n > 2

    if method == "spearman":
        V = data.rank(na_option="keep").cov()  # Convert data to ranks for Spearman correlation
    else:
        V = data.astype(float).cov()  # Computing the covariance matrix
    Vi = np.linalg.pinv(V, hermitian=True)  # Computing the inverse covariance (precision) matrix
    Vi_diag = Vi.diagonal()  # Storing variances
    D = np.diag(np.sqrt(1 / Vi_diag))  # Standard deviations from the diagonal of the inverse covariance matrix
    pcor = -1 * (D @ Vi @ D)  # Partial correlation matrix
    r = pcor[0, 1]
    if np.isnan(r):
        return {"n": n, "r": np.nan, "CI95%": np.nan, "p-val": np.nan}

    # Finding the p-value using a Student's t-test
    dof = n - k - 2  # Degrees of freedom for multivariate analysis
    tval = r * np.sqrt(dof / (1 - r**2))  # Test statistic
    pval = 2 * t.sf(np.abs(tval), dof)  # p-value corresponding to the test statistic and degrees of freedom
    ci = compute_ci(r=r, nx=(n - k), ny=(n - k))  # Finding the confidence interval
    ci = np.round(ci, 3)
    stats = {
        "n": n,
        "r": r,
        "CI95%": [ci],
        "p-val": pval.round(5),
    }
    return stats
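
# Illustrative sketch (not part of the library; the helper name and the synthetic
# data are assumptions for the example): x and y below are associated only through
# the common cause z, so the partial correlation given z should be near zero even
# though the raw correlation between x and y is not.
def _demo_partial_corr():
    rng = np.random.default_rng(0)
    z = rng.normal(size=500)
    df = pd.DataFrame(
        {"x": z + rng.normal(size=500), "y": z + rng.normal(size=500), "z": z}
    )
    result = partial_corr(data=df, x="x", y="y", z=["z"])
    print(result["r"], result["p-val"])  # r close to 0; a large p-value is typical
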
def entropy(x):
    """Returns the entropy of a random variable x

    H(x) = - Σ p(x)log(p(x))

    :param x : random variable (iterable of outcomes) to calculate entropy for

    :returns : entropy of the random variable, in bits
    """
    d = defaultdict(lambda: 0)
    s = 0.0
    entr = 0.0
    for i in x:
        d[i] += 1  # Counting the frequency of each event
        s += 1
    for i in d:
        p = d[i] / s  # Calculating the probability of an event
        entr -= p * log(p, 2)  # H(x) = - Σ p(x)log(p(x))
    return entr
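
# Illustrative sketch (not part of the library; the helper name is hypothetical):
# a uniform two-outcome variable carries one bit of entropy, a constant carries none.
def _demo_entropy():
    print(entropy(["H", "T", "H", "T"]))  # 1.0
    print(entropy(["H", "H", "H", "H"]))  # 0.0
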
def conditional_MI(data=None, x=None, y=None, z=None):
    """Method to return the conditional mutual information between X and Y given Z

    I(X, Y | Z) = H(X|Z) - H(X|Y,Z)
                = H(X,Z) - H(Z) - H(X,Y,Z) + H(Y,Z)
                = H(X,Z) + H(Y,Z) - H(X,Y,Z) - H(Z)

    :param data : dataset
    :param x,y,z : lists of column names from the dataset

    :returns : conditional mutual information between X and Y given Z
    """
    # Represent each variable as a list of row tuples so that the zips below
    # pair up observations rather than DataFrame column names
    X = list(data[list(x)].astype(int).itertuples(index=False, name=None))
    Y = list(data[list(y)].astype(int).itertuples(index=False, name=None))
    Z = list(data[list(z)].astype(int).itertuples(index=False, name=None))
    Hxz = entropy(map(lambda pair: "%s/%s" % pair, zip(X, Z)))  # Joint entropy of X and Z
    Hyz = entropy(map(lambda pair: "%s/%s" % pair, zip(Y, Z)))  # Joint entropy of Y and Z
    Hz = entropy(Z)  # Entropy of Z
    Hxyz = entropy(map(lambda triple: "%s/%s/%s" % triple, zip(X, Y, Z)))  # Joint entropy of X, Y and Z
    return Hxz + Hyz - Hxyz - Hz
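
# Illustrative sketch (not part of the library; the helper name and synthetic data
# are assumptions for the example): here b is an exact copy of c, so H(B,C) = H(C)
# and H(A,B,C) = H(A,C), making I(A, B | C) zero.
def _demo_conditional_MI():
    rng = np.random.default_rng(0)
    z = rng.integers(0, 2, size=1000)
    df = pd.DataFrame({"a": rng.integers(0, 2, size=1000), "b": z, "c": z})
    print(conditional_MI(data=df, x=["a"], y=["b"], z=["c"]))  # 0.0 bits
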