from scipy.stats import t, norm
import pandas as pd
import numpy as np
from collections import defaultdict
from math import log
def compute_ci(r=None, nx=None, ny=None, confidence=.95):
    """Compute parametric confidence intervals around a correlation coefficient.

    See: https://online.stat.psu.edu/stat505/lesson/6/6.3

    This is done by applying Fisher's r-to-z transform:
        z = 0.5 * ln((1 + r) / (1 - r)) = arctanh(r)
    The standard error is 1/sqrt(N-3), where N is the sample size.
    The critical value of the normal distribution for the corresponding
    confidence level is norm.ppf((1 - confidence) / 2) for a two-tailed test.
    The lower and upper confidence bounds in z-space are
        z ± critical value * standard error
    and are then transformed back to r-space with tanh.

    :param r : correlation coefficient
    :param nx : length of vector x
    :param ny : length of vector y (currently unused)
    :param confidence : confidence level (0.95 = 95%)
    :returns : array containing the lower and upper confidence bounds
    """
    assert r is not None and nx is not None
    assert isinstance(confidence, float)
    assert 0 < confidence < 1
    z = np.arctanh(r)  # Fisher transform from r to z
    se = 1 / np.sqrt(nx - 3)  # Standard error = 1/sqrt(N-3) where N is sample size
    crit = np.abs(norm.ppf((1 - confidence) / 2))  # Two-tailed z-critical value
    ci_z = np.array([z - crit * se, z + crit * se])  # CI = point estimate ± critical value * error
    ci = np.tanh(ci_z)  # Back-transform to r-space
    return ci

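# A minimal usage sketch for compute_ci; r=0.5 and nx=30 are made-up
# illustration values, not taken from any dataset:
#     >>> compute_ci(r=0.5, nx=30)
#     approximately array([0.170, 0.729])
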
def partial_corr(data=None, x=None, y=None, z=None, method="pearson"):
    """Calculate the partial correlation, i.e. the degree of association
    between x and y after removing the effect of z. This is equivalent to
    computing the correlation coefficient between the residuals of the two
    linear regressions x ~ z and y ~ z.

    See:
        1. https://en.wikipedia.org/wiki/Partial_correlation
        2. https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-842X.2004.00360.x?casa_token=p_D3joHC8C0AAAAA:qigIZHVfcVi8vsz1j2t7uQYOorrYaF3Tm4lpQOUzqG_J9gJgtFerOyliKBnQPVG187nJxbA-wcbXU3QcOw
        3. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4681537/
        4. http://parker.ad.siu.edu/Olive/slch6.pdf

    :param data : pandas DataFrame
    :param x : column name in data
    :param y : column name in data
    :param z : column name (string) or list of column names (covariates)
    :param method : correlation type, "pearson" or "spearman"
    :returns : dictionary with keys
        n: sample size
        r: partial correlation coefficient
        CI95%: 95% parametric confidence interval
        p-val: p-value
    """
    assert data.shape[0] > 2  # Need at least 3 samples
    if isinstance(z, str):
        z = [z]  # Allow a single covariate to be passed as a string
    assert x != y  # x and y must be distinct
    assert x not in z  # x and z must be distinct
    assert y not in z  # y and z must be distinct
    combined_variables = [x, y] + list(z)  # Combine all variables: x, y and z
    data = data[combined_variables].dropna()  # Drop missing values
    n = data.shape[0]  # Number of samples after dropping missing values
    k = data.shape[1] - 2  # Number of covariates
    assert n > 2
    if method == "spearman":
        V = data.rank(na_option='keep').cov()  # Rank-transform the data for Spearman correlation
    else:
        V = data.astype(float).cov()  # Covariance matrix
    Vi = np.linalg.pinv(V, hermitian=True)  # Inverse covariance (precision) matrix
    Vi_diag = Vi.diagonal()  # Diagonal of the precision matrix
    D = np.diag(np.sqrt(1 / Vi_diag))  # Scaling matrix from the precision diagonal
    pcor = -1 * (D @ Vi @ D)  # Partial correlation matrix
    r = pcor[0, 1]
    if np.isnan(r):
        return {'n': n, 'r': np.nan, 'CI95%': np.nan, 'p-val': np.nan}
    # Finding the p-value with a Student t-test
    dof = n - k - 2  # Degrees of freedom for multivariate analysis
    tval = r * np.sqrt(dof / (1 - r**2))  # Test statistic
    pval = 2 * t.sf(np.abs(tval), dof)  # Two-tailed p-value for the test statistic and degrees of freedom
    ci = compute_ci(r=r, nx=(n - k), ny=(n - k))  # Confidence interval
    ci = np.round(ci, 3)
    stats = {
        'n': n,
        'r': r,
        'CI95%': [ci],
        'p-val': pval.round(5),
    }
    return stats

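# A minimal usage sketch for partial_corr with an assumed synthetic
# DataFrame; the column names 'x', 'y', 'z' and the random data are
# illustrative only:
#     >>> import numpy as np, pandas as pd
#     >>> rng = np.random.default_rng(0)
#     >>> df = pd.DataFrame(rng.normal(size=(100, 3)), columns=['x', 'y', 'z'])
#     >>> partial_corr(data=df, x='x', y='y', z='z', method='pearson')
#     {'n': 100, 'r': ..., 'CI95%': [array([..., ...])], 'p-val': ...}
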
def entropy(x):
    """
    Returns the entropy of a random variable x:
        H(x) = - Σ p(x) * log2(p(x))

    :param x : iterable of observed events
    :returns : entropy of the random variable
    """
    d = defaultdict(int)
    s = 0.0
    entr = 0.0
    for i in x:
        d[i] += 1  # Frequency of each event
        s += 1
    for i in d:
        p = d[i] / s  # Empirical probability of the event
        entr -= p * log(p, 2)  # H(x) = - Σ p(x) log2(p(x))
    return entr

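# A quick sanity check (illustrative values): a fair coin carries exactly
# one bit of entropy, since both outcomes occur with probability 0.5.
#     >>> entropy(['H', 'T', 'H', 'T'])
#     1.0
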
def conditional_MI(data=None, x=None, y=None, z=None):
    """
    Returns the conditional mutual information between X and Y given Z:
        I(X; Y | Z) = H(X|Z) - H(X|Y,Z)
                    = H(X,Z) - H(Z) - H(X,Y,Z) + H(Y,Z)
                    = H(X,Z) + H(Y,Z) - H(X,Y,Z) - H(Z)

    :param data : pandas DataFrame
    :param x, y, z : collections of column names from data
    :returns : conditional mutual information between X and Y given Z
    """
    # Represent each variable as a list of row tuples so that joint events
    # can be counted row-wise by entropy()
    X = list(data[list(x)].astype(int).itertuples(index=False, name=None))
    Y = list(data[list(y)].astype(int).itertuples(index=False, name=None))
    Z = list(data[list(z)].astype(int).itertuples(index=False, name=None))
    Hxz = entropy('%s/%s' % pair for pair in zip(X, Z))  # Joint entropy of X and Z
    Hyz = entropy('%s/%s' % pair for pair in zip(Y, Z))  # Joint entropy of Y and Z
    Hz = entropy(Z)  # Entropy of Z
    Hxyz = entropy('%s/%s/%s' % triple for triple in zip(X, Y, Z))  # Joint entropy of X, Y and Z
    return Hxz + Hyz - Hxyz - Hz
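
# A minimal usage sketch for conditional_MI with an assumed binary
# DataFrame; the column names 'a', 'b', 'c' and the values are illustrative
# only. Here 'a' duplicates 'c', so knowing 'c' leaves 'a' with no extra
# information about 'b':
#     >>> import pandas as pd
#     >>> df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1], 'c': [0, 0, 1, 1]})
#     >>> conditional_MI(data=df, x=['a'], y=['b'], z=['c'])
#     0.0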