Source code for dowhy.datasets

"""Module for generating some sample datasets.

"""

import math

import numpy as np
import pandas as pd
from numpy.random import choice

def sigmoid(x):
    return 1 / (1 + math.exp(-x))
def stochastically_convert_to_binary(x):
    p = sigmoid(x)
    return choice([0, 1], 1, p=[1 - p, p])
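A quick illustration (not part of the module): sigmoid maps a real-valued score to a probability, and stochastically_convert_to_binary draws a single Bernoulli sample with that probability. The scores below are arbitrary.

# Illustrative only; values are arbitrary.
for score in [-2.0, 0.0, 2.0]:
    print(score, sigmoid(score), stochastically_convert_to_binary(score))
# e.g. sigmoid(2.0) is about 0.88, so the draw is 1 roughly 88% of the time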
def convert_to_categorical(arr, num_vars, num_discrete_vars,
                           quantiles=[0.25, 0.5, 0.75], one_hot_encode=False):
    arr_with_dummy = arr.copy()
    # The loop below assumes that the last indices of the array are always
    # the ones converted to discrete variables
    for arr_index in range(num_vars - num_discrete_vars, num_vars):
        # Bin the continuous column by its quantiles, then optionally one-hot encode it
        arr_bins = np.quantile(arr[:, arr_index], q=quantiles)
        arr_categorical = np.digitize(arr[:, arr_index], bins=arr_bins)
        if one_hot_encode:
            dummy_vecs = np.eye(len(quantiles) + 1)[arr_categorical]
            arr_with_dummy = np.concatenate((arr_with_dummy, dummy_vecs), axis=1)
        else:
            arr_with_dummy = np.concatenate((arr_with_dummy, arr_categorical[:, np.newaxis]), axis=1)
    # Now deleting the old continuous columns
    for arr_index in range(num_vars - 1, num_vars - num_discrete_vars - 1, -1):
        arr_with_dummy = np.delete(arr_with_dummy, arr_index, axis=1)
    return arr_with_dummy
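A minimal usage sketch (not part of the module): discretizing the last column of a 5 x 2 array into quartile bins, with and without one-hot encoding. The array contents are arbitrary.

# Illustrative only.
sample_arr = np.random.normal(0, 1, (5, 2))
out = convert_to_categorical(sample_arr, num_vars=2, num_discrete_vars=1)
print(out.shape)     # (5, 2): first column continuous, second column bin labels 0-3
out_oh = convert_to_categorical(sample_arr, num_vars=2, num_discrete_vars=1, one_hot_encode=True)
print(out_oh.shape)  # (5, 5): one continuous column plus 4 one-hot columns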
def construct_col_names(name, num_vars, num_discrete_vars, num_discrete_levels, one_hot_encode):
    colnames = [(name + str(i)) for i in range(0, num_vars - num_discrete_vars)]
    if one_hot_encode:
        discrete_colnames = [name + str(i) + "_" + str(j)
                             for i in range(num_vars - num_discrete_vars, num_vars)
                             for j in range(0, num_discrete_levels)]
        colnames = colnames + discrete_colnames
    else:
        colnames = colnames + [(name + str(i)) for i in range(num_vars - num_discrete_vars, num_vars)]
    return colnames
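For example (not part of the module), the column names for three common causes of which one is discrete with four levels:

# Illustrative only.
print(construct_col_names("W", num_vars=3, num_discrete_vars=1,
                          num_discrete_levels=4, one_hot_encode=False))
# ['W0', 'W1', 'W2']
print(construct_col_names("W", num_vars=3, num_discrete_vars=1,
                          num_discrete_levels=4, one_hot_encode=True))
# ['W0', 'W1', 'W2_0', 'W2_1', 'W2_2', 'W2_3']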
def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0,
                   num_effect_modifiers=0, num_treatments=1,
                   num_frontdoor_variables=0,
                   treatment_is_binary=True, outcome_is_binary=False,
                   num_discrete_common_causes=0, num_discrete_instruments=0,
                   num_discrete_effect_modifiers=0, one_hot_encode=False):
    W, X, Z, FD, c1, c2, ce, cz, cfd1, cfd2 = [None] * 10
    W_with_dummy, X_with_categorical = (None, None)
    # Making beta an array; convert to float only when it is a scalar,
    # since float(beta) would raise a TypeError for list inputs
    if type(beta) not in [list, np.ndarray]:
        beta = np.repeat(float(beta), num_treatments)
    num_cont_common_causes = num_common_causes - num_discrete_common_causes
    num_cont_instruments = num_instruments - num_discrete_instruments
    num_cont_effect_modifiers = num_effect_modifiers - num_discrete_effect_modifiers
    if num_common_causes > 0:
        range_c1 = max(beta) * 0.5
        range_c2 = max(beta) * 0.5
        means = np.random.uniform(-1, 1, num_common_causes)
        cov_mat = np.diag(np.ones(num_common_causes))
        W = np.random.multivariate_normal(means, cov_mat, num_samples)
        W_with_dummy = convert_to_categorical(W, num_common_causes, num_discrete_common_causes,
                                              quantiles=[0.25, 0.5, 0.75], one_hot_encode=one_hot_encode)
        c1 = np.random.uniform(0, range_c1, (W_with_dummy.shape[1], num_treatments))
        c2 = np.random.uniform(0, range_c2, W_with_dummy.shape[1])
    if num_instruments > 0:
        range_cz = beta
        p = np.random.uniform(0, 1, num_instruments)
        Z = np.zeros((num_samples, num_instruments))
        for i in range(num_instruments):
            if (i % 2) == 0:
                Z[:, i] = np.random.binomial(n=1, p=p[i], size=num_samples)
            else:
                Z[:, i] = np.random.uniform(0, 1, size=num_samples)
        # TODO Ensure that we do not generate weak instruments
        cz = np.random.uniform(range_cz - (range_cz * 0.05), range_cz + (range_cz * 0.05),
                               (num_instruments, num_treatments))
    if num_effect_modifiers > 0:
        range_ce = beta * 0.5
        means = np.random.uniform(-1, 1, num_effect_modifiers)
        cov_mat = np.diag(np.ones(num_effect_modifiers))
        X = np.random.multivariate_normal(means, cov_mat, num_samples)
        X_with_categorical = convert_to_categorical(X, num_effect_modifiers, num_discrete_effect_modifiers,
                                                    quantiles=[0.25, 0.5, 0.75], one_hot_encode=one_hot_encode)
        ce = np.random.uniform(0, range_ce, X_with_categorical.shape[1])
    # TODO - test all our methods with random noise added to covariates
    # (instead of the stochastic treatment assignment)

    t = np.random.normal(0, 1, (num_samples, num_treatments))
    if num_common_causes > 0:
        t += W_with_dummy @ c1  # + np.random.normal(0, 0.01)
    if num_instruments > 0:
        t += Z @ cz
    # Converting treatment to binary if required
    if treatment_is_binary:
        t = np.vectorize(stochastically_convert_to_binary)(t)

    # Generating frontdoor variables if asked for
    if num_frontdoor_variables > 0:
        range_cfd1 = max(beta) * 0.5
        range_cfd2 = max(beta) * 0.5
        cfd1 = np.random.uniform(0, range_cfd1, (num_treatments, num_frontdoor_variables))
        cfd2 = np.random.uniform(0, range_cfd2, num_frontdoor_variables)
        FD_noise = np.random.normal(0, 1, (num_samples, num_frontdoor_variables))
        # Not updated in place, so that FD_noise is left unmodified for the
        # ATE computation below
        FD = FD_noise + t @ cfd1
        if num_common_causes > 0:
            range_c1_frontdoor = range_c1 / 10.0
            c1_frontdoor = np.random.uniform(0, range_c1_frontdoor,
                                             (W_with_dummy.shape[1], num_frontdoor_variables))
            FD = FD + W_with_dummy @ c1_frontdoor

    def _compute_y(t, W, X, FD, beta, c2, ce, cfd2):
        y = np.random.normal(0, 0.01, num_samples)
        if num_frontdoor_variables > 0:
            y += FD @ cfd2
        else:
            y += t @ beta
        if num_common_causes > 0:
            y += W @ c2
        if num_effect_modifiers > 0:
            y += (X @ ce) * np.prod(t, axis=1)
        if outcome_is_binary:
            y = np.vectorize(stochastically_convert_to_binary)(y)
        return y

    y = _compute_y(t, W_with_dummy, X_with_categorical, FD, beta, c2, ce, cfd2)

    data = np.column_stack((t, y))
    if num_common_causes > 0:
        data = np.column_stack((W_with_dummy, data))
    if num_instruments > 0:
        data = np.column_stack((Z, data))
    if num_effect_modifiers > 0:
        data = np.column_stack((X_with_categorical, data))
    if num_frontdoor_variables > 0:
        data = np.column_stack((FD, data))

    # Computing ATE
    FD_T1, FD_T0 = None, None
    T1 = np.ones((num_samples, num_treatments))
    T0 = np.zeros((num_samples, num_treatments))
    if num_frontdoor_variables > 0:
        FD_T1 = FD_noise + (T1 @ cfd1)
        FD_T0 = FD_noise + (T0 @ cfd1)
    ate = np.mean(
        _compute_y(T1, W_with_dummy, X_with_categorical, FD_T1, beta, c2, ce, cfd2) -
        _compute_y(T0, W_with_dummy, X_with_categorical, FD_T0, beta, c2, ce, cfd2))

    treatments = [("v" + str(i)) for i in range(0, num_treatments)]
    outcome = "y"
    # constructing column names for one-hot encoded discrete features
    common_causes = construct_col_names("W", num_common_causes, num_discrete_common_causes,
                                        num_discrete_levels=4, one_hot_encode=one_hot_encode)
    instruments = [("Z" + str(i)) for i in range(0, num_instruments)]
    frontdoor_variables = [("FD" + str(i)) for i in range(0, num_frontdoor_variables)]
    effect_modifiers = construct_col_names("X", num_effect_modifiers, num_discrete_effect_modifiers,
                                           num_discrete_levels=4, one_hot_encode=one_hot_encode)
    other_variables = None
    col_names = frontdoor_variables + effect_modifiers + instruments + common_causes + treatments + [outcome]
    data = pd.DataFrame(data, columns=col_names)

    # Specifying the correct dtypes
    if treatment_is_binary:
        data = data.astype({tname: 'bool' for tname in treatments}, copy=False)
    if outcome_is_binary:
        data = data.astype({outcome: 'bool'}, copy=False)
    if num_discrete_common_causes > 0 and not one_hot_encode:
        data = data.astype({wname: 'int64' for wname in common_causes[num_cont_common_causes:]}, copy=False)
        data = data.astype({wname: 'category' for wname in common_causes[num_cont_common_causes:]}, copy=False)
    if num_discrete_effect_modifiers > 0 and not one_hot_encode:
        data = data.astype({emodname: 'int64' for emodname in effect_modifiers[num_cont_effect_modifiers:]}, copy=False)
        data = data.astype({emodname: 'category' for emodname in effect_modifiers[num_cont_effect_modifiers:]}, copy=False)

    # Now specifying the corresponding graph strings
    dot_graph = create_dot_graph(treatments, outcome, common_causes, instruments,
                                 effect_modifiers, frontdoor_variables)
    # Now writing the gml graph
    gml_graph = create_gml_graph(treatments, outcome, common_causes, instruments,
                                 effect_modifiers, frontdoor_variables)
    ret_dict = {
        "df": data,
        "treatment_name": treatments,
        "outcome_name": outcome,
        "common_causes_names": common_causes,
        "instrument_names": instruments,
        "effect_modifier_names": effect_modifiers,
        "frontdoor_variables_names": frontdoor_variables,
        "dot_graph": dot_graph,
        "gml_graph": gml_graph,
        "ate": ate
    }
    return ret_dict
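A minimal usage sketch (illustrative, not part of the module): generating a small linear dataset and inspecting the returned dictionary. The parameter values are arbitrary.

# Illustrative only.
data_dict = linear_dataset(beta=10, num_common_causes=3, num_samples=5000,
                           num_instruments=1, treatment_is_binary=True)
df = data_dict["df"]
print(df.columns.tolist())          # ['Z0', 'W0', 'W1', 'W2', 'v0', 'y']
print(data_dict["ate"])             # close to beta (here 10), up to outcome noise
print(data_dict["gml_graph"][:60])  # graph string usable for causal model construction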
def simple_iv_dataset(beta, num_samples, num_treatments=1,
                      treatment_is_binary=True, outcome_is_binary=False):
    """Simple instrumental variable dataset with a single IV and a single confounder."""
    W, Z, c1, c2, cz = [None] * 5
    num_instruments = 1
    num_common_causes = 1
    # Making beta an array; convert to float only when it is a scalar,
    # since float(beta) would raise a TypeError for list inputs
    if type(beta) not in [list, np.ndarray]:
        beta = np.repeat(float(beta), num_treatments)
    c1 = np.random.uniform(0, 1, (num_common_causes, num_treatments))
    c2 = np.random.uniform(0, 1, num_common_causes)
    range_cz = beta  # cz is much higher than c1 and c2
    cz = np.random.uniform(range_cz - (range_cz * 0.05), range_cz + (range_cz * 0.05),
                           (num_instruments, num_treatments))
    W = np.random.uniform(0, 1, (num_samples, num_common_causes))
    Z = np.random.normal(0, 1, (num_samples, num_instruments))
    t = np.random.normal(0, 1, (num_samples, num_treatments)) + Z @ cz + W @ c1
    if treatment_is_binary:
        t = np.vectorize(stochastically_convert_to_binary)(t)

    def _compute_y(t, W, beta, c2):
        y = t @ beta + W @ c2
        return y

    y = _compute_y(t, W, beta, c2)
    # creating data frame
    data = np.column_stack((Z, W, t, y))
    treatments = [("v" + str(i)) for i in range(0, num_treatments)]
    outcome = "y"
    common_causes = [("W" + str(i)) for i in range(0, num_common_causes)]
    ate = np.mean(_compute_y(np.ones((num_samples, num_treatments)), W, beta, c2) -
                  _compute_y(np.zeros((num_samples, num_treatments)), W, beta, c2))
    instruments = [("Z" + str(i)) for i in range(0, num_instruments)]
    other_variables = None
    col_names = instruments + common_causes + treatments + [outcome]
    data = pd.DataFrame(data, columns=col_names)
    # Specifying the correct dtypes
    if treatment_is_binary:
        data = data.astype({tname: 'bool' for tname in treatments}, copy=False)
    if outcome_is_binary:
        data = data.astype({outcome: 'bool'}, copy=False)
    # Now specifying the corresponding graph strings
    dot_graph = create_dot_graph(treatments, outcome, common_causes, instruments)
    # Now writing the gml graph
    gml_graph = create_gml_graph(treatments, outcome, common_causes, instruments)
    ret_dict = {
        "df": data,
        "treatment_name": treatments,
        "outcome_name": outcome,
        "common_causes_names": common_causes,
        "instrument_names": instruments,
        "effect_modifier_names": None,
        "dot_graph": dot_graph,
        "gml_graph": gml_graph,
        "ate": ate
    }
    return ret_dict
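A minimal usage sketch (illustrative, not part of the module): since the outcome is exactly linear in the treatment with no noise term in _compute_y, the returned ate equals beta.

# Illustrative only.
iv_dict = simple_iv_dataset(beta=5, num_samples=1000)
print(iv_dict["df"].head())
print(iv_dict["ate"])  # exactly beta, here 5.0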
def create_dot_graph(treatments, outcome, common_causes, instruments,
                     effect_modifiers=[], frontdoor_variables=[]):
    dot_graph = ('digraph {{'
                 ' U[label="Unobserved Confounders"];'
                 ' U->{0};'
                 ).format(outcome)
    for currt in treatments:
        if len(frontdoor_variables) == 0:
            dot_graph += '{0}->{1};'.format(currt, outcome)
        dot_graph += 'U->{0};'.format(currt)
        dot_graph += " ".join([v + "-> " + currt + ";" for v in common_causes])
        dot_graph += " ".join([v + "-> " + currt + ";" for v in instruments])
        dot_graph += " ".join([currt + "-> " + v + ";" for v in frontdoor_variables])
    dot_graph += " ".join([v + "-> " + outcome + ";" for v in common_causes])
    dot_graph += " ".join([v + "-> " + outcome + ";" for v in effect_modifiers])
    dot_graph += " ".join([v + "-> " + outcome + ";" for v in frontdoor_variables])
    # Adding edges between common causes and the frontdoor mediator
    for v1 in common_causes:
        dot_graph += " ".join([v1 + "-> " + v2 + ";" for v2 in frontdoor_variables])
    # Close the digraph only after all edges have been added
    dot_graph = dot_graph + "}"
    return dot_graph
def create_gml_graph(treatments, outcome, common_causes, instruments,
                     effect_modifiers=[], frontdoor_variables=[]):
    gml_graph = ('graph[directed 1'
                 'node[ id "{0}" label "{0}"]'
                 'node[ id "{1}" label "{1}"]'
                 'edge[source "{1}" target "{0}"]'
                 ).format(outcome, "Unobserved Confounders")
    gml_graph += " ".join(['node[ id "{0}" label "{0}"]'.format(v) for v in common_causes])
    gml_graph += " ".join(['node[ id "{0}" label "{0}"]'.format(v) for v in instruments])
    gml_graph += " ".join(['node[ id "{0}" label "{0}"]'.format(v) for v in frontdoor_variables])
    for currt in treatments:
        gml_graph += ('node[ id "{0}" label "{0}"]'
                      'edge[source "{1}" target "{0}"]'
                      ).format(currt, "Unobserved Confounders")
        if len(frontdoor_variables) == 0:
            gml_graph += 'edge[source "{0}" target "{1}"]'.format(currt, outcome)
        gml_graph += " ".join(['edge[ source "{0}" target "{1}"]'.format(v, currt) for v in common_causes])
        gml_graph += " ".join(['edge[ source "{0}" target "{1}"]'.format(v, currt) for v in instruments])
        gml_graph += " ".join(['edge[ source "{0}" target "{1}"]'.format(currt, v) for v in frontdoor_variables])
    gml_graph = gml_graph + " ".join(['edge[ source "{0}" target "{1}"]'.format(v, outcome) for v in common_causes])
    gml_graph = gml_graph + " ".join(['node[ id "{0}" label "{0}"] edge[ source "{0}" target "{1}"]'.format(v, outcome) for v in effect_modifiers])
    gml_graph = gml_graph + " ".join(['edge[ source "{0}" target "{1}"]'.format(v, outcome) for v in frontdoor_variables])
    # Adding edges between common causes and the frontdoor mediator
    for v1 in common_causes:
        gml_graph = gml_graph + " ".join(['edge[ source "{0}" target "{1}"]'.format(v1, v2) for v2 in frontdoor_variables])
    gml_graph = gml_graph + ']'
    return gml_graph
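A quick illustration (not part of the module) of the two graph builders on a one-treatment, one-confounder, one-instrument setting; both strings describe the same graph, in DOT and GML notation respectively.

# Illustrative only.
print(create_dot_graph(["v0"], "y", ["W0"], ["Z0"]))
# a digraph string with edges U->y, U->v0, v0->y, W0->v0, Z0->v0, and W0->y
print(create_gml_graph(["v0"], "y", ["W0"], ["Z0"])[:80])
# the GML equivalent, starting with node and edge declarations for "y" and "U"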
def xy_dataset(num_samples, effect=True, num_common_causes=1, is_linear=True, sd_error=1):
    treatment = 'Treatment'
    outcome = 'Outcome'
    common_causes = ['w' + str(i) for i in range(num_common_causes)]
    time_var = 's'
    # Error terms
    E1 = np.random.normal(loc=0, scale=sd_error, size=num_samples)
    E2 = np.random.normal(loc=0, scale=sd_error, size=num_samples)
    S = np.random.uniform(0, 10, num_samples)
    T1 = 4 - (S - 3) * (S - 3)
    T1[S >= 5] = 0
    T2 = (S - 7) * (S - 7) - 4
    T2[S <= 5] = 0
    W0 = T1 + T2  # hidden confounder
    tterm, yterm = 0, 0
    if num_common_causes > 1:
        means = np.random.uniform(-1, 1, num_common_causes - 1)
        cov_mat = np.diag(np.ones(num_common_causes - 1))
        otherW = np.random.multivariate_normal(means, cov_mat, num_samples)
        c1 = np.random.uniform(0, 1, (otherW.shape[1], 1))
        c2 = np.random.uniform(0, 1, (otherW.shape[1], 1))
        tterm = (otherW @ c1)[:, 0]
        yterm = (otherW @ c2)[:, 0]
    if is_linear:
        V = 6 + W0 + tterm + E1
        Y = 6 + W0 + yterm + E2  # + (V-8)*(V-8)
        if effect:
            Y += V
        else:
            Y += (6 + W0)
    else:
        V = 6 + W0 * W0 + tterm + E1
        Y = 6 + W0 * W0 + yterm + E2  # + (V-8)*(V-8)
        if effect:
            Y += V  # /20  # divide by 20 to scale the value of Y to be comparable to V
        else:
            Y += (6 + W0)
    # else:
    #     V = 6 + W0 + tterm + E1
    #     Y = 12 + W0*W0 + W0*W0 + yterm + E2  # E2_new
    dat = {
        treatment: V,
        outcome: Y,
        common_causes[0]: W0,
        time_var: S
    }
    if num_common_causes > 1:
        for i in range(otherW.shape[1]):
            dat[common_causes[i + 1]] = otherW[:, i]
    data = pd.DataFrame(data=dat)
    ret_dict = {
        "df": data,
        "treatment_name": treatment,
        "outcome_name": outcome,
        "common_causes_names": common_causes,
        "time_val": time_var,
        "instrument_names": None,
        "dot_graph": None,
        "gml_graph": None,
        "ate": None,
    }
    return ret_dict
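A minimal usage sketch (illustrative, not part of the module): xy_dataset produces a treatment/outcome pair confounded by the time-varying hidden variable w0; column names follow the ret_dict constructed above.

# Illustrative only.
xy = xy_dataset(num_samples=500, effect=True, num_common_causes=2, is_linear=False)
print(xy["df"][["Treatment", "Outcome", "w0", "s"]].describe())
print(xy["common_causes_names"])  # ['w0', 'w1']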