Source code for dowhy.datasets

import math

import numpy as np
import pandas as pd
from numpy.random import choice


[docs]def sigmoid(x): return 1 / (1 + math.exp(-x))
[docs]def stochastically_convert_to_binary(x): p = sigmoid(x) return choice([0, 1], 1, p=[1-p, p])
[docs]def linear_dataset(beta, num_common_causes, num_samples, num_instruments=0, treatment_is_binary=True): beta = float(beta) if num_common_causes > 0: range_c1 = beta*0.5 range_c2 = beta*0.5 means = np.random.uniform(-1, 1, num_common_causes) cov_mat = np.diag(np.ones(num_common_causes)) X = np.random.multivariate_normal(means, cov_mat, num_samples) c1 = np.random.uniform(0, range_c1, num_common_causes) c2 = np.random.uniform(0, range_c2, num_common_causes) if num_instruments > 0: range_cz = beta*0.5 p = np.random.uniform(0, 1, num_instruments) Z = np.zeros((num_samples, num_instruments)) for i in range(num_instruments): if (i % 2) == 0: Z[:, i] = np.random.binomial(n=1, p=p[i], size=num_samples) else: Z[:, i] = np.random.uniform(0, 1, size=num_samples) cz = np.random.uniform(0, range_cz, num_instruments) # TODO - test all our methods with random noise added to covariates (instead of the stochastic treatment assignment) t = np.random.normal(0, 1) if num_common_causes > 0: t += X @ c1 # + np.random.normal(0, 0.01) if num_instruments > 0: t += Z @ cz if treatment_is_binary: t = np.vectorize(stochastically_convert_to_binary)(t) y = beta*t # + np.random.normal(0,0.01) if num_common_causes>0: y += X @ c2 data = np.column_stack((t, y)) if num_common_causes > 0: data = np.column_stack((X, data)) if num_instruments > 0: data = np.column_stack((Z, data)) treatment = "v" outcome = "y" common_causes = [("X" + str(i)) for i in range(0, num_common_causes)] ate = beta instruments = [("Z" + str(i)) for i in range(0, num_instruments)] other_variables = None col_names = instruments + common_causes + [treatment, outcome] data = pd.DataFrame(data, columns=col_names) dot_graph = ('digraph {{ {0} ->{1};' ' U[label="Unobserved Confounders"];' ' U->{0}; U->{1};' ).format(treatment, outcome) dot_graph = dot_graph + " ".join([v + "-> " + treatment + ";" for v in common_causes]) dot_graph = dot_graph + " ".join([v + "-> " + outcome + ";" for v in common_causes]) dot_graph = dot_graph + " ".join([v + "-> " + treatment + ";" for v in instruments]) dot_graph = dot_graph + "}" gml_graph = ('graph[directed 1' 'node[ id "{0}" label "{0}"]' 'node[ id "{1}" label "{1}"]' 'node[ id "{2}" label "{2}"]' 'edge[source "{0}" target "{1}"]' 'edge[source "{2}" target "{0}"]' 'edge[source "{2}" target "{1}"]' ).format(treatment, outcome, "Unobserved Confounders") gml_graph = gml_graph + " ".join(['node[ id "{0}" label "{0}"] edge[ source "{0}" target "{1}"]'.format(v, treatment) for v in common_causes]) gml_graph = gml_graph + " ".join(['edge[ source "{0}" target "{1}"]'.format(v, outcome) for v in common_causes]) gml_graph = gml_graph + " ".join(['node[ id "{0}" label "{0}"] edge[ source "{0}" target "{1}"]'.format(v, treatment) for v in instruments]) gml_graph = gml_graph + ']' ret_dict = { "df": data, "treatment_name": treatment, "outcome_name": outcome, "common_causes_names": common_causes, "instrument_names": instruments, "dot_graph": dot_graph, "gml_graph": gml_graph, "ate": ate } return ret_dict
[docs]def xy_dataset(num_samples, effect=True, sd_error=1): treatment = 'Treatment' outcome = 'Outcome' common_causes = ['w0'] time_var = 's' E1 = np.random.normal(loc=0, scale=sd_error, size=num_samples) E2 = np.random.normal(loc=0, scale=sd_error, size=num_samples) S = np.random.uniform(0, 10, num_samples) T1 = 4 - (S - 3) * (S - 3) T1[S >= 5] = 0 T2 = (S - 7) * (S - 7) - 4 T2[S <= 5] = 0 W = T1 + T2 # hidden confounder if effect: U = None V = 6 + W + E1 Y = 6 + V + W + E2 # + (V-8)*(V-8) else: U = W # np.random.normal(0, 1, num_samples) V = 6 + W + E1 Y = 12 + W + W + E2 # E2_new dat = { treatment: V, outcome: Y, common_causes[0]: W, time_var: S } data = pd.DataFrame(data=dat) ret_dict = { "df": data, "treatment_name": treatment, "outcome_name": outcome, "common_causes_names": common_causes, "time_val": time_var, "instrument_names": None, "dot_graph": None, "gml_graph": None, "ate": None, } return ret_dict