Source code for dowhy.causal_refuters.bootstrap_refuter

from dowhy.causal_refuter import CausalRefuter, CausalRefutation
from dowhy.causal_estimator import CausalEstimator
import numpy as np
import random
from sklearn.utils import resample
import logging

[docs]class BootstrapRefuter(CausalRefuter): """ Refute an estimate by running it on a random sample of the data containing measurement error in the confounders. This allows us to find the ability of the estimator to find the effect of the treatment on the outcome. It supports additional parameters that can be specified in the refute_estimate() method. Parameters ----------- -'num_simulations': int, CausalRefuter.DEFAULT_NUM_SIMULATIONS by default The number of simulations to be run - 'sample_size': int, Size of the original data by default The size of each bootstrap sample - 'required_variables': int, list, bool, True by default A user can input either an integer value, list or bool. 1. An integer argument refers to how many variables will be modified 2. A list allows the user to explicitly refer to which variables should be selected to be made noisy Furthermore, a user can either choose to select the variables desired. Or they can deselect the variables, that they do not want in their analysis. For example: We need to pass required_variables = [W0,W1] is we want W0 and W1. We need to pass required_variables = [-W0,-W1] if we want all variables excluding W0 and W1. 3. If the user passes True, noise is added to confounders, instrumental variables and effect modifiers If the value is False, we just Bootstrap the existing dataset - 'noise': float, BootstrapRefuter.DEFAULT_STD_DEV by default The standard deviation of the noise to be added to the data - 'probability_of_change': float, 'noise' by default if the value is less than 1 It specifies the probability with which we change the data for a boolean or categorical variable - 'random_state': int, RandomState, None by default The seed value to be added if we wish to repeat the same random behavior. For this purpose, we repeat the same seed in the psuedo-random generator. """ DEFAULT_STD_DEV = 0.1 DEFAULT_SUCCESS_PROBABILITY = 0.5 DEFAULT_NUMBER_OF_TRIALS = 1 def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._num_simulations = kwargs.pop("num_simulations", CausalRefuter.DEFAULT_NUM_SIMULATIONS ) self._sample_size = kwargs.pop("sample_size", len(self._data)) required_variables = kwargs.pop("required_variables", True) self._noise = kwargs.pop("noise", BootstrapRefuter.DEFAULT_STD_DEV ) self._probability_of_change = kwargs.pop("probability_of_change", None) self._random_state = kwargs.pop("random_state", None) if 'logging_level' in kwargs: logging.basicConfig(level=kwargs['logging_level']) else: logging.basicConfig(level=logging.INFO) self.logger = logging.getLogger(__name__) self._chosen_variables = self.choose_variables(required_variables) if self._chosen_variables is None: self.logger.info("INFO: There are no chosen variables") else: self.logger.info("INFO: The chosen variables are: " + ",".join(self._chosen_variables)) if self._probability_of_change is None: if self._noise > 1: self.logger.error("Error in using noise:{} for Binary Flip. The value is greater than 1".format(self._noise)) raise ValueError("The value for Binary Flip cannot be greater than 1") else: self._probability_of_change = self._noise elif self._probability_of_change > 1: self.logger.error("The probability of flip is: {}, However, this value cannot be greater than 1".format(self._probability_of_change)) raise ValueError("Probability of Flip cannot be greater than 1")
[docs] def refute_estimate(self, *args, **kwargs): if self._sample_size > len(self._data): self.logger.warning("The sample size is larger than the population size") sample_estimates = np.zeros(self._num_simulations) self.logger.info("Refutation over {} simulated datasets of size {} each" .format(self._num_simulations ,self._sample_size ) ) for index in range(self._num_simulations): if self._random_state is None: new_data = resample(self._data, n_samples=self._sample_size ) else: new_data = resample(self._data, n_samples=self._sample_size, random_state=self._random_state ) if self._chosen_variables is not None: for variable in self._chosen_variables: if ('float' or 'int') in new_data[variable].dtype.name: scaling_factor = new_data[variable].std() new_data[variable] += np.random.normal(loc=0.0, scale=self._noise * scaling_factor,size=self._sample_size) elif 'bool' in new_data[variable].dtype.name: probs = np.random.uniform(0, 1, self._sample_size ) new_data[variable] = np.where(probs < self._probability_of_change, np.logical_not(new_data[variable]), new_data[variable]) elif 'category' in new_data[variable].dtype.name: categories = new_data[variable].unique() # Find the set difference for each row changed_data = new_data[variable].apply( lambda row: list( set(categories) - set([row]) ) ) # Choose one out of the remaining changed_data = changed_data.apply( lambda row: random.choice(row) ) new_data[variable] = np.where(probs < self._probability_of_change, changed_data) new_data[variable].astype('category') new_estimator = CausalEstimator.get_estimator_object(new_data, self._target_estimand, self._estimate) new_effect = new_estimator.estimate_effect() sample_estimates[index] = new_effect.value refute = CausalRefutation( self._estimate.value, np.mean(sample_estimates), refutation_type="Refute: Bootstrap Sample Dataset" ) # We want to see if the estimate falls in the same distribution as the one generated by the refuter # Ideally that should be the case as running bootstrap should not have a significant effect on the ability # of the treatment to affect the outcome refute.add_significance_test_results( self.test_significance(self._estimate, sample_estimates) ) return refute