{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Getting started with DoWhy: A simple example\n", "This is a quick introduction to the DoWhy causal inference library.\n", "We will load in a sample dataset and estimate the causal effect of a (pre-specified)treatment variable on a (pre-specified) outcome variable.\n", "\n", "First, let us add the required path for Python to find the DoWhy code and load all required packages." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os, sys\n", "sys.path.append(os.path.abspath(\"../../../\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's check the python version. " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.6.9 (default, Nov 7 2019, 10:44:02) \n", "[GCC 8.3.0]\n" ] } ], "source": [ "print(sys.version)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "import dowhy\n", "from dowhy import CausalModel\n", "import dowhy.datasets " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, let us load a dataset. For simplicity, we simulate a dataset with linear relationships between common causes and treatment, and common causes and outcome. \n", "\n", "Beta is the true causal effect. " ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(10000, 5)\n", "(10000,)\n" ] }, { "ename": "IndexError", "evalue": "arrays used as indices must be of integer (or boolean) type", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mnum_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10000\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mtreatment_is_binary\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m num_discrete_common_causes=1)\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"df\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/mnt/c/Users/amshar/code/dowhy/dowhy/datasets.py\u001b[0m in \u001b[0;36mlinear_dataset\u001b[0;34m(beta, num_common_causes, num_samples, num_instruments, num_effect_modifiers, num_treatments, treatment_is_binary, outcome_is_binary, num_discrete_common_causes, num_discrete_instruments, num_discrete_effect_modifiers)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mw_bins\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquantile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mW\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mq\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mquantiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0mW\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw_index\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdigitize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mW\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbins\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mw_bins\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 50\u001b[0;31m \u001b[0mdummy_vecs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meye\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquantiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mW\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw_index\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 51\u001b[0m \u001b[0mW_dummy\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdelete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mW_dummy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0mW_dummy\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcatenate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mW_dummy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdummy_vecs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mIndexError\u001b[0m: arrays used as indices must be of integer (or boolean) type" ] } ], "source": [ "data = dowhy.datasets.linear_dataset(beta=10,\n", " num_common_causes=5,\n", " num_instruments = 2,\n", " num_effect_modifiers=1,\n", " num_samples=10000, \n", " treatment_is_binary=True,\n", " num_discrete_common_causes=1)\n", "df = data[\"df\"]\n", "print(df.head())\n", "print(data[\"dot_graph\"])\n", "print(\"\\n\")\n", "print(data[\"gml_graph\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that we are using a pandas dataframe to load the data. At present, DoWhy only supports pandas dataframe as input." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Interface 1 (recommended): Input causal graph" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now input a causal graph in the GML graph format (recommended). You can also use the DOT format." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# With graph\n", "model=CausalModel(\n", " data = df,\n", " treatment=data[\"treatment_name\"],\n", " outcome=data[\"outcome_name\"],\n", " graph=data[\"gml_graph\"]\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.view_model()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "from IPython.display import Image, display\n", "display(Image(filename=\"causal_model.png\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The above causal graph shows the assumptions encoded in the causal model. We can now use this graph to first identify \n", "the causal effect (go from a causal estimand to a probability expression), and then estimate the causal effect." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**DoWhy philosophy: Keep identification and estimation separate**\n", "\n", "Identification can be achieved without access to the data, acccesing only the graph. This results in an expression to be computed. This expression can then be evaluated using the available data in the estimation step.\n", "It is important to understand that these are orthogonal steps.\n", "\n", "* Identification" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "identified_estimand = model.identify_effect()\n", "print(identified_estimand)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you want to disable the warning for ignoring unobserved confounders, you can add a parameter flag ( *proceed\\_when\\_unidentifiable* ). The same parameter can also be added when instantiating the CausalModel object. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)\n", "print(identified_estimand)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Estimation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "causal_estimate = model.estimate_effect(identified_estimand,\n", " method_name=\"backdoor.propensity_score_stratification\")\n", "print(causal_estimate)\n", "print(\"Causal Estimate is \" + str(causal_estimate.value))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can input additional parameters to the estimate_effect method. For instance, to estimate the effect on any subset of the units, you can specify the \"target_units\" parameter which can be a string (\"ate\", \"att\", or \"atc\"), lambda function that filters rows of the data frame, or a new dataframe on which to compute the effect. You can also specify \"effect modifiers\" to estimate heterogeneous effects across these variables. See `help(CausalModel.estimate_effect)`. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Causal effect on the control group (ATC)\n", "causal_estimate_att = model.estimate_effect(identified_estimand,\n", " method_name=\"backdoor.propensity_score_stratification\",\n", " target_units = \"atc\")\n", "print(causal_estimate_att)\n", "print(\"Causal Estimate is \" + str(causal_estimate_att.value))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Interface 2: Specify common causes and instruments" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Without graph \n", "model= CausalModel( \n", " data=df, \n", " treatment=data[\"treatment_name\"], \n", " outcome=data[\"outcome_name\"], \n", " common_causes=data[\"common_causes_names\"],\n", " effect_modifiers=data[\"effect_modifier_names\"]) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.view_model()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.display import Image, display\n", "display(Image(filename=\"causal_model.png\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We get the same causal graph. Now identification and estimation is done as before." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "identified_estimand = model.identify_effect(proceed_when_unidentifiable=True) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Estimation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "estimate = model.estimate_effect(identified_estimand,\n", " method_name=\"backdoor.propensity_score_stratification\") \n", "print(estimate)\n", "print(\"Causal Estimate is \" + str(estimate.value))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Refuting the estimate\n", "\n", "Let us now look at ways of refuting the estimate obtained." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adding a random common cause variable" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res_random=model.refute_estimate(identified_estimand, estimate, method_name=\"random_common_cause\")\n", "print(res_random)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Adding an unobserved common cause variable" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res_unobserved=model.refute_estimate(identified_estimand, estimate, method_name=\"add_unobserved_common_cause\",\n", " confounders_effect_on_treatment=\"binary_flip\", confounders_effect_on_outcome=\"linear\",\n", " effect_strength_on_treatment=0.01, effect_strength_on_outcome=0.02)\n", "print(res_unobserved)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Replacing treatment with a random (placebo) variable" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res_placebo=model.refute_estimate(identified_estimand, estimate,\n", " method_name=\"placebo_treatment_refuter\", placebo_type=\"permute\")\n", "print(res_placebo)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Removing a random subset of the data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res_subset=model.refute_estimate(identified_estimand, estimate,\n", " method_name=\"data_subset_refuter\", subset_fraction=0.9)\n", "print(res_subset)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As you can see, the propensity score stratification estimator is reasonably robust to refutations.\n", "For reproducibility, you can add a parameter \"random_seed\" to any refutation method, as shown below." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res_subset=model.refute_estimate(identified_estimand, estimate,\n", " method_name=\"data_subset_refuter\", subset_fraction=0.9, random_seed = 1)\n", "print(res_subset)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }