{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# DoWhy example on ihdp (Infant Health and Development Program) dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# importing required libraries\n", "import os, sys\n", "sys.path.append(os.path.abspath(\"../../\"))\n", "import dowhy\n", "from dowhy.do_why import CausalModel\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loading Data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
treatmenty_factualy_cfactualmu0mu1x1x2x3x4x5...x16x17x18x19x20x21x22x23x24x25
015.5999164.3187803.2682566.854457-0.528603-0.3434551.1285540.161703-0.316603...1111000000
106.8758567.8564956.6360597.562718-1.736945-1.8020020.3838282.244320-0.629189...1111000000
202.9962736.6339521.5705366.121617-0.807451-0.202946-0.360898-0.8796060.808706...1011000000
301.3662065.6972391.2447385.8891250.3900830.596582-1.850350-0.879606-0.004017...1011000000
401.9635386.2025821.6850486.191994-1.045229-0.6027100.0114650.1617030.683672...1111000000
\n", "

5 rows × 30 columns

\n", "
" ], "text/plain": [ " treatment y_factual y_cfactual mu0 mu1 x1 x2 \\\n", "0 1 5.599916 4.318780 3.268256 6.854457 -0.528603 -0.343455 \n", "1 0 6.875856 7.856495 6.636059 7.562718 -1.736945 -1.802002 \n", "2 0 2.996273 6.633952 1.570536 6.121617 -0.807451 -0.202946 \n", "3 0 1.366206 5.697239 1.244738 5.889125 0.390083 0.596582 \n", "4 0 1.963538 6.202582 1.685048 6.191994 -1.045229 -0.602710 \n", "\n", " x3 x4 x5 ... x16 x17 x18 x19 x20 x21 x22 x23 \\\n", "0 1.128554 0.161703 -0.316603 ... 1 1 1 1 0 0 0 0 \n", "1 0.383828 2.244320 -0.629189 ... 1 1 1 1 0 0 0 0 \n", "2 -0.360898 -0.879606 0.808706 ... 1 0 1 1 0 0 0 0 \n", "3 -1.850350 -0.879606 -0.004017 ... 1 0 1 1 0 0 0 0 \n", "4 0.011465 0.161703 0.683672 ... 1 1 1 1 0 0 0 0 \n", "\n", " x24 x25 \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", "[5 rows x 30 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data= pd.read_csv(\"https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv\", header = None)\n", "col = [\"treatment\", \"y_factual\", \"y_cfactual\", \"mu0\", \"mu1\" ,]\n", "\n", "for i in range(1,26):\n", " col.append(\"x\"+str(i))\n", "data.columns = col\n", "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.Model" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:dowhy.do_why:Causal Graph not provided. 
DoWhy will construct a graph based on data inputs.\n", "INFO:dowhy.do_why:Model to find the causal effect of treatment ['treatment'] on outcome ['y_factual']\n" ] } ], "source": [ "# Create a causal model from the data and given common causes.\n", "# Build the covariate names x1..x25 directly as a list. Joining them with '+'\n", "# and splitting again leaves a trailing '' that is then passed to CausalModel\n", "# as an extra, nameless common cause.\n", "common_causes = [\"x\" + str(i) for i in range(1, 26)]\n", "\n", "model=CausalModel(\n", " data = data,\n", " treatment='treatment',\n", " outcome='y_factual',\n", " common_causes=common_causes\n", " )\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2.Identify" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:dowhy.causal_identifier:Common causes of treatment and outcome:['', 'x21', 'x22', 'x9', 'x8', 'x11', 'x16', 'x25', 'x4', 'x5', 'x20', 'x10', 'x17', 'x13', 'x7', 'x2', 'x23', 'x3', 'x24', 'x1', 'x15', 'x14', 'x6', 'x19', 'x18', 'x12']\n", "WARNING:dowhy.causal_identifier:There are unobserved common causes. Causal effect cannot be identified.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "WARN: Do you want to continue by ignoring these unobserved confounders? [y/n] y\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:dowhy.causal_identifier:Instrumental variables for treatment and outcome:[]\n" ] } ], "source": [ "#Identify the causal effect\n", "identified_estimand = model.identify_effect()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. 
Estimate (using different methods)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 3.1 Using Linear Regression" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:dowhy.causal_estimator:INFO: Using Linear Regression Estimator\n", "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "*** Causal Estimate ***\n", "\n", "## Target estimand\n", "Estimand type: ate\n", "### Estimand : 1\n", "Estimand name: iv\n", "No such variable found!\n", "### Estimand : 2\n", "Estimand name: backdoor\n", "Estimand expression:\n", " d \n", "──────────(Expectation(y_factual|x21,x22,x9,x8,x11,x16,x25,x4,x5,x20,x10,x17,x\n", "dtreatment \n", "\n", " \n", "13,x7,x2,x23,x3,x24,x1,x15,x14,x6,x19,x18,x12))\n", " \n", "Estimand assumption 1, Unconfoundedness: If U→treatment and U→y_factual then P(y_factual|treatment,x21,x22,x9,x8,x11,x16,x25,x4,x5,x20,x10,x17,x13,x7,x2,x23,x3,x24,x1,x15,x14,x6,x19,x18,x12,U) = P(y_factual|treatment,x21,x22,x9,x8,x11,x16,x25,x4,x5,x20,x10,x17,x13,x7,x2,x23,x3,x24,x1,x15,x14,x6,x19,x18,x12)\n", "\n", "## Realized estimand\n", "b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n", "## Estimate\n", "Value: 3.928671750872715\n", "\n", "## Statistical Significance\n", "p-value: <0.001\n", "\n", "Causal Estimate is 3.92867175087\n", "ATE 4.02112101243\n" ] } ], "source": [ "# Estimate the causal effect and compare it with Average Treatment Effect\n", "estimate = model.estimate_effect(identified_estimand,\n", " method_name=\"backdoor.linear_regression\", test_significance=True\n", ")\n", "\n", "print(estimate)\n", "\n", "print(\"Causal Estimate is \" + str(estimate.value))\n", "data_1 = data[data[\"treatment\"]==1]\n", "data_0 = 
data[data[\"treatment\"]==0]\n", "\n", "print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 3.2 Using Propensity Score Matching" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Matching Estimator\n", "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Causal Estimate is 3.8436503200364402\n", "ATE 4.02112101243\n" ] } ], "source": [ "estimate = model.estimate_effect(identified_estimand,\n", " method_name=\"backdoor.propensity_score_matching\"\n", ")\n", "\n", "print(\"Causal Estimate is \" + str(estimate.value))\n", "\n", "print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 3.3 Using Propensity Score Stratification" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Stratification Estimator\n", "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Causal Estimate is 4.0560672956\n", "ATE 4.02112101243\n" ] } ], "source": [ "# Estimate the effect with propensity score stratification; num_strata and\n", "# clipping_threshold are handed to the estimator through method_params.\n", "estimate = model.estimate_effect(identified_estimand,\n", " method_name=\"backdoor.propensity_score_stratification\", method_params={'num_strata':50, 'clipping_threshold':5}\n", ")\n", "\n", "print(\"Causal Estimate is \" + str(estimate.value))\n", "print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n", "\n" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "#### 3.4 Using Propensity Score Weighting" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n", "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Causal Estimate is 4.04761815345\n", "ATE 4.02112101243\n" ] } ], "source": [ "estimate = model.estimate_effect(identified_estimand,\n", " method_name=\"backdoor.propensity_score_weighting\"\n", ")\n", "\n", "print(\"Causal Estimate is \" + str(estimate.value))\n", "\n", "print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 4. Refute\n", "##### Refute the obtained estimate using multiple robustness checks.\n", "##### 4.1 Adding a random common cause" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n", "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12+w_random\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Refute: Add a Random Common Cause\n", "Estimated effect:(4.0476181534545397,)\n", "New effect:(4.0480367100453618,)\n", "\n" ] } ], "source": [ "refute_results=model.refute_estimate(identified_estimand, estimate,\n", " method_name=\"random_common_cause\")\n", "print(refute_results)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### 4.2 Using a placebo treatment" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ 
"INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n", "INFO:dowhy.causal_estimator:b: y_factual~placebo+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Refute: Use a Placebo Treatment\n", "Estimated effect:(4.0476181534545397,)\n", "New effect:(0.057511331649253705,)\n", "\n" ] } ], "source": [ "res_placebo=model.refute_estimate(identified_estimand, estimate,\n", " method_name=\"placebo_treatment_refuter\", placebo_type=\"permute\")\n", "print(res_placebo)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 4.3 Data Subset Refuter" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n", "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Refute: Use a subset of data\n", "Estimated effect:(4.0476181534545397,)\n", "New effect:(4.0274748385128563,)\n", "\n" ] } ], "source": [ "res_subset=model.refute_estimate(identified_estimand, estimate,\n", " method_name=\"data_subset_refuter\", subset_fraction=0.9)\n", "print(res_subset)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }