{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DoWhy example on ihdp (Infant Health and Development Program) dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# importing required libraries\n",
    "import os, sys\n",
    "sys.path.append(os.path.abspath(\"../../\"))\n",
    "import dowhy\n",
    "from dowhy.do_why import CausalModel\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loading Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>treatment</th>\n",
       "      <th>y_factual</th>\n",
       "      <th>y_cfactual</th>\n",
       "      <th>mu0</th>\n",
       "      <th>mu1</th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>x3</th>\n",
       "      <th>x4</th>\n",
       "      <th>x5</th>\n",
       "      <th>...</th>\n",
       "      <th>x16</th>\n",
       "      <th>x17</th>\n",
       "      <th>x18</th>\n",
       "      <th>x19</th>\n",
       "      <th>x20</th>\n",
       "      <th>x21</th>\n",
       "      <th>x22</th>\n",
       "      <th>x23</th>\n",
       "      <th>x24</th>\n",
       "      <th>x25</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>5.599916</td>\n",
       "      <td>4.318780</td>\n",
       "      <td>3.268256</td>\n",
       "      <td>6.854457</td>\n",
       "      <td>-0.528603</td>\n",
       "      <td>-0.343455</td>\n",
       "      <td>1.128554</td>\n",
       "      <td>0.161703</td>\n",
       "      <td>-0.316603</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>6.875856</td>\n",
       "      <td>7.856495</td>\n",
       "      <td>6.636059</td>\n",
       "      <td>7.562718</td>\n",
       "      <td>-1.736945</td>\n",
       "      <td>-1.802002</td>\n",
       "      <td>0.383828</td>\n",
       "      <td>2.244320</td>\n",
       "      <td>-0.629189</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>2.996273</td>\n",
       "      <td>6.633952</td>\n",
       "      <td>1.570536</td>\n",
       "      <td>6.121617</td>\n",
       "      <td>-0.807451</td>\n",
       "      <td>-0.202946</td>\n",
       "      <td>-0.360898</td>\n",
       "      <td>-0.879606</td>\n",
       "      <td>0.808706</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>1.366206</td>\n",
       "      <td>5.697239</td>\n",
       "      <td>1.244738</td>\n",
       "      <td>5.889125</td>\n",
       "      <td>0.390083</td>\n",
       "      <td>0.596582</td>\n",
       "      <td>-1.850350</td>\n",
       "      <td>-0.879606</td>\n",
       "      <td>-0.004017</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>1.963538</td>\n",
       "      <td>6.202582</td>\n",
       "      <td>1.685048</td>\n",
       "      <td>6.191994</td>\n",
       "      <td>-1.045229</td>\n",
       "      <td>-0.602710</td>\n",
       "      <td>0.011465</td>\n",
       "      <td>0.161703</td>\n",
       "      <td>0.683672</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 30 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   treatment  y_factual  y_cfactual       mu0       mu1        x1        x2  \\\n",
       "0          1   5.599916    4.318780  3.268256  6.854457 -0.528603 -0.343455   \n",
       "1          0   6.875856    7.856495  6.636059  7.562718 -1.736945 -1.802002   \n",
       "2          0   2.996273    6.633952  1.570536  6.121617 -0.807451 -0.202946   \n",
       "3          0   1.366206    5.697239  1.244738  5.889125  0.390083  0.596582   \n",
       "4          0   1.963538    6.202582  1.685048  6.191994 -1.045229 -0.602710   \n",
       "\n",
       "         x3        x4        x5 ...   x16  x17  x18  x19  x20  x21  x22  x23  \\\n",
       "0  1.128554  0.161703 -0.316603 ...     1    1    1    1    0    0    0    0   \n",
       "1  0.383828  2.244320 -0.629189 ...     1    1    1    1    0    0    0    0   \n",
       "2 -0.360898 -0.879606  0.808706 ...     1    0    1    1    0    0    0    0   \n",
       "3 -1.850350 -0.879606 -0.004017 ...     1    0    1    1    0    0    0    0   \n",
       "4  0.011465  0.161703  0.683672 ...     1    1    1    1    0    0    0    0   \n",
       "\n",
       "   x24  x25  \n",
       "0    0    0  \n",
       "1    0    0  \n",
       "2    0    0  \n",
       "3    0    0  \n",
       "4    0    0  \n",
       "\n",
       "[5 rows x 30 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data= pd.read_csv(\"https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv\", header = None)\n",
    "col =  [\"treatment\", \"y_factual\", \"y_cfactual\", \"mu0\", \"mu1\" ,]\n",
    "\n",
    "for i in range(1,26):\n",
    "    col.append(\"x\"+str(i))\n",
    "data.columns = col\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1.Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:dowhy.do_why:Causal Graph not provided. DoWhy will construct a graph based on data inputs.\n",
      "INFO:dowhy.do_why:Model to find the causal effect of treatment ['treatment'] on outcome ['y_factual']\n"
     ]
    }
   ],
   "source": [
    "# Create a causal model from the data and given common causes.\n",
    "xs = \"\"\n",
    "for i in range(1,26):\n",
    "    xs += (\"x\"+str(i)+\"+\")\n",
    "    \n",
    "model=CausalModel(\n",
    "        data = data,\n",
    "        treatment='treatment',\n",
    "        outcome='y_factual',\n",
    "        common_causes=xs.split('+')\n",
    "        )\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.Identify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dowhy.causal_identifier:Common causes of treatment and outcome:['', 'x21', 'x22', 'x9', 'x8', 'x11', 'x16', 'x25', 'x4', 'x5', 'x20', 'x10', 'x17', 'x13', 'x7', 'x2', 'x23', 'x3', 'x24', 'x1', 'x15', 'x14', 'x6', 'x19', 'x18', 'x12']\n",
      "WARNING:dowhy.causal_identifier:There are unobserved common causes. Causal effect cannot be identified.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARN: Do you want to continue by ignoring these unobserved confounders? [y/n] y\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dowhy.causal_identifier:Instrumental variables for treatment and outcome:[]\n"
     ]
    }
   ],
   "source": [
    "#Identify the causal effect\n",
    "identified_estimand = model.identify_effect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Estimate (using different methods)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.1 Using Linear Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dowhy.causal_estimator:INFO: Using Linear Regression Estimator\n",
      "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "*** Causal Estimate ***\n",
      "\n",
      "## Target estimand\n",
      "Estimand type: ate\n",
      "### Estimand : 1\n",
      "Estimand name: iv\n",
      "No such variable found!\n",
      "### Estimand : 2\n",
      "Estimand name: backdoor\n",
      "Estimand expression:\n",
      "    d                                                                         \n",
      "──────────(Expectation(y_factual|x21,x22,x9,x8,x11,x16,x25,x4,x5,x20,x10,x17,x\n",
      "dtreatment                                                                    \n",
      "\n",
      "                                               \n",
      "13,x7,x2,x23,x3,x24,x1,x15,x14,x6,x19,x18,x12))\n",
      "                                               \n",
      "Estimand assumption 1, Unconfoundedness: If U→treatment and U→y_factual then P(y_factual|treatment,x21,x22,x9,x8,x11,x16,x25,x4,x5,x20,x10,x17,x13,x7,x2,x23,x3,x24,x1,x15,x14,x6,x19,x18,x12,U) = P(y_factual|treatment,x21,x22,x9,x8,x11,x16,x25,x4,x5,x20,x10,x17,x13,x7,x2,x23,x3,x24,x1,x15,x14,x6,x19,x18,x12)\n",
      "\n",
      "## Realized estimand\n",
      "b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n",
      "## Estimate\n",
      "Value: 3.928671750872715\n",
      "\n",
      "## Statistical Significance\n",
      "p-value: <0.001\n",
      "\n",
      "Causal Estimate is 3.92867175087\n",
      "ATE 4.02112101243\n"
     ]
    }
   ],
   "source": [
    "# Estimate the causal effect and compare it with Average Treatment Effect\n",
    "estimate = model.estimate_effect(identified_estimand,\n",
    "        method_name=\"backdoor.linear_regression\", test_significance=True\n",
    ")\n",
    "\n",
    "print(estimate)\n",
    "\n",
    "print(\"Causal Estimate is \" + str(estimate.value))\n",
    "data_1 = data[data[\"treatment\"]==1]\n",
    "data_0 = data[data[\"treatment\"]==0]\n",
    "\n",
    "print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.2 Using Propensity Score Matching"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Matching Estimator\n",
      "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Causal Estimate is 3.8436503200364402\n",
      "ATE 4.02112101243\n"
     ]
    }
   ],
   "source": [
    "estimate = model.estimate_effect(identified_estimand,\n",
    "        method_name=\"backdoor.propensity_score_matching\"\n",
    ")\n",
    "\n",
    "print(\"Causal Estimate is \" + str(estimate.value))\n",
    "\n",
    "print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.3 Using Propensity Score Stratification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Stratification Estimator\n",
      "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Causal Estimate is 4.0560672956\n",
      "ATE 4.02112101243\n"
     ]
    }
   ],
   "source": [
    "estimate = model.estimate_effect(identified_estimand,\n",
    "        method_name=\"backdoor.propensity_score_stratification\", method_params={'num_strata'50, 'clipping_threshold':5}\n",
    ")\n",
    "\n",
    "print(\"Causal Estimate is \" + str(estimate.value))\n",
    "print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 3.4 Using Propensity Score Weighting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n",
      "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Causal Estimate is 4.04761815345\n",
      "ATE 4.02112101243\n"
     ]
    }
   ],
   "source": [
    "estimate = model.estimate_effect(identified_estimand,\n",
    "        method_name=\"backdoor.propensity_score_weighting\"\n",
    ")\n",
    "\n",
    "print(\"Causal Estimate is \" + str(estimate.value))\n",
    "\n",
    "print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Refute\n",
    "##### Refute the obtained estimate using multiple robustness checks.\n",
    "##### 4.1 Adding a random common cause"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n",
      "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12+w_random\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Refute: Add a Random Common Cause\n",
      "Estimated effect:(4.0476181534545397,)\n",
      "New effect:(4.0480367100453618,)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "refute_results=model.refute_estimate(identified_estimand, estimate,\n",
    "        method_name=\"random_common_cause\")\n",
    "print(refute_results)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### 4.2 Using a placebo treatment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n",
      "INFO:dowhy.causal_estimator:b: y_factual~placebo+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Refute: Use a Placebo Treatment\n",
      "Estimated effect:(4.0476181534545397,)\n",
      "New effect:(0.057511331649253705,)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "res_placebo=model.refute_estimate(identified_estimand, estimate,\n",
    "        method_name=\"placebo_treatment_refuter\", placebo_type=\"permute\")\n",
    "print(res_placebo)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4.3 Data Subset Refuter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n",
      "INFO:dowhy.causal_estimator:b: y_factual~treatment+x21+x22+x9+x8+x11+x16+x25+x4+x5+x20+x10+x17+x13+x7+x2+x23+x3+x24+x1+x15+x14+x6+x19+x18+x12\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Refute: Use a subset of data\n",
      "Estimated effect:(4.0476181534545397,)\n",
      "New effect:(4.0274748385128563,)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "res_subset=model.refute_estimate(identified_estimand, estimate,\n",
    "        method_name=\"data_subset_refuter\", subset_fraction=0.9)\n",
    "print(res_subset)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}