Source code for dowhy.utils.regression

import numpy as np
from econml.sklearn_extensions.model_selection import GridSearchCVList
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def get_numeric_features(X):
    """Find the numeric feature columns in a dataset.

    :param X: pandas dataframe
    :returns: list of positional (integer) indices of the numeric columns in X
    """
    # select_dtypes("number") yields the numeric column names; map each
    # name to its positional index so downstream ColumnTransformers can use it.
    return [X.columns.get_loc(col_name) for col_name in X.select_dtypes("number")]
def get_generic_regressor(
    cv, X, Y, max_degree=3, estimator_list=None, estimator_param_list=None, numeric_features=None
):
    """Find the best estimator for the regression function (g_s).

    :param cv: training and testing data indices obtained after KFolding the dataset
    :param X: regressors data for training the regression model
    :param Y: outcome data for training the regression model
    :param max_degree: degree of the polynomial function used to approximate the regression function
    :param estimator_list: list of estimator objects for finding the regression function
    :param estimator_param_list: list of dictionaries with parameters for tuning respective estimators in estimator_list
    :param numeric_features: list of indices of numeric features in the dataset
    :returns: estimator for Reisz Regression function
    """
    # Caller-supplied candidates take precedence over the built-in defaults.
    if estimator_list is not None:
        search = GridSearchCVList(
            estimator_list,
            estimator_param_list,
            cv=cv,
            scoring="neg_mean_squared_error",
            n_jobs=-1,
        ).fit(X, Y)
        return search.best_estimator_

    # Default candidates: a random forest, a scaled Lasso pipeline (numeric
    # columns standardized, the rest passed through), and gradient boosting.
    lasso_pipeline = Pipeline(
        [
            (
                "scale",
                ColumnTransformer([("num", StandardScaler(), numeric_features)], remainder="passthrough"),
            ),
            ("lasso_model", Lasso()),
        ]
    )
    default_estimators = [
        RandomForestRegressor(n_estimators=100, random_state=120),
        lasso_pipeline,
        GradientBoostingRegressor(),
    ]
    default_param_grids = [
        {"n_estimators": [50], "max_depth": [3, 4, 5], "min_samples_leaf": [10, 50]},
        {"lasso_model__alpha": [0.01, 0.001, 1e-4, 1e-5, 1e-6]},
        {"learning_rate": [0.01, 0.001], "n_estimators": [50, 200]},
    ]
    search = GridSearchCVList(
        default_estimators,
        param_grid_list=default_param_grids,
        cv=cv,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
    ).fit(X, Y)
    return search.best_estimator_
def generate_moment_function(W, g):
    """Generate and return the moment function m(W, g) = g(1, W) - g(0, W) for the Average Causal Effect.

    :param W: 2-D array whose first column is the treatment (assumed one-dimensional)
    :param g: regression function; callable on arrays shaped like W
    :returns: g evaluated at treatment = 1 minus g evaluated at treatment = 0, row-wise
    """
    shape = (W.shape[0], 1)
    ones = np.ones(shape)
    zeros = np.zeros(shape)
    non_treatment_data = W[:, 1:]  # assume that treatment is the first column and one-dimensional
    # NOTE: the original inline comments were swapped; corrected here.
    data_0 = np.hstack([zeros, non_treatment_data])  # data with treatment = 0
    data_1 = np.hstack([ones, non_treatment_data])  # data with treatment = 1
    return g(data_1) - g(data_0)
def create_polynomial_function(max_degree):
    """Create a list of polynomial functions of the first column, degrees 0..max_degree.

    :param max_degree: highest degree of the polynomial functions to be created
    :returns: list of max_degree + 1 functions; the k-th maps x to x[:, [0]] ** k
    """
    polynomial_function = []
    for degree in range(max_degree + 1):
        # Bind the current degree as a default argument. With a free variable
        # the closure would be late-bound and every function would compute
        # x ** max_degree instead of its own power.
        def poly_term(x, degree=degree):
            return x[:, [0]] ** degree

        polynomial_function.append(poly_term)
    return polynomial_function