Source code for dowhy.causal_refuters.overrule.BCS.load_process_data_BCS

"""Code for Binarizing Features.

This module implements the boolean ruleset estimator from OverRule [1]. Code is adapted (with some simplifications)
from https://github.com/clinicalml/overlap-code, under the MIT License.

[1] Oberst, M., Johansson, F., Wei, D., Gao, T., Brat, G., Sontag, D., & Varshney, K. (2020). Characterization of
Overlap in Observational Studies. In S. Chiappa & R. Calandra (Eds.), Proceedings of the Twenty Third International
Conference on Artificial Intelligence and Statistics (Vol. 108, pp. 788–798). PMLR. https://arxiv.org/abs/1907.04138
"""

from typing import Dict, List

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.preprocessing import OneHotEncoder


[docs]class FeatureBinarizer(TransformerMixin):
    """
    Transformer for binarizing categorical and ordinal (including continuous) features.

    Note that all features are converted into binary variables before learning Boolean rules.
    """

    def __init__(
        self,
        colCateg: List[str] = [],
        numThresh: int = 9,
        negations: bool = False,
        threshStr: bool = False,
        threshOverride: Dict = {},
        **kwargs,
    ):
        """
        Initialize transformer for binarizing categorical and ordinal (including continuous) features

        :param colCateg: List of categorical columns, defaults to [], 'object' dtype automatically treated as categorical
        :type colCateg: List[str], optional
        :param numThresh: Number of quantile thresholds to binarize ordinal features, defaults to 9
        :type numThresh: int, optional
        :param negations: Include negations, defaults to False
        :type negations: bool, optional
        :param threshStr: Convert thresholds to strings, defaults to False
        :type threshStr: bool, optional
        :param threshOverride: Dictionary to override quantile thresholds, defaults to {},
            formatted as `{colname : np.linspace object}` to define cuts
        :type threshOverride: Dict, optional
        """
        # List of categorical columns
        if type(colCateg) is pd.Series:
            self.colCateg = colCateg.tolist()
        elif type(colCateg) is not list:
            self.colCateg = [colCateg]
        else:
            self.colCateg = colCateg

        self.threshOverride = {} if threshOverride is None else threshOverride
        # Number of quantile thresholds used to binarize ordinal features
        self.numThresh = numThresh
        self.thresh: Dict[str, np.ndarray] = {}
        # whether to append negations
        self.negations = negations
        # whether to convert thresholds on ordinal features to strings
        self.threshStr = threshStr

[docs]    def fit(self, X):
        """
        Fit to data, including the learning of thresholds where appropriate.

        Sets the following internal variables:
        * `maps` = dictionary of mappings for unary/binary columns
        * `enc` = dictionary of OneHotEncoders for categorical columns
        * `thresh` = dictionary of lists of thresholds for ordinal columns
        * `NaN` = list of ordinal columns containing NaN values

        :param X: Original features as a Pandas Dataframe
        :type X: pd.DataFrame
        """
        data = X
        # Quantile probabilities
        quantProb = np.linspace(1.0 / (self.numThresh + 1.0), self.numThresh / (self.numThresh + 1.0), self.numThresh)
        # Initialize
        maps = {}
        enc = {}
        thresh = {}
        NaN = []

        # Iterate over columns
        for c in data:
            # number of unique values
            valUniq = data[c].nunique()

            # Constant or binary column
            if valUniq <= 2:
                # Mapping to 0, 1
                maps[c] = pd.Series(range(valUniq), index=np.sort(data[c].unique()))

            # Categorical column
            elif (c in self.colCateg) or (data[c].dtype == "object"):
                # OneHotEncoder object
                enc[c] = OneHotEncoder(sparse=False, dtype=int, handle_unknown="ignore")
                # Fit to observed categories
                enc[c].fit(data[[c]])

            # Ordinal column
            elif np.issubdtype(data[c].dtype, np.dtype(int).type) | np.issubdtype(data[c].dtype, np.dtype(float).type):
                # Few unique values
                if valUniq <= self.numThresh + 1:
                    # Thresholds are sorted unique values excluding maximum
                    thresh[c] = np.sort(data[c].unique())[:-1]
                # Many unique values
                elif c in self.threshOverride.keys():
                    thresh[c] = self.threshOverride[c]
                else:
                    # Thresholds are quantiles excluding repetitions
                    thresh[c] = data[c].quantile(q=quantProb).unique()
                if data[c].isnull().any():
                    # Contains NaN values
                    NaN.append(c)

            else:
                print(("Skipping column '" + str(c) + "': data type cannot be handled"))
                continue

        self.maps = maps
        self.enc = enc
        self.thresh = thresh
        self.NaN = NaN
        return self

[docs]    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform data into binary features.

        :param X: Original features as a Pandas Dataframe
        :type X: pd.DataFrame
        :return A: Binary feature dataframe
        :type A: pd.DataFrame
        """
        data = X
        maps = self.maps
        enc = self.enc
        thresh = self.thresh
        NaN = self.NaN

        # Initialize dataframe
        A = pd.DataFrame(
            index=data.index, columns=pd.MultiIndex.from_arrays([[], [], []], names=["feature", "operation", "value"])
        )

        # Iterate over columns
        for c in data:
            # Constant or binary column
            if c in maps:
                # Rename values to 0, 1
                A[(str(c), "", "")] = data[c].map(maps[c])
                if self.negations:
                    A[(str(c), "not", "")] = 1 - A[(str(c), "", "")]

            # Categorical column
            elif c in enc:
                # Apply OneHotEncoder
                Anew = enc[c].transform(data[[c]])
                Anew = pd.DataFrame(Anew, index=data.index, columns=enc[c].categories_[0].astype(str))
                if self.negations:
                    # Append negations
                    Anew = pd.concat([Anew, 1 - Anew], axis=1, keys=[(str(c), "=="), (str(c), "!=")])
                else:
                    Anew.columns = pd.MultiIndex.from_product([[str(c)], ["=="], Anew.columns])
                # Concatenate
                A = pd.concat([A, Anew], axis=1)

            # Ordinal column
            elif c in thresh:
                # Threshold values to produce binary arrays
                Anew = (data[c].values[:, np.newaxis] <= thresh[c]).astype(int)
                if self.negations:
                    # Append negations
                    Anew = np.concatenate((Anew, 1 - Anew), axis=1)
                    ops = ["<=", ">"]
                else:
                    ops = ["<="]
                # Convert to dataframe with column labels
                if self.threshStr:
                    Anew = pd.DataFrame(
                        Anew,
                        index=data.index,
                        columns=pd.MultiIndex.from_product([[str(c)], ops, thresh[c].astype(str)]),
                    )
                else:
                    Anew = pd.DataFrame(
                        Anew, index=data.index, columns=pd.MultiIndex.from_product([[str(c)], ops, thresh[c]])
                    )
                if c in NaN:
                    # Ensure that rows corresponding to NaN values are zeroed out
                    indNull = data[c].isnull()
                    Anew.loc[indNull] = 0
                    # Add NaN indicator column
                    Anew[(str(c), "==", "NaN")] = indNull.astype(int)
                    if self.negations:
                        Anew[(str(c), "!=", "NaN")] = (~indNull).astype(int)
                # Concatenate
                A = pd.concat([A, Anew], axis=1)

            else:
                print(("Skipping column '" + str(c) + "': data type cannot be handled"))
                continue

        return A