Source code for dowhy.causal_refuters.overrule.BCS.load_process_data_BCS

"""Code for Binarizing Features.

This module implements the feature binarization step used by the boolean ruleset estimator from OverRule [1].
Code is adapted (with some simplifications) from the original OverRule implementation, under the MIT License.

[1] Oberst, M., Johansson, F., Wei, D., Gao, T., Brat, G., Sontag, D., & Varshney, K. (2020). Characterization of
Overlap in Observational Studies. In S. Chiappa & R. Calandra (Eds.), Proceedings of the Twenty Third International
Conference on Artificial Intelligence and Statistics (Vol. 108, pp. 788–798). PMLR.
"""

from typing import Dict, List

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class FeatureBinarizer(TransformerMixin):
    """
    Transformer for binarizing categorical and ordinal (including continuous) features.

    Note that all features are converted into binary variables before learning Boolean rules.

    After :meth:`fit` the following attributes are available:

    * `maps` = dictionary of mappings for unary/binary columns
    * `enc` = dictionary of OneHotEncoders for categorical columns
    * `thresh` = dictionary of arrays of thresholds for ordinal columns
    * `NaN` = list of ordinal columns containing NaN values
    """

    def __init__(
        self,
        colCateg: List[str] = [],
        numThresh: int = 9,
        negations: bool = False,
        threshStr: bool = False,
        threshOverride: Dict = {},
        **kwargs,
    ):
        """
        Initialize transformer for binarizing categorical and ordinal (including continuous) features

        :param colCateg: List of categorical columns, defaults to [], 'object' dtype automatically treated as
            categorical. A Series, Index or array of names is converted to a list; any other single value
            (e.g. one column name) is wrapped in a list; None is treated as an empty list.
        :type colCateg: List[str], optional
        :param numThresh: Number of quantile thresholds to binarize ordinal features, defaults to 9
        :type numThresh: int, optional
        :param negations: Include negations, defaults to False
        :type negations: bool, optional
        :param threshStr: Convert thresholds to strings, defaults to False
        :type threshStr: bool, optional
        :param threshOverride: Dictionary to override quantile thresholds, defaults to {}, formatted as
            `{colname : np.linspace object}` to define cuts
        :type threshOverride: Dict, optional
        :param kwargs: Accepted for call compatibility and ignored.
        """
        # List of categorical columns. The containers are always copied so that neither the
        # shared default objects in the signature nor the caller's own objects are aliased
        # by this instance.
        if colCateg is None:
            self.colCateg = []
        elif isinstance(colCateg, (pd.Series, pd.Index, np.ndarray)):
            self.colCateg = colCateg.tolist()
        elif isinstance(colCateg, list):
            self.colCateg = list(colCateg)
        else:
            # A single column label (a tuple may be one label of a MultiIndex, so it is not unpacked)
            self.colCateg = [colCateg]

        self.threshOverride = {} if threshOverride is None else dict(threshOverride)
        # Number of quantile thresholds used to binarize ordinal features
        self.numThresh = numThresh
        self.thresh: Dict[str, np.ndarray] = {}
        # whether to append negations
        self.negations = negations
        # whether to convert thresholds on ordinal features to strings
        self.threshStr = threshStr

    @staticmethod
    def _new_encoder():
        """Build a dense, integer-valued OneHotEncoder that ignores unseen categories.

        scikit-learn renamed the ``sparse`` keyword to ``sparse_output``; the new name is
        tried first and the old one is used only when the installed version rejects it.
        """
        try:
            return OneHotEncoder(sparse_output=False, dtype=int, handle_unknown="ignore")
        except TypeError:
            return OneHotEncoder(sparse=False, dtype=int, handle_unknown="ignore")

    @staticmethod
    def _is_ordinal_dtype(dtype) -> bool:
        """Return True for NumPy integer or floating dtypes of any width.

        Non-NumPy (pandas extension) dtypes return False so that they are reported as
        skipped instead of raising inside ``np.issubdtype``.
        """
        if not isinstance(dtype, np.dtype):
            return False
        return bool(np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.floating))

    def fit(self, X):
        """
        Fit to data, including the learning of thresholds where appropriate.

        Sets the following internal variables:

        * `maps` = dictionary of mappings for unary/binary columns
        * `enc` = dictionary of OneHotEncoders for categorical columns
        * `thresh` = dictionary of lists of thresholds for ordinal columns
        * `NaN` = list of ordinal columns containing NaN values

        Columns are classified in this order: at most two distinct non-null values -> `maps`;
        listed in `colCateg` or of 'object' dtype -> `enc`; integer or floating dtype -> `thresh`.
        Any other column is reported with a printed message and left out.

        :param X: Original features as a Pandas Dataframe
        :type X: pd.DataFrame
        :return: The fitted transformer itself
        """
        data = X
        # Quantile probabilities: numThresh evenly spaced values strictly between 0 and 1
        quantProb = np.linspace(1.0 / (self.numThresh + 1.0), self.numThresh / (self.numThresh + 1.0), self.numThresh)
        # Initialize
        maps = {}
        enc = {}
        thresh = {}
        NaN = []

        # Iterate over columns
        for c in data:
            # number of unique values (nunique does not count NaN)
            valUniq = data[c].nunique()

            # Constant or binary column
            if valUniq <= 2:
                # Mapping to 0, 1. NaN is dropped before taking the unique values so that the
                # index has exactly valUniq entries, matching range(valUniq).
                maps[c] = pd.Series(range(valUniq), index=np.sort(data[c].dropna().unique()))

            # Categorical column
            elif (c in self.colCateg) or (data[c].dtype == "object"):
                # OneHotEncoder object
                enc[c] = self._new_encoder()
                # Fit to observed categories
                enc[c].fit(data[[c]])

            # Ordinal column
            elif self._is_ordinal_dtype(data[c].dtype):
                # Few unique values
                if valUniq <= self.numThresh + 1:
                    # Thresholds are sorted unique values excluding maximum
                    # NOTE(review): unique() keeps NaN, so when the column has NaN the last
                    # sorted entry is presumably NaN and the maximum stays a threshold - confirm
                    # whether that is intended.
                    thresh[c] = np.sort(data[c].unique())[:-1]
                # Many unique values: a user-supplied override wins over quantiles
                elif c in self.threshOverride:
                    # Coerce to an array so transform can broadcast and call .astype on it
                    thresh[c] = np.asarray(self.threshOverride[c])
                else:
                    # Thresholds are quantiles excluding repetitions
                    thresh[c] = data[c].quantile(q=quantProb).unique()
                if data[c].isnull().any():
                    # Contains NaN values
                    NaN.append(c)

            else:
                print(f"Skipping column '{c!s}': data type cannot be handled")

        self.maps = maps
        self.enc = enc
        self.thresh = thresh
        self.NaN = NaN
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform data into binary features.

        The returned frame has three-level columns `(feature, operation, value)`:
        binary columns give `(name, "", "")` (plus `(name, "not", "")` with negations),
        categorical columns give `(name, "==", category)` (plus `"!="`), and ordinal columns
        give `(name, "<=", threshold)` (plus `">"`), with extra `(name, "==", "NaN")` /
        `(name, "!=", "NaN")` indicators for ordinal columns that contained NaN during fit.

        :param X: Original features as a Pandas Dataframe
        :type X: pd.DataFrame
        :return A: Binary feature dataframe
        :type A: pd.DataFrame
        """
        data = X
        maps = self.maps
        enc = self.enc
        thresh = self.thresh
        NaN = self.NaN

        # Initialize dataframe
        A = pd.DataFrame(
            index=data.index, columns=pd.MultiIndex.from_arrays([[], [], []], names=["feature", "operation", "value"])
        )

        # Iterate over columns
        for c in data:
            name = str(c)

            # Constant or binary column
            if c in maps:
                # Rename values to 0, 1
                A[(name, "", "")] = data[c].map(maps[c])
                if self.negations:
                    A[(name, "not", "")] = 1 - A[(name, "", "")]

            # Categorical column
            elif c in enc:
                # Apply OneHotEncoder
                Anew = enc[c].transform(data[[c]])
                Anew = pd.DataFrame(Anew, index=data.index, columns=enc[c].categories_[0].astype(str))
                if self.negations:
                    # Append negations
                    Anew = pd.concat([Anew, 1 - Anew], axis=1, keys=[(name, "=="), (name, "!=")])
                else:
                    Anew.columns = pd.MultiIndex.from_product([[name], ["=="], Anew.columns])
                # Concatenate
                A = pd.concat([A, Anew], axis=1)

            # Ordinal column
            elif c in thresh:
                # Threshold values to produce binary arrays (rows x thresholds via broadcasting)
                Anew = (data[c].values[:, np.newaxis] <= thresh[c]).astype(int)
                if self.negations:
                    # Append negations
                    Anew = np.concatenate((Anew, 1 - Anew), axis=1)
                    ops = ["<=", ">"]
                else:
                    ops = ["<="]
                # Convert to dataframe with column labels
                labels = thresh[c].astype(str) if self.threshStr else thresh[c]
                Anew = pd.DataFrame(
                    Anew, index=data.index, columns=pd.MultiIndex.from_product([[name], ops, labels])
                )
                if c in NaN:
                    # Ensure that rows corresponding to NaN values are zeroed out
                    # (otherwise the ">" negations would be 1 for missing values)
                    indNull = data[c].isnull()
                    Anew.loc[indNull] = 0
                    # Add NaN indicator column
                    Anew[(name, "==", "NaN")] = indNull.astype(int)
                    if self.negations:
                        Anew[(name, "!=", "NaN")] = (~indNull).astype(int)
                # Concatenate
                A = pd.concat([A, Anew], axis=1)

            else:
                print(f"Skipping column '{c!s}': data type cannot be handled")

        return A