# Source code for dowhy.gcm.util.catboost_encoder

from typing import Optional

import numpy as np
from sklearn.preprocessing import LabelEncoder


class CatBoostEncoder:
    """Target encoder for a single categorical feature, following the CatBoost scheme.

    Implements the proposed method from "CatBoost: gradient boosting with categorical features support",
    Dorogush et al. (2018). In this implementation we follow Eq. (1) in https://arxiv.org/pdf/1810.11363.pdf:
    each row is encoded from the target values of the *previous* rows of the same category, smoothed towards
    a prior value ``alpha`` that is weighted by ``p``.
    """

    def __init__(self, p: float = 1, alpha: Optional[float] = None):
        """See Eq. (1) in https://arxiv.org/pdf/1810.11363.pdf

        :param p: The p parameter in the equation. This weights the impact of the given alpha.
        :param alpha: Alpha parameter in the equation. If None is given, the global mean is used as suggested in
                      "A preprocessing scheme for high-cardinality categorical attributes in classification and
                      prediction problems", Micci-Barreca (2001)
        """
        self._p = p
        self._org_alpha = alpha
        # Effective prior; resolved during fitting (either the user supplied alpha or the mean of the targets).
        self._alpha = None
        # Maps category -> smoothed target mean; None marks the encoder as not fitted yet.
        self._category_means = None

    def fit(self, X: np.ndarray, Y: np.ndarray, use_alpha_when_unique: bool = True) -> None:
        """Fits the Catboost encoder following https://arxiv.org/pdf/1810.11363.pdf Eq. (1).

        :param X: Input categorical data.
        :param Y: Target data (continuous or categorical)
        :param use_alpha_when_unique: If True, uses the alpha value when a category only appears exactly once.
        """
        self._fit_transform(X, Y, use_alpha_when_unique, train=True)

    def fit_transform(self, X: np.ndarray, Y: np.ndarray, use_alpha_when_unique: bool = True) -> np.ndarray:
        """Fits the encoder on the given data and returns the encoding of that same data.

        :param X: Input categorical data.
        :param Y: Target data (continuous or categorical).
        :param use_alpha_when_unique: If True, uses the alpha value when a category only appears exactly once.
        :return: Catboost encoded inputs based on the given Y.
        """
        return self._fit_transform(X, Y, use_alpha_when_unique, train=True)

    def transform(
        self, X: np.ndarray, Y: Optional[np.ndarray] = None, use_alpha_when_unique: bool = True
    ) -> np.ndarray:
        """Applies the Catboost encoder to the data.

        :param X: Input categorical data.
        :param Y: If target data is given, this data is used instead of the fitted data.
        :param use_alpha_when_unique: If True, uses the alpha value when a category only appears exactly once.
                                      It is only forwarded when Y is given; without Y it has no effect here.
        :return: Catboost encoded inputs. If Y is given, it uses the idea of giving each row a time index and only
                 use the previously observed data to estimate the encoding. If Y is not given, the previously fitted
                 average for each category is used. This can be seen as using the whole training data set as past
                 observations. Categories that were not seen during fitting are encoded with alpha.
        :raises ValueError: If the encoder has not been fitted or if X has more than one column.
        """
        if self._category_means is None:
            raise ValueError("Encoder must be fitted before calling transform")

        if Y is not None:
            return self._fit_transform(X, Y, use_alpha_when_unique, train=False)

        X = self._as_1d(X, "CatBoost encoder only supports one dimensional categorical data!")

        # Look up every distinct category once and scatter the result back to the rows via the inverse index,
        # instead of building one boolean mask per category.
        categories, inverse = np.unique(X, return_inverse=True)
        encodings = np.array(
            [self._category_means.get(category, self._alpha) for category in categories], dtype=float
        )

        return encodings[inverse.reshape(-1)]

    @staticmethod
    def _as_1d(values: np.ndarray, error_message: str) -> np.ndarray:
        """Returns the given data as a flat array, raising a ValueError if it has more than one column."""
        values = np.asarray(values)
        if values.ndim > 1 and values.shape[1] > 1:
            raise ValueError(error_message)

        return values.reshape(-1)

    def _fit_transform(self, X: np.ndarray, Y: np.ndarray, use_alpha_when_unique: bool, train: bool) -> np.ndarray:
        """Computes the ordered encoding of X based on Y and, if train is True, (re-)fits the encoder.

        :param X: Input categorical data.
        :param Y: Target data (continuous or categorical). Categorical targets are label encoded first.
        :param use_alpha_when_unique: If True, the fitted mean of a category that appears exactly once is alpha.
                                      This only affects the stored category means, i.e., only when train is True.
        :param train: If True, alpha and the per category means are (re-)estimated from the given data.
                      If False, the previously fitted alpha is used and the fitted state is left untouched.
        :return: The encoded values, one per row of X.
        :raises ValueError: If X or Y have more than one column or if X is not categorical.
        """
        # Function-scope import kept from the original code (presumably to avoid a circular import - TODO confirm).
        from dowhy.gcm.util.general import is_categorical

        X = self._as_1d(X, "CatBoost encoder only supports one dimensional categorical data!")
        Y = self._as_1d(Y, "CatBoost encoder only supports one dimensional target data!")

        if not is_categorical(X):
            raise ValueError("CatBoost encoder only supports categorical input data, i.e., strings!")

        if is_categorical(Y):
            # NOTE(review): a fresh LabelEncoder is fitted on every call, so the integer codes of a categorical Y
            # passed to transform() can differ from the ones used during fit() - confirm this is intended.
            Y = LabelEncoder().fit_transform(Y)

        if train:
            self._alpha = self._org_alpha
            if self._alpha is None:
                self._alpha = np.mean(Y)
            self._category_means = {}

        weighted_prior = self._alpha * self._p
        transformed_values = np.zeros(Y.shape[0])

        for category in np.unique(X):
            mask = X == category
            reduced_Y = Y[mask]

            running_sum = np.cumsum(reduced_Y)
            running_count = np.arange(1, reduced_Y.shape[0] + 1)

            # Eq. (1) in https://arxiv.org/pdf/1810.11363.pdf
            # Subtracting Y here since the cumulative sum includes the current element. The same reason we subtract 1
            # from the count.
            numerator = running_sum - reduced_Y + weighted_prior
            denominator = running_count + self._p - 1

            undefined = denominator == 0
            if np.any(undefined):
                # Without any previous observation and without prior weight (p == 0) the estimate is 0 / 0. Fall
                # back to alpha, which is the limit of the expression for p -> 0, instead of producing NaN.
                encoded = np.full(numerator.shape, self._alpha, dtype=float)
                np.divide(numerator, denominator, out=encoded, where=~undefined)
            else:
                encoded = numerator / denominator

            # Every row belongs to exactly one category, so each entry is written exactly once.
            transformed_values[mask] = encoded

            if train:
                total_count = running_count[-1]
                if use_alpha_when_unique and total_count == 1:
                    self._category_means[category] = self._alpha
                else:
                    self._category_means[category] = (running_sum[-1] + weighted_prior) / (total_count + self._p)

        return transformed_values