Source code for dowhy.gcm.util.catboost_encoder
from typing import Optional
import numpy as np
from sklearn.preprocessing import LabelEncoder
class CatBoostEncoder:
    """Implements the proposed method from
    "CatBoost: gradient boosting with categorical features support", Dorogush et al. (2018).

    The Catboost encoder is a target encoder for categorical features. In this implementation we follow Eq. (1)
    in https://arxiv.org/pdf/1810.11363.pdf: the i-th occurrence of a category (in order of appearance) is encoded
    as ``(sum of the targets of its previous occurrences + alpha * p) / (number of previous occurrences + p)``.
    """

    def __init__(self, p: float = 1, alpha: Optional[float] = None):
        """See Eq. (1) in https://arxiv.org/pdf/1810.11363.pdf

        :param p: The p parameter in the equation. This weights the impact of the given alpha.
        :param alpha: Alpha parameter in the equation. If None is given, the global mean is used as suggested in
                      "A preprocessing scheme for high-cardinality categorical attributes in classification and
                      prediction problems", Micci-Barreca (2001)
        """
        self._p = p
        self._org_alpha = alpha
        # Effective alpha used in Eq. (1). It is resolved while fitting (either the user supplied alpha or the mean
        # of the training targets). Initialized here so that an unfitted encoder can be detected explicitly instead
        # of failing with an AttributeError.
        self._alpha = None
        # Maps each category seen while fitting to its smoothed target mean.
        self._category_means = None

    def fit(self, X: np.ndarray, Y: np.ndarray, use_alpha_when_unique: bool = True) -> None:
        """Fits the Catboost encoder following https://arxiv.org/pdf/1810.11363.pdf Eq. (1).

        :param X: Input categorical data.
        :param Y: Target data (continuous or categorical)
        :param use_alpha_when_unique: If True, uses the alpha value when a category only appears exactly once.
        """
        self._fit_transform(X, Y, use_alpha_when_unique, train=True)

    def _fit_transform(self, X: np.ndarray, Y: np.ndarray, use_alpha_when_unique: bool, train: bool) -> np.ndarray:
        """Encodes X based on the targets Y following Eq. (1) and, if requested, (re-)fits the encoder.

        :param X: Input categorical data with one dimension (or a single column).
        :param Y: Target data with one dimension (or a single column). Categorical targets are label encoded first.
        :param use_alpha_when_unique: If True, the stored mean of a category that appears exactly once is alpha.
                                      Only has an effect if train is True.
        :param train: If True, alpha and the per-category means are (re-)estimated from the given data and stored.
                      If False, the alpha of the previous fit is used and the encoder is not modified.
        :return: A one dimensional array with the encoded value of each sample in X.
        :raises ValueError: If X or Y have more than one column, if X is not categorical, or if train is False while
                            the encoder has not been fitted yet.
        """
        # Fail fast with a clear message. Without a previous fit there is no alpha available for Eq. (1).
        if not train and getattr(self, "_alpha", None) is None:
            raise ValueError("CatBoost encoder must be fitted first before it can be applied with train=False!")

        from dowhy.gcm.util.general import is_categorical

        if X.ndim > 1 and X.shape[1] > 1:
            raise ValueError("CatBoost encoder only supports one dimensional categorical data!")

        if Y.ndim > 1 and Y.shape[1] > 1:
            raise ValueError("CatBoost encoder only supports one dimensional target data!")

        X, Y = X.reshape(-1), Y.reshape(-1)

        if not is_categorical(X):
            raise ValueError("CatBoost encoder only supports categorical input data, i.e., strings!")

        if is_categorical(Y):
            # Categorical targets are mapped to integer labels so that sums and means are defined.
            Y = LabelEncoder().fit_transform(Y)

        if train:
            self._alpha = self._org_alpha
            if self._alpha is None:
                self._alpha = np.mean(Y)
            self._category_means = {}

        transformed_values = np.zeros(Y.shape[0])

        for category in np.unique(X):
            mask = X == category
            reduced_Y = Y[mask]

            category_cumulative_sum = np.cumsum(reduced_Y)
            # 1, 2, ..., k for the k occurrences of this category (in order of appearance).
            category_cumulative_count = np.arange(1, reduced_Y.shape[0] + 1)

            # Eq. (1) in https://arxiv.org/pdf/1810.11363.pdf
            # Subtracting Y here since the cumulative sum includes the current element. The same reason we subtract 1
            # from the count. Each sample belongs to exactly one category, so every entry is written exactly once.
            transformed_values[mask] = (category_cumulative_sum - reduced_Y + self._alpha * self._p) / (
                category_cumulative_count + self._p - 1
            )

            if train:
                if use_alpha_when_unique and category_cumulative_count[-1] == 1:
                    self._category_means[category] = self._alpha
                else:
                    # Smoothed mean over all occurrences of the category.
                    self._category_means[category] = (category_cumulative_sum[-1] + self._alpha * self._p) / (
                        category_cumulative_count[-1] + self._p
                    )

        return transformed_values