Source code for auto_prep.preprocessing.utils

import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_is_fitted, column_or_1d

from ..utils.logging_config import setup_logger

# Module-level logger; TolerantLabelEncoder.transform uses it to warn about
# labels that were not seen during fitting.
logger = setup_logger(__name__)


class TolerantLabelEncoder(LabelEncoder):
    """Label encoder that tolerates labels unseen during fitting.

    Extends :class:`sklearn.preprocessing.LabelEncoder` so that values that
    were not present when the encoder was fitted are mapped to a sentinel
    instead of raising, unless ``ignore_unknown`` is disabled.

    Parameters
    ----------
    ignore_unknown : bool, default=True
        When True, unseen values are replaced by the sentinel values below.
        When False, unseen values raise ``ValueError``.
    unknown_original_value : object, default="unknown"
        Label written by :meth:`inverse_transform` for codes that do not
        correspond to any fitted class.
    unknown_encoded_value : int, default=-1
        Code written by :meth:`transform` for labels that were not seen
        during fitting.
    """

    def __init__(
        self,
        ignore_unknown=True,
        unknown_original_value="unknown",
        unknown_encoded_value=-1,
    ):
        self.ignore_unknown = ignore_unknown
        self.unknown_original_value = unknown_original_value
        self.unknown_encoded_value = unknown_encoded_value

    def transform(self, y, column="y"):
        """Encode labels as integer codes, tolerating unseen labels.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Labels to encode.
        column : str, default="y"
            Name used for the data in warning and error messages.

        Returns
        -------
        numpy.ndarray
            Integer codes; unseen labels are set to
            ``unknown_encoded_value``.

        Raises
        ------
        ValueError
            If ``ignore_unknown`` is False and ``y`` contains labels that
            are not in ``classes_``.
        """
        check_is_fitted(self, "classes_")
        y = column_or_1d(y, warn=True)

        known = np.isin(y, self.classes_)
        if not np.all(known):
            unseen = np.setdiff1d(y, self.classes_)
            if not self.ignore_unknown:
                raise ValueError(f"{column} contains new labels: {unseen}")
            logger.warning(f"{column} contains new labels: {len(unseen)}")

        # searchsorted yields a position for every value, including unseen
        # ones; those positions are meaningless and are overwritten below.
        y_transformed = np.searchsorted(self.classes_, y)
        y_transformed[~known] = self.unknown_encoded_value
        return y_transformed

    def inverse_transform(self, y):
        """Map integer codes back to the original labels.

        Parameters
        ----------
        y : array-like
            Integer codes to decode.

        Returns
        -------
        numpy.ndarray
            Object array with the same shape as ``y``; codes outside
            ``0 .. len(classes_) - 1`` become ``unknown_original_value``.

        Raises
        ------
        ValueError
            If ``ignore_unknown`` is False and ``y`` contains codes that do
            not correspond to a fitted class.
        """
        check_is_fitted(self, "classes_")
        y = np.asarray(y)

        labels = np.arange(len(self.classes_))
        known = np.isin(y, labels)
        if not self.ignore_unknown and not np.all(known):
            # Compare against the valid codes, not the original class labels.
            raise ValueError(
                "y contains new labels: %s" % str(np.setdiff1d(y, labels))
            )

        # Start from the sentinel and fill in only the valid codes, so that
        # out-of-range codes never reach the index lookup (IndexError).
        y_transformed = np.full(
            y.shape, self.unknown_original_value, dtype=object
        )
        # Known codes are equal to integers, so the cast is lossless; it lets
        # float- or object-typed input be used as an index.
        y_transformed[known] = self.classes_[y[known].astype(np.intp)]
        return y_transformed