Source code for auto_prep.preprocessing.encoding

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

from ..utils.abstract import Categorical, RequiredStep
from ..utils.logging_config import setup_logger
from .utils import TolerantLabelEncoder

# Module-level logger; ColumnEncoder uses it to mark the start/end of each
# operation and to emit debug and error messages.
logger = setup_logger(__name__)


class ColumnEncoder(RequiredStep, Categorical):
    """
    Encoder for categorical (non-numeric) features.

    Every column selected by ``select_dtypes(exclude="number")`` gets its own
    encoder, chosen by the number of unique values seen during ``fit``:

    * fewer than ``ONE_HOT_MAX_UNIQUE`` unique values -> ``OneHotEncoder``
      (the column is replaced by one ``<column>_<category>`` column per
      category);
    * otherwise -> ``TolerantLabelEncoder`` (the column is encoded in place).

    Attributes:
        ONE_HOT_MAX_UNIQUE (int): Class-level threshold; columns with fewer
            unique values than this are one-hot encoded.
        encoders (dict): Fitted encoder for each encoded column, keyed by
            column name.
        columns (list): Names of the columns that were encoded, in the order
            they were fitted.
    """

    # Exclusive upper bound on the unique-value count for one-hot encoding.
    ONE_HOT_MAX_UNIQUE = 5

    def __init__(self):
        """
        Initializes the encoder with no fitted encoders and no columns.
        """
        self.encoders = {}  # column name -> fitted encoder
        self.columns = []  # names of the encoded columns

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "ColumnEncoder":
        """
        Fits one encoder per non-numeric column of ``X``.

        Any state left over from a previous ``fit`` call is discarded, so the
        same instance can safely be refitted on different data.

        Args:
            X (pd.DataFrame): The feature data to fit the encoders to.
            y (pd.Series, optional): Accepted for API compatibility; unused.

        Returns:
            ColumnEncoder: The fitted encoder instance (``self``).

        Raises:
            Exception: Any error raised while fitting is logged and re-raised
                unchanged.
        """
        logger.start_operation(
            f"Fitting ColumnEncoder to data with {X.shape[0]} rows and {X.shape[1]} columns."
        )
        try:
            # Build the new state locally and commit it only on success, so a
            # refit never accumulates duplicate columns or stale encoders.
            encoders = {}
            columns = []

            categorical_columns = X.select_dtypes(exclude="number").columns.tolist()
            for column in categorical_columns:
                unique_vals = len(X[column].unique())

                if unique_vals < self.ONE_HOT_MAX_UNIQUE:
                    logger.debug(
                        f"Column {column} has {unique_vals} unique values, using OneHotEncoder."
                    )
                    encoder = OneHotEncoder(sparse_output=False)
                    # OneHotEncoder expects 2-D input, hence the double brackets.
                    encoder.fit(X[[column]])
                else:
                    logger.debug(
                        f"Column {column} has {unique_vals} unique values, using TolerantLabelEncoder."
                    )
                    encoder = TolerantLabelEncoder()
                    encoder.fit(X[column])

                encoders[column] = encoder
                columns.append(column)

            self.encoders = encoders
            self.columns = columns
        except Exception as e:
            logger.error(f"Error in ColumnEncoder fit: {e}")
            raise
        finally:
            logger.end_operation()

        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Transforms the feature data using the fitted encoders.

        The input frame is not modified; a transformed copy is returned.

        Args:
            X (pd.DataFrame): The feature data to transform.
            y (pd.Series, optional): Accepted for API compatibility; unused.

        Returns:
            pd.DataFrame: A new frame in which one-hot encoded columns are
            replaced by ``<column>_<category>`` columns (appended at the end)
            and label-encoded columns are replaced in place.

        Raises:
            Exception: If encoding a column fails; the message names the
                column and the original error is chained as the cause.
        """
        logger.start_operation(
            f"Transforming data with {X.shape[0]} rows and {X.shape[1]} columns."
        )
        try:
            # Work on a copy so the label-encoding assignment below never
            # writes into the caller's DataFrame.
            X = X.copy()

            for column in self.columns:
                try:
                    encoder = self.encoders[column]
                    if isinstance(encoder, OneHotEncoder):
                        logger.debug(f"Applying OneHotEncoder to column {column}.")
                        encoded_data = encoder.transform(X[[column]])
                        ohe_columns = [
                            f"{column}_{cat}" for cat in encoder.categories_[0]
                        ]
                        encoded_df = pd.DataFrame(
                            encoded_data, columns=ohe_columns, index=X.index
                        )
                        # Replace the original column with its indicator columns.
                        X = pd.concat([X.drop(column, axis=1), encoded_df], axis=1)
                    else:
                        logger.debug(
                            f"Applying TolerantLabelEncoder to column {column}."
                        )
                        X[column] = encoder.transform(X[column], column=column)
                except Exception as e:
                    raise Exception(f"Error in transforming column {column}") from e
        except Exception as e:
            logger.error(f"Error in ColumnEncoder transform: {e}")
            raise
        finally:
            logger.end_operation()

        return X

    def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Fits the encoders on ``X`` and returns the transformed data.

        Equivalent to ``fit(X, y)`` followed by ``transform(X, y)``.

        Args:
            X (pd.DataFrame): The feature data to fit on and transform.
            y (pd.Series, optional): Forwarded to ``fit`` and ``transform``,
                which accept it for API compatibility.

        Returns:
            pd.DataFrame: The transformed feature data, with encoded columns.
        """
        logger.start_operation("Fitting and transforming data.")
        try:
            return self.fit(X, y).transform(X, y)
        finally:
            # Close the operation even when fit/transform raises, mirroring
            # the try/finally used by fit and transform themselves.
            logger.end_operation()

    def to_tex(self) -> dict:
        """
        Returns the description of this step.

        Returns:
            dict: A dictionary with a single ``"desc"`` entry describing the
            encoding strategy.
        """
        threshold = self.ONE_HOT_MAX_UNIQUE
        return {
            "desc": (
                "Encodes categorical columns using OneHotEncoder "
                f"(for columns with <{threshold} unique values) or "
                f"TolerantLabelEncoder (for columns with >={threshold} unique values)."
            ),
        }