Source code for auto_prep.preprocessing.imputing

import pandas as pd
from sklearn.impute import SimpleImputer

from ..utils.abstract import NumericalCategorical, RequiredStep
from ..utils.logging_config import setup_logger

logger = setup_logger(__name__)


class NAImputer(RequiredStep, NumericalCategorical):
    """
    Drops sparsely populated columns and imputes the remaining missing values.

    Columns in which more than 50% of the values are missing are removed.
    Remaining numeric columns are imputed with the median and all other
    columns with the most frequent value. The imputation statistics are
    learned in :meth:`fit` and re-used by :meth:`transform`, so data passed
    to :meth:`transform` is filled with values learned from the fitting data.

    Attributes:
        numeric_features (list): Names of the numeric columns seen in ``fit``.
        categorical_features (list): Names of the non-numeric columns seen
            in ``fit``.
        cols_to_remove (list): Names of the columns dropped because more
            than 50% of their values were missing in ``fit``.
        categorical_imputer (SimpleImputer): Imputer using the
            ``most_frequent`` strategy for non-numeric columns.
        numerical_imputer (SimpleImputer): Imputer using the ``median``
            strategy for numeric columns.
    """

    def __init__(self):
        self.numeric_features = []
        self.categorical_features = []
        self.cols_to_remove = []
        self.categorical_imputer = SimpleImputer(strategy="most_frequent")
        self.numerical_imputer = SimpleImputer(strategy="median")
        # Columns each imputer is currently fitted on (empty until fitted).
        self._numeric_fit_columns = []
        self._categorical_fit_columns = []

    @staticmethod
    def _fit_imputer_on(imputer, X: pd.DataFrame, columns: list) -> list:
        """Fit ``imputer`` on ``X[columns]``; return the columns it was fitted on.

        Nothing is fitted (and an empty list is returned) when there are no
        columns to fit or ``X`` is empty.
        """
        if not columns or X.empty:
            return []
        imputer.fit(X[columns])
        return list(columns)

    @staticmethod
    def _apply_imputer(
        imputer, X: pd.DataFrame, features: list, fitted_columns: list
    ) -> tuple:
        """Impute, in place, the ``features`` that are present in ``X``.

        Args:
            imputer: The imputer to apply.
            X (pd.DataFrame): Frame to modify in place.
            features (list): Candidate column names recorded during ``fit``.
            fitted_columns (list): Columns ``imputer`` is currently fitted on.

        Returns:
            tuple: ``(available, fitted_columns)`` - the columns that were
            imputed and the columns the imputer is fitted on afterwards.
        """
        available = [col for col in features if col in X.columns]
        if not available:
            return available, fitted_columns
        if available != fitted_columns:
            # The imputer has no statistics for exactly this column set
            # (fit was skipped or columns are absent), so fall back to
            # fitting on the data at hand instead of failing.
            logger.debug(
                f"Imputer not fitted on columns {available}; "
                "fitting on the data being transformed."
            )
            imputer.fit(X[available])
            fitted_columns = available
        X[available] = imputer.transform(X[available])
        return available, fitted_columns

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "NAImputer":
        """
        Learns which columns to drop and the values used for imputation.

        Columns with more than 50% missing values are recorded for removal;
        the input frame itself is not modified. The imputers are fitted on
        the remaining columns.

        Args:
            X (pd.DataFrame): The input data with missing values.
            y (pd.Series, optional): Ignored.

        Returns:
            NAImputer: The fitted imputer instance.
        """
        logger.start_operation(
            f"Fitting NAImputer to data with {X.shape[0]} rows and {X.shape[1]} columns."
        )
        try:
            missing_threshold = 0.5
            cols_to_remove = [
                col for col in X.columns if X[col].isnull().mean() > missing_threshold
            ]
            logger.debug(
                f"Columns to be removed due to >50% missing values: {cols_to_remove}"
            )
            # Update internal state but do not modify input DataFrame
            self.cols_to_remove = cols_to_remove

            self.numeric_features = X.select_dtypes(include="number").columns.tolist()
            logger.debug(f"Identified numeric features: {self.numeric_features}")

            self.categorical_features = X.select_dtypes(
                exclude="number"
            ).columns.tolist()
            logger.debug(
                f"Identified categorical features: {self.categorical_features}"
            )

            # Learn the imputation statistics here so that transform() can
            # apply them to other data without re-fitting.
            removed = set(cols_to_remove)
            self._numeric_fit_columns = self._fit_imputer_on(
                self.numerical_imputer,
                X,
                [col for col in self.numeric_features if col not in removed],
            )
            self._categorical_fit_columns = self._fit_imputer_on(
                self.categorical_imputer,
                X,
                [col for col in self.categorical_features if col not in removed],
            )
        except Exception as e:
            logger.error(f"Error in NAImputer fit: {e}")
            raise
        finally:
            logger.end_operation()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Removes the columns identified in ``fit`` and imputes missing values.

        The input frame is not modified; a new frame is returned.

        Args:
            X (pd.DataFrame): The input data to transform.

        Returns:
            pd.DataFrame: The transformed data.
        """
        logger.start_operation("Transforming data.")
        try:
            # drop() returns a new frame, so the caller's data stays intact.
            # Columns already absent are ignored, matching how the feature
            # lists below tolerate missing columns.
            X = X.drop(columns=self.cols_to_remove, errors="ignore")

            # Impute missing values in numeric columns
            available_numeric_features, self._numeric_fit_columns = (
                self._apply_imputer(
                    self.numerical_imputer,
                    X,
                    self.numeric_features,
                    getattr(self, "_numeric_fit_columns", []),
                )
            )

            # Impute missing values in categorical columns
            available_categorical_features, self._categorical_fit_columns = (
                self._apply_imputer(
                    self.categorical_imputer,
                    X,
                    self.categorical_features,
                    getattr(self, "_categorical_fit_columns", []),
                )
            )

            # Safety net for any nulls the imputers left behind.
            for col in available_categorical_features:
                X[col] = X[col].fillna("Missing")
            for col in available_numeric_features:
                X[col] = X[col].fillna(X[col].median())
        except Exception as e:
            logger.error(f"Error in NAImputer transform: {e}")
            raise
        finally:
            logger.end_operation()
        return X

    def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Fits and transforms the input data by imputing missing values.

        Args:
            X (pd.DataFrame): The input data.
            y (pd.Series, optional): Passed through to ``fit``, which
                ignores it.

        Returns:
            pd.DataFrame: The transformed data with missing values imputed.
        """
        logger.start_operation("Fitting and transforming data.")
        try:
            self.fit(X, y)
            X = self.transform(X)
        except Exception as e:
            logger.error(f"Error in NAImputer fit_transform: {e}")
            raise
        finally:
            logger.end_operation()
        return X

    def to_tex(self) -> dict:
        """
        Returns a description of the transformer in dictionary format.

        Returns:
            dict: A ``desc`` string and a ``params`` mapping holding the
            strategy of the numeric and of the categorical imputer.
        """
        return {
            "desc": "Imputes missing data.",
            "params": {
                "numeric_imputer": self.numerical_imputer.strategy,
                "categorical_imputer": self.categorical_imputer.strategy,
            },
        }