Source code for auto_prep.preprocessing.redundancy_filtering
import pandas as pd
from ..utils.abstract import Categorical, RequiredStep
from ..utils.logging_config import setup_logger
# Module-level logger, created through the project's setup_logger helper.
logger = setup_logger(__name__)
[docs]
class UniqueFilter(RequiredStep, Categorical):
    """
    Transformer that removes categorical columns with 100% unique values.

    A column is treated as categorical when it is non-numeric, i.e. when it is
    selected by ``X.select_dtypes(exclude="number")``. Such a column is marked
    for removal when its number of distinct values (as counted by
    ``Series.nunique()``) equals the number of rows seen during ``fit``.

    Attributes:
        dropped_columns (list): Names of the columns identified by ``fit``
            and removed by ``transform``.
    """

    def __init__(self):
        """
        Initializes the transformer with an empty list of dropped columns.
        """
        self.dropped_columns = []

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "UniqueFilter":
        """
        Identifies categorical columns with 100% unique values.

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series, optional): Unused; accepted for API compatibility.

        Returns:
            UniqueFilter: The fitted transformer instance.

        Raises:
            ValueError: If the columns cannot be inspected; the original
                exception is attached as the cause.
        """
        logger.start_operation("Fitting UniqueFilter")
        try:
            cat_cols = X.select_dtypes(exclude="number")
            n_rows = len(X)
            # An empty frame trivially satisfies ``nunique() == len(X)``
            # (0 == 0), which would flag every categorical column. With no
            # rows there is no evidence of uniqueness, so nothing is dropped.
            if n_rows:
                self.dropped_columns = [
                    col for col in cat_cols.columns if X[col].nunique() == n_rows
                ]
            else:
                self.dropped_columns = []
            logger.debug("Successfully fitted UniqueFilter")
        except Exception as e:
            logger.error(f"Failed to fit UniqueFilter : {e}", exc_info=True)
            raise ValueError(f"Failed to fit Uniquefilter {e}") from e
        finally:
            logger.end_operation()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Drops the categorical columns identified by the fit method.

        Columns listed in ``dropped_columns`` that are absent from ``X`` are
        ignored rather than treated as an error.

        Args:
            X (pd.DataFrame): The feature data.

        Returns:
            pd.DataFrame: The transformed data without dropped columns.

        Raises:
            ValueError: If dropping the columns fails; the original exception
                is attached as the cause.
        """
        logger.start_operation(
            f"Transforming data UniqueFilter by dropping {len(self.dropped_columns)} columns with unique values"
        )
        try:
            X_transformed = X.drop(columns=self.dropped_columns, errors="ignore")
            logger.debug("Successfully transformed UniqueFilter")
        except Exception as e:
            logger.error(f"Failed to transform UniqueFilter : {e}", exc_info=True)
            raise ValueError(f"Failed to transform Uniquefilter {e}") from e
        finally:
            logger.end_operation()
        return X_transformed

    def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Fits and transforms the data in one step.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series, optional): Forwarded to ``fit``, which does not use it.

        Returns:
            pd.DataFrame: The transformed data without dropped columns.

        Raises:
            ValueError: If either fitting or transforming fails; the original
                exception is attached as the cause.
        """
        logger.start_operation(
            "Fitting and transforming categorical data with 100% unique values"
        )
        try:
            X_transformed = self.fit(X, y).transform(X)
            logger.debug("Successfully fit_transformed UniqueFilter")
        except Exception as e:
            logger.error(f"Failed to fit_transform UniqueFilter : {e}", exc_info=True)
            raise ValueError(f"Failed to fit_transform Uniquefilter {e}") from e
        finally:
            logger.end_operation()
        return X_transformed

    def to_tex(self) -> dict:
        """
        Returns a description of the transformer in dictionary format.

        Returns:
            dict: A dictionary with a single ``"desc"`` key whose value
                describes the step and lists the dropped columns.
        """
        return {
            "desc": f"Removes categorical columns with 100% unique values. Dropped columns: {self.dropped_columns}",
        }