# Source code for auto_prep.preprocessing.abstract

from abc import ABC, abstractmethod

import pandas as pd

from ..utils.abstract import NonRequiredStep, Numerical
from ..utils.logging_config import setup_logger

logger = setup_logger(__name__)



# [docs]
class DimentionReducer(NonRequiredStep, Numerical, ABC):
    """
    Abstract interface for dimensionality reduction steps.

    Subclasses have to provide ``fit``, ``transform``, ``fit_transform``
    and ``to_tex``; this class only declares them.

    Attributes:
        reducer: Set to ``None`` on construction.
            NOTE(review): presumably holds the underlying reduction
            object once a subclass creates it - confirm in subclasses.
    """

    def __init__(self):
        super().__init__()
        # Nothing is attached yet; this class never assigns it again.
        self.reducer = None

    @abstractmethod
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "DimentionReducer":
        """
        Fit the reducer (abstract; must be overridden).

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series, optional): The target variable. Defaults to None.

        Returns:
            DimentionReducer: Per the annotation, a reducer instance.
        """

    @abstractmethod
    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Transform the data (abstract; must be overridden).

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series, optional): The target variable. Defaults to None.

        Returns:
            pd.DataFrame: The transformed data.
        """

    @abstractmethod
    def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Fit and transform in one call (abstract; must be overridden).

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series, optional): The target variable. Defaults to None.

        Returns:
            pd.DataFrame: The transformed data.
        """

    @abstractmethod
    def to_tex(self) -> dict:
        """
        Describe this step as a dict (abstract; must be overridden).

        NOTE(review): the name suggests the dict feeds LaTeX report
        generation - confirm against callers.

        Returns:
            dict: Subclass-defined description of the step.
        """





# [docs]
class FeatureImportanceSelector(NonRequiredStep):
    """
    Transformer to select k% (rounded to whole number) of features
    that are most important according to Random Forest model.

    NOTE(review): this class only validates and stores ``k``; its ``fit``
    and ``transform`` are placeholders that perform no selection.
    Presumably concrete selectors override them - confirm in subclasses.

    Attributes:
        k (float): The percentage of top features to keep based on their importance.
        selected_columns (list): List of selected columns based on feature importance.
            Starts out empty.
    """

    def __init__(self, k: float = 10.0):
        """
        Initializes the transformer with the percentage of top important features to keep.

        Args:
            k (float): The percentage of features to retain based on their importance.
                Must lie in the closed interval [0, 100].

        Raises:
            ValueError: If ``k`` is outside [0, 100].
        """
        # Run the base-class initialisation so state set up by the parent
        # step class is not silently skipped.
        super().__init__()
        if not (0 <= k <= 100):
            raise ValueError("k must be between 0 and 100.")
        self.k = k
        self.selected_columns = []

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "FeatureImportanceSelector":
        """
        Identifies the top k% (rounded to whole value) of features most important according to the model.

        The implementation here is a placeholder: it inspects nothing and
        leaves ``selected_columns`` untouched.

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series): The target variable.

        Returns:
            FeatureImportanceSelector: The fitted transformer instance.
        """
        # Honour the documented contract (and allow call chaining) instead of
        # implicitly returning None.
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Selects the top k% of features most important according to the model.

        The implementation here is a placeholder: it performs no selection
        and returns ``None``.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series, optional): The target variable (to append to the result).

        Returns:
            pd.DataFrame: The transformed data with only the selected top k% important features.
        """
        pass

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """
        Fits and transforms the data by selecting the top k% most important features. Performs fit and transform in one step.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series): The target variable.

        Returns:
            pd.DataFrame: The transformed data with selected features.
        """
        self.fit(X, y)
        # Only X is forwarded: y is used for fitting and is not passed on to
        # transform.
        return self.transform(X)