# Source code for auto_prep.preprocessing.feature_selecting

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from ..utils.abstract import Categorical, NonRequiredStep, Numerical
from ..utils.config import config
from ..utils.logging_config import setup_logger
from .abstract import FeatureImportanceSelector

# Module-wide logger; setup_logger comes from the project's logging config.
logger = setup_logger(__name__)


class CorrelationSelector(NonRequiredStep, Numerical):
    """
    Keeps the correlation_percent% (rounded up to a whole number of columns)
    of features that correlate most strongly with the target variable.

    Attributes:
        selected_columns (list): Columns chosen during ``fit``, ordered from
            most to least correlated with the target.
    """

    def __init__(self):
        """
        Reads the retention fraction from the global configuration.

        ``config.correlation_percent`` determines how many of the
        most-correlated features survive the transform.
        """
        self.k = config.correlation_percent
        self.selected_columns = []

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "CorrelationSelector":
        """
        Ranks every feature by absolute correlation with ``y`` and remembers
        the strongest ones.

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series, optional): The target variable.

        Returns:
            CorrelationSelector: The fitted transformer instance.
        """
        logger.start_operation(
            f"Fitting CorrelationSelector with top {self.k}% correlated features."
        )
        try:
            ranked = X.corrwith(y).abs().sort_values(ascending=False)
            # Keep at least one column even when the rounded count is zero.
            n_keep = max(1, round(np.ceil(len(ranked) * self.k)))
            self.selected_columns = ranked.head(n_keep).index.tolist()
        except Exception as e:
            logger.error(f"Error in CorrelationSelector fit: {e}")
            raise e
        finally:
            logger.end_operation()
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Restricts ``X`` to the columns chosen during ``fit``.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series, optional): Accepted for interface compatibility;
                not used here.

        Returns:
            pd.DataFrame: A copy of ``X`` containing only the selected columns.
        """
        logger.start_operation(
            f"Transforming data by selecting {len(self.selected_columns)} most correlated features."
        )
        try:
            narrowed = X[self.selected_columns].copy()
            logger.debug("Successfully transformed CorrelationSelector")
        except Exception as e:
            logger.error(f"Error in CorrelationSelector transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return narrowed

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """
        Convenience wrapper: ``fit`` followed by ``transform`` in one call.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series): The target variable.

        Returns:
            pd.DataFrame: The transformed data with selected features.
        """
        logger.start_operation(
            f"Fitting and transforming data with top {self.k}% correlated features."
        )
        try:
            result = self.fit(X, y).transform(X, y)
        except Exception as e:
            logger.error(f"Error in CorrelationSelector fit_transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return result

    def is_numerical(self) -> bool:
        """Marks this step as operating on numerical features."""
        return True

    def to_tex(self) -> dict:
        """
        Returns a short description of the transformer in dictionary format.
        """
        return {
            "desc": f"Selects the top {self.k*100}% (rounded to whole number) of features most correlated with the target variable. Number of features that were selected: {len(self.selected_columns)}",
            "params": {"correlation_percent": self.k},
        }
class FeatureImportanceClassSelector(FeatureImportanceSelector, Categorical):
    """
    Keeps the k% (rounded up to a whole number of columns) of features that a
    Random Forest classifier ranks as most important.

    Attributes:
        k (float): Percentage of top features to keep (inherited).
        selected_columns (list): Columns chosen during ``fit``, ordered from
            most to least important.
        feature_importances_ (np.ndarray): Importances learned during ``fit``.
    """

    def __init__(self):
        """
        Delegates the percentage setup to FeatureImportanceSelector and
        prepares the importance slot.
        """
        super().__init__()
        self.feature_importances_ = None

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "FeatureImportanceClassSelector":
        """
        Trains a Random Forest classifier and records the most important columns.

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series): The target variable.

        Returns:
            FeatureImportanceClassSelector: The fitted transformer instance.
        """
        logger.start_operation(
            f"Fitting FeatureImportanceClassificationSelector with top {self.k}% important features."
        )
        try:
            forest = RandomForestClassifier(random_state=42)
            forest.fit(X, y)
            self.feature_importances_ = forest.feature_importances_
            # k is a percentage (0-100); always keep at least one feature.
            n_keep = max(int(np.ceil(len(self.feature_importances_) * self.k / 100)), 1)
            top_idx = np.argsort(self.feature_importances_)[::-1][:n_keep]
            self.selected_columns = X.columns[top_idx].tolist()
        except Exception as e:
            logger.error(f"Error in FeatureImportanceClassificationSelector fit: {e}")
            raise e
        finally:
            logger.end_operation()
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Restricts ``X`` to the columns chosen during ``fit``.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series, optional): Accepted for interface compatibility;
                not used here.

        Returns:
            pd.DataFrame: A copy of ``X`` containing only the selected columns.
        """
        logger.start_operation(
            f"Transforming data by selecting {len(self.selected_columns)} most important features."
        )
        try:
            picked = X[self.selected_columns].copy()
        except Exception as e:
            logger.error(
                f"Error in FeatureImportanceClassificationSelector transform: {e}"
            )
            raise e
        finally:
            logger.end_operation()
        return picked

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """
        Convenience wrapper: ``fit`` followed by ``transform`` in one call.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series): The target variable.

        Returns:
            pd.DataFrame: The transformed data with selected features.
        """
        logger.start_operation(
            f"Fitting and transforming data with top {self.k}% important features."
        )
        try:
            result = self.fit(X, y).transform(X, y)
        except Exception as e:
            logger.error(
                f"Error in FeatureImportanceClassificationSelector fit_transform: {e}"
            )
            raise e
        finally:
            logger.end_operation()
        return result

    def to_tex(self) -> dict:
        """
        Returns a short description of the transformer in dictionary format.
        """
        return {
            "desc": f"Selects the top {self.k}% (rounded to whole number) of features most important according to Random Forest model for classification. Number of features that were selected: {len(self.selected_columns)}",
            "params": {"k": self.k},
        }
class FeatureImportanceRegressSelector(FeatureImportanceSelector, Numerical):
    """
    Transformer to select k% (rounded to whole number) of features that are
    most important according to a Random Forest model for regression.

    Attributes:
        k (float): The percentage of top features to keep based on their importance.
        selected_columns (list): List of selected columns based on feature importance.
        feature_importances_ (np.ndarray): Importances learned during ``fit``.
    """

    def __init__(self):
        """
        Initializes the transformer with a specified percentage of top
        important features to keep (set up by FeatureImportanceSelector).
        """
        super().__init__()
        self.feature_importances_ = None

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "FeatureImportanceRegressSelector":
        """
        Identifies the feature importances according to the Random Forest model.

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series): The (continuous) target variable.

        Returns:
            FeatureImportanceRegressSelector: The fitted transformer instance.
        """
        logger.start_operation(
            f"Fitting FeatureImportanceRegressionSelector with top {self.k}% important features."
        )
        try:
            # BUG FIX: this selector targets regression, so a regressor must be
            # used — the previous RandomForestClassifier cannot fit a
            # continuous target.
            model = RandomForestRegressor(random_state=42)
            model.fit(X, y)
            self.feature_importances_ = model.feature_importances_
            total_features = len(self.feature_importances_)
            # k is a percentage (0-100); always keep at least one feature.
            num_features_to_select = int(np.ceil(total_features * self.k / 100))
            if num_features_to_select == 0:
                num_features_to_select = 1
            indices = np.argsort(self.feature_importances_)[-num_features_to_select:][
                ::-1
            ]
            self.selected_columns = X.columns[indices].tolist()
        except Exception as e:
            logger.error(f"Error in FeatureImportanceRegressionSelector fit: {e}")
            raise e
        finally:
            logger.end_operation()
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Selects the top k% of features most important according to the
        Random Forest model.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series, optional): Accepted for interface compatibility;
                not used here.

        Returns:
            pd.DataFrame: The transformed data with only the selected top k%
            important features.
        """
        logger.start_operation(
            f"Transforming data by selecting {len(self.selected_columns)} most important features."
        )
        try:
            X_selected = X[self.selected_columns].copy()
        except Exception as e:
            logger.error(f"Error in FeatureImportanceRegressionSelector transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return X_selected

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """
        Fits and transforms the data by selecting the top k% most important
        features in one step.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series): The target variable.

        Returns:
            pd.DataFrame: The transformed data with selected features.
        """
        logger.start_operation(
            f"Fitting and transforming data with top {self.k}% important features."
        )
        try:
            self.fit(X, y)
            X = self.transform(X, y)
        except Exception as e:
            logger.error(
                f"Error in FeatureImportanceRegressionSelector fit_transform: {e}"
            )
            raise e
        finally:
            logger.end_operation()
        return X

    def to_tex(self) -> dict:
        """
        Returns a short description of the transformer in dictionary format.
        """
        return {
            "desc": f"Selects the top {self.k}% (rounded to whole number) of features most important according to Random Forest model for regression. Number of features that were selected: {len(self.selected_columns)}",
            "params": {"k": self.k},
        }