# Source code for auto_prep.preprocessing.feature_selecting

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from ..utils.abstract import Categorical, NonRequiredStep, Numerical
from ..utils.config import config
from ..utils.logging_config import setup_logger
from .abstract import FeatureImportanceSelector

# Module-wide logger; setup_logger comes from the project's logging config.
logger = setup_logger(__name__)


class CorrelationSelector(NonRequiredStep, Numerical):
    """
    Keeps the correlation_percent% (rounded up to a whole number of columns)
    of features that correlate most strongly with the target variable.

    Attributes:
        selected_columns (list): Columns chosen during ``fit``, ordered from
            most to least correlated with the target.
    """

    def __init__(self):
        """
        Reads the retention fraction from the global configuration.

        ``config.correlation_percent`` determines how many of the
        most-correlated features survive the transform.
        """
        self.k = config.correlation_percent
        self.selected_columns = []

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "CorrelationSelector":
        """
        Ranks every feature by absolute correlation with ``y`` and remembers
        the strongest ones.

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series, optional): The target variable.

        Returns:
            CorrelationSelector: The fitted transformer instance.
        """
        logger.start_operation(
            f"Fitting CorrelationSelector with top {self.k}% correlated features."
        )
        try:
            ranked = X.corrwith(y).abs().sort_values(ascending=False)
            # Keep at least one column even when the rounded count is zero.
            n_keep = max(1, round(np.ceil(len(ranked) * self.k)))
            self.selected_columns = ranked.head(n_keep).index.tolist()
        except Exception as e:
            logger.error(f"Error in CorrelationSelector fit: {e}")
            raise e
        finally:
            logger.end_operation()
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Restricts ``X`` to the columns chosen during ``fit``.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series, optional): Accepted for interface compatibility;
                not used here.

        Returns:
            pd.DataFrame: A copy of ``X`` containing only the selected columns.
        """
        logger.start_operation(
            f"Transforming data by selecting {len(self.selected_columns)} most correlated features."
        )
        try:
            narrowed = X[self.selected_columns].copy()
            logger.debug("Successfully transformed CorrelationSelector")
        except Exception as e:
            logger.error(f"Error in CorrelationSelector transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return narrowed

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """
        Convenience wrapper: ``fit`` followed by ``transform`` in one call.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series): The target variable.

        Returns:
            pd.DataFrame: The transformed data with selected features.
        """
        logger.start_operation(
            f"Fitting and transforming data with top {self.k}% correlated features."
        )
        try:
            result = self.fit(X, y).transform(X, y)
        except Exception as e:
            logger.error(f"Error in CorrelationSelector fit_transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return result

    def is_numerical(self) -> bool:
        """Marks this step as operating on numerical features."""
        return True

    def to_tex(self) -> dict:
        """
        Returns a short description of the transformer in dictionary format.
        """
        return {
            "desc": f"Selects the top {self.k*100}% (rounded to whole number) of features most correlated with the target variable. Number of features that were selected: {len(self.selected_columns)}",
            "params": {"correlation_percent": self.k},
        }
class FeatureImportanceClassSelector(FeatureImportanceSelector, Categorical):
    """
    Keeps the k% (rounded up to a whole number of columns) of features that a
    Random Forest classifier ranks as most important.

    Attributes:
        k (float): Percentage of top features to keep (inherited).
        selected_columns (list): Columns chosen during ``fit``, ordered from
            most to least important.
        feature_importances_ (np.ndarray): Importances learned during ``fit``.
    """

    def __init__(self):
        """
        Delegates the percentage setup to FeatureImportanceSelector and
        prepares the importance slot.
        """
        super().__init__()
        self.feature_importances_ = None

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "FeatureImportanceClassSelector":
        """
        Trains a Random Forest classifier and records the most important columns.

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series): The target variable.

        Returns:
            FeatureImportanceClassSelector: The fitted transformer instance.
        """
        logger.start_operation(
            f"Fitting FeatureImportanceClassificationSelector with top {self.k}% important features."
        )
        try:
            forest = RandomForestClassifier(random_state=42)
            forest.fit(X, y)
            self.feature_importances_ = forest.feature_importances_
            # k is a percentage (0-100); always keep at least one feature.
            n_keep = max(int(np.ceil(len(self.feature_importances_) * self.k / 100)), 1)
            top_idx = np.argsort(self.feature_importances_)[::-1][:n_keep]
            self.selected_columns = X.columns[top_idx].tolist()
        except Exception as e:
            logger.error(f"Error in FeatureImportanceClassificationSelector fit: {e}")
            raise e
        finally:
            logger.end_operation()
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Restricts ``X`` to the columns chosen during ``fit``.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series, optional): Accepted for interface compatibility;
                not used here.

        Returns:
            pd.DataFrame: A copy of ``X`` containing only the selected columns.
        """
        logger.start_operation(
            f"Transforming data by selecting {len(self.selected_columns)} most important features."
        )
        try:
            picked = X[self.selected_columns].copy()
        except Exception as e:
            logger.error(
                f"Error in FeatureImportanceClassificationSelector transform: {e}"
            )
            raise e
        finally:
            logger.end_operation()
        return picked

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """
        Convenience wrapper: ``fit`` followed by ``transform`` in one call.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series): The target variable.

        Returns:
            pd.DataFrame: The transformed data with selected features.
        """
        logger.start_operation(
            f"Fitting and transforming data with top {self.k}% important features."
        )
        try:
            result = self.fit(X, y).transform(X, y)
        except Exception as e:
            logger.error(
                f"Error in FeatureImportanceClassificationSelector fit_transform: {e}"
            )
            raise e
        finally:
            logger.end_operation()
        return result

    def to_tex(self) -> dict:
        """
        Returns a short description of the transformer in dictionary format.
        """
        return {
            "desc": f"Selects the top {self.k}% (rounded to whole number) of features most important according to Random Forest model for classification. Number of features that were selected: {len(self.selected_columns)}",
            "params": {"k": self.k},
        }
class FeatureImportanceRegressSelector(FeatureImportanceSelector, Numerical):
    """
    Transformer to select k% (rounded to whole number) of features that are
    most important according to a Random Forest model for regression.

    Attributes:
        k (float): The percentage of top features to keep based on their importance.
        selected_columns (list): List of selected columns based on feature importance.
        feature_importances_ (np.ndarray): Importances learned during ``fit``.
    """

    def __init__(self):
        """
        Initializes the transformer with a specified percentage of top
        important features to keep (set up by FeatureImportanceSelector).
        """
        super().__init__()
        self.feature_importances_ = None

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "FeatureImportanceRegressSelector":
        """
        Identifies the feature importances according to the Random Forest model.

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series): The (continuous) target variable.

        Returns:
            FeatureImportanceRegressSelector: The fitted transformer instance.
        """
        logger.start_operation(
            f"Fitting FeatureImportanceRegressionSelector with top {self.k}% important features."
        )
        try:
            # BUG FIX: this selector targets regression, so a regressor must be
            # used — the previous RandomForestClassifier cannot fit a
            # continuous target.
            model = RandomForestRegressor(random_state=42)
            model.fit(X, y)
            self.feature_importances_ = model.feature_importances_
            total_features = len(self.feature_importances_)
            # k is a percentage (0-100); always keep at least one feature.
            num_features_to_select = int(np.ceil(total_features * self.k / 100))
            if num_features_to_select == 0:
                num_features_to_select = 1
            indices = np.argsort(self.feature_importances_)[-num_features_to_select:][
                ::-1
            ]
            self.selected_columns = X.columns[indices].tolist()
        except Exception as e:
            logger.error(f"Error in FeatureImportanceRegressionSelector fit: {e}")
            raise e
        finally:
            logger.end_operation()
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Selects the top k% of features most important according to the
        Random Forest model.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series, optional): Accepted for interface compatibility;
                not used here.

        Returns:
            pd.DataFrame: The transformed data with only the selected top k%
            important features.
        """
        logger.start_operation(
            f"Transforming data by selecting {len(self.selected_columns)} most important features."
        )
        try:
            X_selected = X[self.selected_columns].copy()
        except Exception as e:
            logger.error(f"Error in FeatureImportanceRegressionSelector transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return X_selected

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """
        Fits and transforms the data by selecting the top k% most important
        features in one step.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series): The target variable.

        Returns:
            pd.DataFrame: The transformed data with selected features.
        """
        logger.start_operation(
            f"Fitting and transforming data with top {self.k}% important features."
        )
        try:
            self.fit(X, y)
            X = self.transform(X, y)
        except Exception as e:
            logger.error(
                f"Error in FeatureImportanceRegressionSelector fit_transform: {e}"
            )
            raise e
        finally:
            logger.end_operation()
        return X

    def to_tex(self) -> dict:
        """
        Returns a short description of the transformer in dictionary format.
        """
        return {
            "desc": f"Selects the top {self.k}% (rounded to whole number) of features most important according to Random Forest model for regression. Number of features that were selected: {len(self.selected_columns)}",
            "params": {"k": self.k},
        }