Source code for auto_prep.preprocessing.dimention_reducing

import numpy as np
import pandas as pd
import umap.umap_ as umap
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor

from ..utils.config import config
from ..utils.logging_config import setup_logger
from .abstract import DimentionReducer

logger = setup_logger(__name__)


class PCADimentionReducer(DimentionReducer):
    """
    Applies PCA with automatic selection of the number of components
    to preserve 95% of the variance.
    """

    def __init__(self):
        """
        Initializes the PCA object with additional parameters.
        """
        super().__init__()
        self.reducer = None  # PCA will be initialized in fit
        self.n_components = None  # Will be determined in fit
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "PCADimentionReducer":
        """
        Fits PCA to the data, determining the number of components
        needed to preserve 95% of the variance.

        Args:
            X (pd.DataFrame or np.ndarray): Input data.
            y (optional): Target values (ignored).

        Returns:
            PCADimentionReducer: The fitted transformer.
        """
        logger.start_operation(
            f"Fitting PCADimentionReducer to data with {X.shape[0]} rows "
            f"and {X.shape[1]} columns."
        )
        try:
            # Fit PCA on all components to determine how many are needed
            temp_pca = PCA()
            temp_pca.fit(X)
            cumulative_variance = np.cumsum(temp_pca.explained_variance_ratio_)
            self.n_components = np.argmax(cumulative_variance >= 0.95) + 1

            # Re-fit PCA with the determined number of components
            self.reducer = PCA(n_components=self.n_components)
            self.reducer.fit(X)
            logger.debug(f"Number of components selected: {self.n_components}")
        except Exception as e:
            logger.error(f"Error in PCADimentionReducer fit: {e}")
            raise e
        finally:
            logger.end_operation()
        return self
    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Transforms the input data using the fitted PCA.

        Args:
            X (pd.DataFrame or np.ndarray): Input data.
            y (optional): Target values (ignored).

        Returns:
            pd.DataFrame: Transformed data.
        """
        logger.start_operation(
            f"Transforming data with {X.shape[0]} rows and {X.shape[1]} columns."
        )
        try:
            X = pd.DataFrame(self.reducer.transform(X))
        except Exception as e:
            logger.error(f"Error in PCADimentionReducer transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return X
    def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Fits the transformer to the data and then transforms it.

        Args:
            X (pd.DataFrame or np.ndarray): Input data.
            y (optional): Target values (ignored).

        Returns:
            pd.DataFrame: Transformed data.
        """
        logger.start_operation(
            "Fitting and transforming data using PCADimentionReducer."
        )
        try:
            self.fit(X, y)
        except Exception as e:
            logger.error(f"Error in PCADimentionReducer fit_transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return self.transform(X)
[docs] def to_tex(self) -> dict: return { "desc": "Combines PCA with automatic selection of the number of components to preserve 95% of the variance.", "params": {"n_components": self.n_components}, }
class VIFDimentionReducer(DimentionReducer):
    """
    Removes columns with a high variance inflation factor (VIF > 10).
    """

    def __init__(self):
        """
        Initializes the VIFDimentionReducer.
        """
        super().__init__()
        self.multicollinear_columns = []
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "VIFDimentionReducer":
        """
        Fits the VIFDimentionReducer to the data, identifying columns with high VIF.

        Args:
            X (pd.DataFrame): Input data.
            y (optional): Target values (ignored).

        Returns:
            VIFDimentionReducer: The fitted transformer.
        """
        logger.start_operation(
            f"Fitting VIF to data with {X.shape[0]} rows and {X.shape[1]} columns."
        )
        try:
            # Reset so that repeated fit calls do not accumulate columns
            self.multicollinear_columns = []
            if X.shape[1] > 1:
                for col in X.columns:
                    vif = variance_inflation_factor(X.values, X.columns.get_loc(col))
                    if vif > 10:
                        self.multicollinear_columns.append(col)
            logger.debug(f"Columns with high VIF: {self.multicollinear_columns}")
        except Exception as e:
            logger.error(f"Error in VIFDimentionReducer fit: {e}")
            raise e
        finally:
            logger.end_operation()
        return self
    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Removes columns with high VIF from the data.

        Args:
            X (pd.DataFrame): Input data.
            y (optional): Target values (ignored).

        Returns:
            pd.DataFrame: Transformed data.
        """
        logger.start_operation("Transforming data.")
        try:
            X_copy = X.copy()
            X_copy.drop(columns=self.multicollinear_columns, inplace=True)
        except Exception as e:
            logger.error(f"Error in VIFDimentionReducer transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return X_copy
    def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Fits the VIFDimentionReducer to the data and then transforms it.

        Args:
            X (pd.DataFrame): Input data.
            y (optional): Target values (ignored).

        Returns:
            pd.DataFrame: Transformed data.
        """
        logger.start_operation("Fitting and transforming data using VIF.")
        try:
            self.fit(X)
            logger.debug(
                f"Removing columns with high VIF: {self.multicollinear_columns}"
            )
        except Exception as e:
            logger.error(f"Error in VIFDimentionReducer fit_transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return self.transform(X)
[docs] def to_tex(self) -> dict: return { "desc": "Removes columns with high variance inflation factor (VIF > 10).", }
class UMAPDimentionReducer(DimentionReducer):
    """
    Reduces the dimensionality of the data using UMAP.
    """

    def __init__(self):
        """
        Initializes the UMAPDimentionReducer.
        """
        super().__init__()
        self.reducer = None
        self.n_components = None
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "UMAPDimentionReducer":
        """
        Fits the UMAPDimentionReducer to the data.

        Args:
            X (pd.DataFrame): Input data.
            y (optional): Target values (ignored).

        Returns:
            UMAPDimentionReducer: The fitted transformer.
        """
        logger.start_operation(
            f"Fitting UMAPDimentionReducer to data with {X.shape[0]} rows "
            f"and {X.shape[1]} columns."
        )
        try:
            # Use the configured component count for wide data; otherwise
            # target half of the original dimensionality (at least 1).
            if X.shape[1] > 100:
                self.n_components = config.umap_components
            else:
                self.n_components = max(int(X.shape[1] / 2), 1)
            self.reducer = umap.UMAP(n_components=self.n_components)
            self.reducer.fit(X)
            logger.debug(f"Number of components selected: {self.n_components}")
        except Exception as e:
            logger.error(f"Error in UMAPDimentionReducer fit: {e}")
            raise e
        finally:
            logger.end_operation()
        return self
    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Transforms the input data using the fitted UMAP reducer.

        Args:
            X (pd.DataFrame): Input data.
            y (optional): Target values (ignored).

        Returns:
            pd.DataFrame: Transformed data.
        """
        logger.start_operation(
            f"Transforming data with {X.shape[0]} rows and {X.shape[1]} columns."
        )
        try:
            X = pd.DataFrame(self.reducer.transform(X))
        except Exception as e:
            logger.error(f"Error in UMAPDimentionReducer transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return X
    def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Fits the transformer to the data and then transforms it.

        Args:
            X (pd.DataFrame): Input data.
            y (optional): Target values (ignored).

        Returns:
            pd.DataFrame: Transformed data.
        """
        logger.start_operation(
            "Fitting and transforming data using UMAPDimentionReducer."
        )
        try:
            self.fit(X)
            X = self.transform(X)
            logger.debug(f"Reducing data to {self.n_components} components.")
        except Exception as e:
            logger.error(f"Error in UMAPDimentionReducer fit_transform: {e}")
            raise e
        finally:
            logger.end_operation()
        return X
[docs] def to_tex(self) -> dict: return { "desc": "Reduces the dimensionality of the data using UMAP.", "params": {"n_components": self.n_components}, }