Source code for auto_prep.preprocessing.dimention_reducing
import numpy as np
import pandas as pd
import umap.umap_ as umap
from sklearn.decomposition import PCA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from ..utils.config import config
from ..utils.logging_config import setup_logger
from .abstract import DimentionReducer
logger = setup_logger(__name__)
class PCADimentionReducer(DimentionReducer):
"""
Applies PCA with automatic selection of the number of components
to preserve 95% of the variance.
"""
def __init__(self):
"""
Initializes the PCA object with additional parameters.
"""
super().__init__()
self.reducer = None # PCA will be initialized in fit
self.n_components = None # Will be determined in fit
def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "PCADimentionReducer":
"""
Fits PCA to the data, determining the number of components to preserve
95% of the variance.
Args:
X (pd.DataFrame or np.ndarray): Input data.
y (optional): Target values (ignored).
Returns:
PCADimentionReducer: The fitted transformer.
"""
logger.start_operation(
f"Fitting PCADimentionReducer to data with {X.shape[0]} rows and {X.shape[1]} columns."
)
try:
# Fit PCA to determine the number of components
temp_pca = PCA()
temp_pca.fit(X)
cumulative_variance = np.cumsum(temp_pca.explained_variance_ratio_)
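# np.argmax on the boolean mask returns the first index at which cumulative variance reaches 0.95; +1 converts that index into a component count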
self.n_components = np.argmax(cumulative_variance >= 0.95) + 1
# Initialize PCA with the determined number of components
self.reducer = PCA(n_components=self.n_components)
self.reducer.fit(X)
logger.debug(f"Number of components selected: {self.n_components}")
except Exception as e:
logger.error(f"Error in PCADimentionReducer fit: {e}")
raise e
finally:
logger.end_operation()
return self
def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
"""
Transforms the input data using fitted PCA.
Args:
X (pd.DataFrame or np.ndarray): Input data.
y (optional): Target values (ignored).
Returns:
pd.DataFrame: Transformed data.
"""
logger.start_operation(
f"Transforming data with {X.shape[0]} rows and {X.shape[1]} columns."
)
try:
X = pd.DataFrame(self.reducer.transform(X))
except Exception as e:
logger.error(f"Error in PCADimentionReducer transform: {e}")
raise e
finally:
logger.end_operation()
return X
def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
"""
Fits the transformer to the data and then transforms it.
Args:
X (pd.DataFrame or np.ndarray): Input data.
y (optional): Target values (ignored).
Returns:
pd.DataFrame: Transformed data.
"""
logger.start_operation(
"Fitting and transforming data using PCADimentionReducer."
)
try:
self.fit(X, y)
X = self.transform(X)
except Exception as e:
logger.error(f"Error in PCADimentionReducer fit_transform: {e}")
raise e
finally:
logger.end_operation()
return X
def to_tex(self) -> dict:
return {
"desc": "Combines PCA with automatic selection of the number of components to preserve 95% of the variance.",
"params": {"n_components": self.n_components},
}
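# Illustrative usage (a minimal sketch, not part of the module; assumes a purely
# numeric DataFrame with no missing values, and `latent` / `X` below are
# hypothetical example data):
#
#     import numpy as np
#     import pandas as pd
#     rng = np.random.default_rng(0)
#     latent = rng.normal(size=(200, 5))                                  # 5 underlying factors
#     X = pd.DataFrame(latent @ rng.normal(size=(5, 20))
#                      + 0.01 * rng.normal(size=(200, 20)))               # 20 noisy observed columns
#     reducer = PCADimentionReducer().fit(X)
#     X_reduced = reducer.transform(X)  # roughly 5 components cover >= 95% of the variance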
class VIFDimentionReducer(DimentionReducer):
"""
Removes columns with high variance inflation factor (VIF > 10).
"""
def __init__(self):
"""
Initializes the VIFDimentionReducer.
"""
self.multicollinear_columns = []
def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "VIFDimentionReducer":
"""
Fits the VIFDimentionReducer to the data, identifying columns with high VIF.
Args:
X (pd.DataFrame): Input data.
y (optional): Target values (ignored).
Returns:
VIFDimentionReducer: The fitted transformer.
"""
logger.start_operation(
f"Fitting VIF to data with {X.shape[0]} rows and {X.shape[1]} columns."
)
try:
self.multicollinear_columns = []  # reset so repeated fit calls do not accumulate stale columns
for col in X.columns:
if X.shape[1] > 1:
# VIF above 10 is the usual rule-of-thumb threshold for problematic multicollinearity
vif = variance_inflation_factor(X.values, X.columns.get_loc(col))
if vif > 10:
self.multicollinear_columns.append(col)
logger.debug(f"Columns with high VIF: {self.multicollinear_columns}")
except Exception as e:
logger.error(f"Error in VIFDimentionReducer fit: {e}")
raise e
finally:
logger.end_operation()
return self
def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
"""
Removes columns with high VIF from the data.
Args:
X (pd.DataFrame): Input data.
y (optional): Target values (ignored).
Returns:
pd.DataFrame: Transformed data.
"""
logger.start_operation("Transforming data.")
try:
X_copy = X.copy()
X_copy.drop(columns=self.multicollinear_columns, inplace=True)
except Exception as e:
logger.error(f"Error in VIFDimentionReducer transform: {e}")
raise e
finally:
logger.end_operation()
return X_copy
def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
"""
Fits the VIFDimentionReducer to the data and then transforms it.
Args:
X (pd.DataFrame): Input data.
y (optional): Target values (ignored).
Returns:
pd.DataFrame: Transformed data.
"""
logger.start_operation("Fitting and transforming data using VIF.")
try:
self.fit(X)
logger.debug(
f"Removing columns with high VIF: {self.multicollinear_columns}"
)
except Exception as e:
logger.error(f"Error in VIFDimentionReducer fit_transform: {e}")
raise e
finally:
logger.end_operation()
return self.transform(X)
def to_tex(self) -> dict:
return {
"desc": "Removes columns with high variance inflation factor (VIF > 10).",
}
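# Illustrative usage (a minimal sketch, not part of the module; the DataFrame
# below is hypothetical and must contain only numeric columns, since
# variance_inflation_factor works on a numeric design matrix):
#
#     import numpy as np
#     import pandas as pd
#     rng = np.random.default_rng(0)
#     a = rng.normal(size=500)
#     X = pd.DataFrame({"a": a,
#                       "b": 2 * a + rng.normal(scale=0.01, size=500),     # nearly collinear with "a"
#                       "c": rng.normal(size=500)})
#     X_reduced = VIFDimentionReducer().fit_transform(X)  # drops the collinear pair "a" and "b", keeps "c"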
class UMAPDimentionReducer(DimentionReducer):
"""
Reduces the dimensionality of the data using UMAP.
"""
def __init__(self):
"""
Initializes the UMAPDimentionReducer.
"""
self.reducer = None
self.n_components = None
def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "UMAPDimentionReducer":
"""
Fits the UMAPDimentionReducer to the data.
"""
logger.start_operation(
f"Fitting UMAPDimentionReducer to data with {X.shape[0]} rows and {X.shape[1]} columns."
)
try:
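# Heuristic: very wide inputs (> 100 columns) use the configured component count; otherwise keep roughly half of the original columns (at least 1)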
if X.shape[1] > 100:
self.n_components = config.umap_components
else:
self.n_components = max(int(X.shape[1] / 2), 1)
self.reducer = umap.UMAP(n_components=self.n_components)
self.reducer.fit(X)
logger.debug(f"Number of components selected: {self.n_components}")
except Exception as e:
logger.error(f"Error in DimentionReducerUMAP fit: {e}")
raise e
finally:
logger.end_operation()
return self
def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
"""
Transforms the input data using the fitted UMAP reducer.
"""
logger.start_operation(
f"Transforming data with {X.shape[0]} rows and {X.shape[1]} columns."
)
try:
X = pd.DataFrame(self.reducer.transform(X))
except Exception as e:
logger.error(f"Error in DimentionReducerUMAP transform: {e}")
raise e
finally:
logger.end_operation()
return X
def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
"""
Fits the transformer to the data and then transforms it.
"""
logger.start_operation(
"Fitting and transforming data using DimentionReducerUMAP."
)
try:
self.fit(X)
X = self.transform(X)
logger.debug(f"Reducing data to {self.n_components} components.")
except Exception as e:
logger.error(f"Error in DimentionReducerUMAP fit_transform: {e}")
raise e
finally:
logger.end_operation()
return X
def to_tex(self) -> dict:
return {
"desc": "Reduces the dimensionality of the data using UMAP.",
"params": {"n_components": self.n_components},
}
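# Illustrative usage (a minimal sketch, not part of the module; requires the
# umap-learn package, numeric input, and, for inputs wider than 100 columns,
# a `config.umap_components` value; `X` below is hypothetical example data):
#
#     import numpy as np
#     import pandas as pd
#     X = pd.DataFrame(np.random.default_rng(0).normal(size=(300, 30)))
#     reducer = UMAPDimentionReducer()
#     X_embedded = reducer.fit_transform(X)  # 30 columns reduced to 15 UMAP components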