# Source code for auto_prep.preprocessing.scaling
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from ..utils.abstract import Numerical, RequiredStep
from ..utils.logging_config import setup_logger
logger = setup_logger(__name__)
# [docs]
class ColumnScaler(RequiredStep, Numerical):
    """
    Scaler for all numerical features.

    Applies the scaling technique chosen by the user to every numerical
    column of a DataFrame. Available scaling methods: MinMaxScaler,
    StandardScaler and RobustScaler from sklearn.

    Attributes:
        method (str): Name of the selected scaling method.
        scaler (object): The underlying sklearn scaler; fitted by ``fit``.
        numerical_cols_ (list): Labels of the columns the scaler was fitted
            on. Only present after a successful ``fit``.
    """

    # Values accepted for the ``method`` constructor argument.
    # NOTE(review): presumably consumed by a parameter search elsewhere in
    # the package - confirm against the caller.
    PARAMS_GRID = {
        "method": ["standard", "minmax", "robust"],
    }

    def __init__(self, method: str = "standard"):
        """
        Initializes the scaler with the specified scaling type. Default : StandardScaler

        Args:
            method (str): The type of scaler to use ('minmax', 'standard', or 'robust').

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        self.method = method
        if self.method == "minmax":
            self.scaler = MinMaxScaler()
        elif self.method == "standard":
            self.scaler = StandardScaler()
        elif self.method == "robust":
            self.scaler = RobustScaler()
        else:
            raise ValueError(
                "Invalid scaler_type. Choose from : 'minmax', 'standard', 'robust'."
            )

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "ColumnScaler":
        """
        Fits the chosen scaler to the numerical features in the data.

        The labels of the numerical columns are stored in ``numerical_cols_``
        so that ``transform`` scales exactly the columns seen here.

        Args:
            X (pd.DataFrame): The feature data to fit the scaler to.
            y (pd.Series, optional): Unused by this method.

        Returns:
            ColumnScaler: The fitted scaler instance.

        Raises:
            ValueError: If ``X`` has no numerical columns or the underlying
                scaler fails to fit.
        """
        logger.start_operation(
            f"Fitting ColumnScaler with type '{self.method}' to data with {X.shape[0]} rows and {X.shape[1]} columns."
        )
        try:
            numerical_cols = X.select_dtypes(include=["number"]).columns
            if numerical_cols.empty:
                raise ValueError("Scaler: No numerical columns found in the dataset.")
            self.scaler.fit(X[numerical_cols])
            # Recorded only after a successful fit so the column list can
            # never describe a scaler state that was not actually reached.
            self.numerical_cols_ = list(numerical_cols)
            logger.debug(f"Successfully fitted ColumnScaler with method {self.method}")
        except Exception as e:
            logger.error(f"Failed to fit ColumnScaler: {e}", exc_info=True)
            raise ValueError(
                f"An error occurred while fitting ColumnScaler: {e}"
            ) from e
        finally:
            logger.end_operation()
        return self

    def transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Transforms numeric feature data using the fitted scaler.

        The columns recorded by ``fit`` are scaled, in the order they were
        fitted; all other columns are returned untouched. If no column list
        was recorded, the numerical columns of ``X`` are selected by dtype.

        Args:
            X (pd.DataFrame): The feature data to transform.
            y (pd.Series, optional): Unused by this method.

        Returns:
            pd.DataFrame: A copy of ``X`` with the numerical columns scaled.

        Raises:
            ValueError: If columns seen during ``fit`` are missing from ``X``
                or the underlying scaler fails to transform.
        """
        logger.start_operation(
            f"Scaler: Transforming data with {X.shape[0]} rows and {X.shape[1]} columns."
        )
        try:
            X_transformed = X.copy()
            fitted_cols = getattr(self, "numerical_cols_", None)
            if fitted_cols is None:
                # No column list recorded (e.g. the scaler was fitted outside
                # of fit()): fall back to dtype-based selection.
                numerical_cols = list(
                    X_transformed.select_dtypes(include=["number"]).columns
                )
            else:
                missing = [
                    col for col in fitted_cols if col not in X_transformed.columns
                ]
                if missing:
                    raise ValueError(
                        f"Scaler: Columns seen during fit are missing from the data: {missing}"
                    )
                numerical_cols = fitted_cols
            X_transformed[numerical_cols] = self.scaler.transform(
                X_transformed[numerical_cols]
            )
            logger.debug(
                f"Successfully transformed ColumnScaler with method {self.method}"
            )
        except Exception as e:
            logger.error(f"Failed to transform ColumnScaler {e}", exc_info=True)
            raise ValueError(
                f"An error occurred while transforming ColumnScaler: {e}"
            ) from e
        finally:
            logger.end_operation()
        return X_transformed

    def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Fits and transforms the feature data using the chosen scaler.

        Args:
            X (pd.DataFrame): The feature data to fit on and transform.
            y (pd.Series, optional): Forwarded to ``transform``, which does
                not use it; it is not added to the result.

        Returns:
            pd.DataFrame: The transformed feature data.

        Raises:
            ValueError: If fitting or transforming fails.
        """
        logger.start_operation(
            f"Fitting and transforming data using '{self.method}' scaler."
        )
        try:
            result = self.fit(X).transform(X, y)
            logger.debug(
                f"Successfully fit_transformed data with ColumnScaler method : {self.method}"
            )
        except Exception as e:
            logger.error(f"Failed to fit_transform ColumnScaler {e}", exc_info=True)
            raise ValueError(
                f"An error occurred while fit_transform ColumnScaler: {e}"
            ) from e
        finally:
            logger.end_operation()
        return result

    def to_tex(self) -> dict:
        """
        Returns a short description of the scaler that was used.

        Returns:
            dict: A dictionary with a ``desc`` text and the ``params``
                (currently only ``method``) the scaler was configured with.
        """
        return {
            "desc": "Scales numerical columns using one of 3 scaling methods.",
            "params": {"method": self.method},
        }