# Source code for auto_prep.preprocessing.binning
import numpy as np
import pandas as pd
from ..utils.abstract import NonRequiredStep, Numerical
from ..utils.config import config
from ..utils.logging_config import setup_logger
logger = setup_logger(__name__)
# [docs]
class BinningTransformer(NonRequiredStep, Numerical):
    """
    Transformer that replaces high-cardinality numeric columns with numeric bin labels.

    A numeric column is binned only if the ratio of its unique values to the
    number of rows exceeds ``threshold`` (50%). Bin edges are learned in
    :meth:`fit` - equal-width edges for ``"cut"``, quantile edges for
    ``"qcut"`` - and the *same* edges are reused in :meth:`transform`, so data
    seen later is binned consistently with the data the transformer was
    fitted on.

    Attributes:
        n_bins (int): Number of bins to create, read from ``config.n_bins``.
        threshold (float): Ratio of unique values a column must exceed in order
            to qualify for binning. Default: 0.5.
        should_bin (dict): Maps column name -> bool, tracking which columns
            should be binned.
        bin_edges (dict): Maps column name -> array of bin edges for each
            binned column.
        binning_method (str): ``"qcut"`` (quantile bins) or ``"cut"``
            (equal-width bins).
    """

    PARAMS_GRID = {
        "binning_method": ["qcut", "cut"],
    }

    def __init__(self, binning_method: str = "qcut"):
        """
        Initializes the transformer with the binning method to use.

        The number of bins is not an argument; it is taken from ``config.n_bins``.

        Args:
            binning_method (str): The binning method to use ('cut' for
                equal-width, 'qcut' for quantile binning) (default 'qcut').
        """
        self.n_bins = config.n_bins
        self.threshold = 0.5
        self.should_bin = {}  # A dictionary to track which columns should be binned
        self.bin_edges = {}  # Dictionary to store the bin edges for each column
        self.binning_method = binning_method

    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "BinningTransformer":
        """
        Fits the transformer by calculating the bin edges for each numeric column
        whose ratio of unique values exceeds the threshold of 50%.

        Args:
            X (pd.DataFrame): The input feature data.
            y (pd.Series): Unused; accepted for API compatibility.

        Returns:
            BinningTransformer: The fitted transformer instance.

        Raises:
            ValueError: If computing the bin edges fails for any reason.
        """
        logger.start_operation(
            f"Fitting BinningTransformer with {self.n_bins} binning method : {self.binning_method}"
        )
        try:
            # Start from a clean state so that refitting never keeps columns or
            # edges learned from a previously seen dataset.
            self.should_bin = {}
            self.bin_edges = {}
            n_rows = len(X)
            for column in X.select_dtypes(include=[np.number]).columns:
                values = X[column]
                # An empty frame has nothing to bin; guard the division.
                unique_values_ratio = len(values.unique()) / n_rows if n_rows else 0.0
                # No edges can be derived from a column without any non-missing value.
                if unique_values_ratio <= self.threshold or not values.notna().any():
                    self.should_bin[column] = False
                    continue
                self.should_bin[column] = True
                if self.binning_method == "cut":
                    logger.debug(
                        f"BinningTransformer: calculating bin edges for {column} using cut"
                    )
                    self.bin_edges[column] = np.linspace(
                        values.min(), values.max(), self.n_bins + 1
                    )
                elif self.binning_method == "qcut":
                    logger.debug(
                        f"BinningTransformer: calculating bin edges for {column} using qcut"
                    )
                    # Series.quantile skips missing values (np.percentile would
                    # turn every edge into NaN) and yields the same edges
                    # pd.qcut computes internally.
                    self.bin_edges[column] = values.quantile(
                        np.linspace(0, 1, self.n_bins + 1)
                    ).to_numpy()
            logger.debug("Successfully fitted BinningTransformer.")
        except Exception as e:
            logger.error(
                f"Failed to fit BinningTransformer with method {self.binning_method}: {e}",
                exc_info=True,
            )
            raise ValueError(
                f"Failed to fit BinningTransformer with method {self.binning_method}: {e}"
            ) from e
        finally:
            logger.end_operation()
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transforms the data by replacing continuous values with their respective
        bin labels (numeric), using the bin edges learned in :meth:`fit`.

        Values outside the fitted range are clipped to the outermost edges so
        that they fall into the first or last bin instead of becoming missing.
        Missing values stay missing.

        Args:
            X (pd.DataFrame): The feature data.

        Returns:
            pd.DataFrame: A copy of the data with binned columns replaced by
            their bin labels.

        Raises:
            ValueError: If binning fails for any reason.
        """
        logger.start_operation(
            f"Transforming BinningTransformer with {self.n_bins} and binning_method {self.binning_method}"
        )
        try:
            X_transformed = X.copy()
            for column in X_transformed.columns:
                if not self.should_bin.get(column, False):
                    continue
                if self.binning_method == "cut":
                    logger.debug(
                        f"BinningTransformer: transforming column: {column} using cut"
                    )
                    duplicates = "raise"
                elif self.binning_method == "qcut":
                    logger.debug(
                        f"BinningTransformer: transforming column : {column} using qcut"
                    )
                    # Quantile edges may repeat on skewed data; merge them the
                    # same way pd.qcut(duplicates="drop") does.
                    duplicates = "drop"
                else:
                    # Unknown method: no edges were fitted, leave the column as is.
                    continue
                edges = self.bin_edges[column]
                # Reuse the fitted edges (never re-derive them from X) so the
                # same value always maps to the same label.
                X_transformed[column] = pd.cut(
                    X_transformed[column].clip(lower=edges[0], upper=edges[-1]),
                    bins=edges,
                    labels=False,
                    include_lowest=True,
                    duplicates=duplicates,
                )
            logger.debug("Successfully transformed data with BinningTransformer")
        except Exception as e:
            logger.error(
                f"Failed to transform BinningTransformer with method {self.binning_method}: {e}",
                exc_info=True,
            )
            raise ValueError(
                f"Failed to transform BinningTransformer with method {self.binning_method}: {e}"
            ) from e
        finally:
            logger.end_operation()
        return X_transformed

    def fit_transform(self, X: pd.DataFrame, y: pd.Series = None) -> pd.DataFrame:
        """
        Fits and transforms the data in one step.

        Args:
            X (pd.DataFrame): The feature data.
            y (pd.Series): Unused; accepted for API compatibility.

        Returns:
            pd.DataFrame: The transformed data with bin labels.

        Raises:
            ValueError: If fitting or transforming fails.
        """
        logger.start_operation(
            f"Fitting ans transforming data with BinningTransformer n_bins: {self.n_bins} and binning_method: {self.binning_method}"
        )
        try:
            transformed_X = self.fit(X, y).transform(X)
            logger.debug(
                f"Successfully fit_transformed data data with BinningTransformer n_bins: {self.n_bins} and binning_method: {self.binning_method}"
            )
        except Exception as e:
            logger.error(
                f"Failed to fit_transform BinningTransformer with method {self.binning_method}: {e}",
                exc_info=True,
            )
            raise ValueError(
                f"Failed to fit_transform BinningTransformer with method {self.binning_method}: {e}"
            ) from e
        finally:
            logger.end_operation()
        return transformed_X

    def to_tex(self) -> dict:
        """
        Returns a description of the transformer in dictionary format.

        Returns:
            dict: Description of the transformer, with a ``"desc"`` text and the
            ``"params"`` (``n_bins`` and ``binning_method``) it was configured with.
        """
        return {
            "desc": "Performs binning on continuous variables and replaces them with numeric labels. ",
            "params": {"n_bins": self.n_bins, "binning_method": self.binning_method},
        }