# Source code for auto_prep.utils.config

import logging
import os
import shutil
import warnings
from typing import Union

import numpy as np
from pylatex import NoEscape
from sklearn import set_config
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score

# Import-time, process-wide setup. Everything below mutates global state.

# Ask the OpenMP runtime not to print its environment summary.
os.environ["OMP_DISPLAY_ENV"] = "FALSE"

# Silence UserWarning and RuntimeWarning for the whole process. The order
# matches the original two calls (UserWarning first, RuntimeWarning second).
for _ignored_category in (UserWarning, RuntimeWarning):
    warnings.simplefilter("ignore", category=_ignored_category)
del _ignored_category  # do not leave the loop variable in the module namespace


# Configure scikit-learn's transform output as pandas.
set_config(transform_output="pandas")

# ANSI escape sequences keyed by log level name; "RESET" switches the
# terminal back to its default colours. Used as the default for
# GlobalConfig.set(logger_colors_map=...).
COLORS: dict = {
    "DEBUG": "\033[36m",  # Cyan
    "INFO": "\033[32m",  # Green
    "WARNING": "\033[33m",  # Yellow
    "ERROR": "\033[31m",  # Red
    "CRITICAL": "\033[41m",  # Red background
    "RESET": "\033[0m",  # Reset color
}

# Default logging format strings (defaults of GlobalConfig.set).
LOG_FORMAT: str = "%(asctime)s %(levelname)s %(name)s: %(message)s"
LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
# logging.CRITICAL is an integer level constant, hence the int annotation.
LOG_LEVEL: int = logging.CRITICAL

# Default page geometry options for the generated report; used as the default
# for GlobalConfig.set(tex_geomatry=...).
DEFAULT_TEX_GEOMETRY: dict = {
    "margin": "0.5in",
    "headheight": "10pt",
    "footskip": "0.2in",
    "tmargin": "0.5in",
    "bmargin": "0.5in",
}

# Default abstract section of the report; used as the default for
# GlobalConfig.set(raport_abstract=...). Wrapped in pylatex's NoEscape, which
# marks a string as raw LaTeX that should not be escaped.
DEFAULT_ABSTRACT: str = NoEscape(
    r"""
    \begin{abstract}
    This raport has been generated with AutoPrep.
    \end{abstract}
    """
)

# Chart appearance settings; default for GlobalConfig.set(chart_settings=...).
DEFAULT_CHARTS_SETTINGS: dict = {
    "theme": "white",
    "title_fontsize": 18,
    "title_fontweight": "bold",
    "xlabel_fontsize": 15,
    "ylabel_fontsize": 15,
    "tick_label_rotation": 45,
    "palette": "pastel",
    "plot_width": 20,
    "plot_height_per_row": 8,
    "heatmap_cmap": "coolwarm",
    "heatmap_fmt": ".2f",
}

# Default for GlobalConfig.set(correlation_selectors_settings=...).
DEFAULT_CORRELATION_SELECTOR_SETTINGS: dict = {
    "threshold": 0.8,
    "k": 10,
}

# Default for GlobalConfig.set(outlier_detector_settings=...).
DEFAULT_OUTLIER_DETECTOR_SETTINGS: dict = {
    "zscore_threshold": 3,
    "isol_forest_n_estimators": 100,
    "cook_threshold": 1,
}

# Default for GlobalConfig.set(imputer_settings=...).
DEFAULT_IMPUTTER_SETTINGS: dict = {
    "categorical_strategy": "most_frequent",
    "numerical_strategy": "mean",
    "n_iter": 10,
}

# Default for GlobalConfig.set(tuning_params=...); GlobalConfig.set documents
# these as tuning params for RandomizedSearchCV.
DEFAULT_TUNING_PARAMS: dict = {
    "cv": 3,
    "verbose": 0,
    "n_jobs": -1,
    "random_state": 42,
    "n_iter": 10,
}


class GlobalConfig:
    """Global config class.

    Implemented as a singleton: every construction returns the same instance,
    and the first construction populates it with the defaults of :meth:`set`.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, creating and initialising it on first use.

        Arguments are accepted but ignored. They are deliberately not
        forwarded to ``object.__new__``, which raises ``TypeError`` when it
        receives extra arguments from a class that overrides ``__new__``.
        """
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance.set()
        return cls._instance

    def set(
        self,
        raport_name: str = "raport",
        raport_title: str = "ML Raport",
        raport_author: str = "AutoPrep",
        raport_abstract: str = DEFAULT_ABSTRACT,
        root_dir: str = "raport",
        return_tex_: bool = True,
        logger_colors_map: dict = COLORS,
        log_format: str = LOG_FORMAT,
        log_date_format: str = LOG_DATE_FORMAT,
        log_level: Union[int, str] = LOG_LEVEL,
        log_dir: str = None,
        max_log_file_size_in_mb: int = 5,
        tex_geomatry: dict = DEFAULT_TEX_GEOMETRY,
        train_size: float = 0.8,
        test_size: float = 0.1,
        valid_size: float = 0.1,
        random_state: int = 42,
        max_datasets_after_preprocessing: int = 3,
        perform_only_required_: bool = False,
        raport_decimal_precision: int = 4,
        chart_settings: dict = DEFAULT_CHARTS_SETTINGS,
        correlation_selectors_settings: dict = DEFAULT_CORRELATION_SELECTOR_SETTINGS,
        outlier_detector_settings: dict = DEFAULT_OUTLIER_DETECTOR_SETTINGS,
        imputer_settings: dict = DEFAULT_IMPUTTER_SETTINGS,
        umap_components: int = 50,
        correlation_threshold: float = 0.8,
        correlation_percent: float = 0.7,
        n_bins: int = 4,
        outlier_detector_method: str = "zscore",
        max_unique_values_classification: int = 20,
        regression_pipeline_scoring_model: BaseEstimator = RandomForestRegressor(
            n_estimators=100, random_state=42, max_depth=5, n_jobs=-1, warm_start=True
        ),
        classification_pipeline_scoring_model: BaseEstimator = RandomForestClassifier(
            n_estimators=100, random_state=42, max_depth=5, n_jobs=-1, warm_start=True
        ),
        regression_pipeline_scoring_func: Union[callable, str] = (
            mean_squared_error,
            "min",
        ),
        classification_pipeline_scoring_func_bin: Union[callable, str] = (
            roc_auc_score,
            "max",
        ),
        classification_pipeline_scoring_func_multi: Union[callable, str] = (
            accuracy_score,
            "max",
        ),
        max_workers: int = None,
        tuning_params: dict = DEFAULT_TUNING_PARAMS,
        max_models: int = 3,
    ):
        """Validate the given settings and store them on the config.

        Besides assigning attributes, this creates ``log_dir`` on disk (unless
        it is ``-1``) and seeds NumPy's global random generator with
        ``random_state``.

        Validation uses ``assert`` so that callers catching ``AssertionError``
        keep working; note that assertions are skipped under ``python -O``.
        Attributes are assigned in order, so a failed check leaves the values
        assigned before it in place.

        Args:
            raport_name (str): Raport name. Must be a non-empty string.
                Defaults to "raport".
            raport_title (str): Raport title. Defaults to "ML Raport".
            raport_author (str): Raport author. Defaults to "AutoPrep".
            raport_abstract (str): Raport abstract section. Can be set to "".
                Defaults to :obj:`DEFAULT_ABSTRACT`.
            root_dir (str): Root directory. Here raport will be stored and
                all cache. Defaults to "raport".
            return_tex_ (bool): If true it will create .tex file alongside
                the pdf. Defaults to True.
            logger_colors_map (dict): Color map for the loggers.
                Defaults to :obj:`COLORS`.
            log_format (str): Log format for logging library.
                Defaults to :obj:`LOG_FORMAT`.
            log_date_format (str): Log date format for logging library.
                Defaults to :obj:`LOG_DATE_FORMAT`.
            log_level (int | str): Log level for logging library.
                Defaults to :obj:`LOG_LEVEL`.
            log_dir (str): Log directory for storing the logs. If None,
                defaults to "logs" in the directory from which the program
                was called. -1 means no logging to file (no directory is
                created).
            max_log_file_size_in_mb (int): Maximum file size in mb for each
                logger. Must be an integer >= 1. Defaults to 5.
            tex_geomatry (dict): Geometry for pylatex.
                Defaults to :obj:`DEFAULT_TEX_GEOMETRY`.
            train_size (float): Fraction of data used for the training set.
                Defaults to 0.8.
            test_size (float): Fraction of data used for the test set.
                Defaults to 0.1.
            valid_size (float): Fraction of data used for the validation set.
                Defaults to 0.1.
            random_state (int): Random state for sklearn; also used to seed
                ``np.random``. Defaults to 42.
            max_datasets_after_preprocessing (int): Maximum number of datasets
                that will be left after preprocessing steps. On them further
                models will be trained. Strongly affects performance.
                Must be > 0. Defaults to 3.
            perform_only_required_ (bool): Whether or not to perform only
                required steps. Affects entire process. Defaults to False.
            raport_decimal_precision (int): Decimal precision for all floats
                in raport. Will use standard python rounding. Defaults to 4.
            chart_settings (dict): Settings for customizing chart appearance.
                Defaults to :obj:`DEFAULT_CHARTS_SETTINGS`.
            correlation_selectors_settings (dict): Settings for correlation
                selectors. Defaults to
                :obj:`DEFAULT_CORRELATION_SELECTOR_SETTINGS`.
            outlier_detector_settings (dict): Settings for outlier detectors.
                Defaults to :obj:`DEFAULT_OUTLIER_DETECTOR_SETTINGS`.
            imputer_settings (dict): Settings for imputers.
                Defaults to :obj:`DEFAULT_IMPUTTER_SETTINGS`.
            umap_components (int): Number of components for UMAP.
                Defaults to 50.
            correlation_threshold (float): Threshold used for detecting highly
                correlated features. Must be in [0, 1]. Defaults to 0.8.
            correlation_percent (float): % of selected features based on their
                correlation with the target. Must be in [0, 1].
                Defaults to 0.7.
            n_bins (int): Number of bins to create while binning numerical
                features. Must be an integer >= 1. Defaults to 4.
            outlier_detector_method (str): Method used for outlier detection.
                One of "zscore", "iqr", "isolation_forest".
                Defaults to "zscore".
            max_unique_values_classification (int): In case of target column
                being of non numerical dtype, it will calculate number of
                unique values (in task "auto"). If this number will be lower
                than that value, it'll perform classification. Must be >= 0.
                Defaults to 20.
            regression_pipeline_scoring_model (BaseEstimator): Model used for
                scoring processing pipelines in regression task.
            classification_pipeline_scoring_model (BaseEstimator): Model used
                for scoring processing pipelines in classification task.
            regression_pipeline_scoring_func: Pair (metric, direction) for
                scoring :obj:`regression_pipeline_scoring_model` output.
                Available directions are ['max', 'min'].
            classification_pipeline_scoring_func_bin: Pair (metric, direction)
                for scoring :obj:`classification_pipeline_scoring_model`
                output. Stored as ``classification_pipeline_scoring_func``.
                Available directions are ['max', 'min'].
            classification_pipeline_scoring_func_multi: Pair
                (metric, direction) for scoring
                :obj:`classification_pipeline_scoring_model` output.
                Available directions are ['max', 'min'].
            max_workers (int): Maximum number of cores to evaluate on.
            tuning_params (dict): Tuning params for RandomizedSearchCV.
                Defaults to :obj:`DEFAULT_TUNING_PARAMS`.
            max_models (int): Maximum number of final models to save and
                raport. Must be > 0. Defaults to 3.

        Raises:
            AssertionError: If any of the validated arguments is out of range.
        """
        assert (
            isinstance(raport_name, str) and raport_name != ""
        ), "raport_name should not be empty"
        self.raport_name = raport_name
        self.raport_title = raport_title
        self.raport_author = raport_author
        self.raport_abstract = raport_abstract

        # Output locations derived from root_dir and raport_name.
        self.root_dir = root_dir
        self.raport_path = os.path.abspath(os.path.join(root_dir, raport_name))
        self.charts_dir = os.path.join(self.raport_path, "charts")
        self.pipelines_dir = os.path.join(self.raport_path, "pipelines")

        self.return_tex_ = return_tex_

        self.logger_colors_map = logger_colors_map
        self.log_format = log_format
        self.log_date_format = log_date_format
        self.log_level = log_level

        # The message is parenthesised so both string parts belong to the
        # assert; without parentheses the second literal is a no-op statement.
        assert (
            int(max_log_file_size_in_mb) == max_log_file_size_in_mb
            and max_log_file_size_in_mb >= 1
        ), (
            f"Wrong value for max_log_file_size_in_mb: {max_log_file_size_in_mb}. "
            "Should be int >= 1."
        )
        self.max_log_file_size_in_mb = max_log_file_size_in_mb

        if log_dir is None:
            log_dir = os.path.abspath("logs")
        # -1 is the "no file logging" sentinel: do not create a directory.
        if log_dir != -1:
            os.makedirs(log_dir, exist_ok=True)
        self.log_dir = log_dir

        self.tex_geomatry = tex_geomatry

        self.train_size = train_size
        self.test_size = test_size
        self.valid_size = valid_size

        self.random_state = random_state
        # Seeds NumPy's global generator as a side effect.
        np.random.seed(random_state)

        self.chart_settings = chart_settings

        assert (
            max_datasets_after_preprocessing > 0
        ), "Values smaller than 1 are forbidden."
        self.max_datasets_after_preprocessing = max_datasets_after_preprocessing
        self.perform_only_required_ = perform_only_required_
        self.raport_decimal_precision = raport_decimal_precision

        # Three levels up from this file.
        self.root_project_dir = os.path.abspath(
            os.path.join(__file__, "..", "..", "..")
        )

        assert 0 <= correlation_threshold <= 1, (
            f"Invalid value for correlation_threshold: {correlation_threshold}. "
            "It must be a float between 0 and 1."
        )
        self.correlation_threshold = correlation_threshold

        assert 0 <= correlation_percent <= 1, (
            f"Invalid value for correlation_selector_percent: {correlation_percent}. "
            "It must be a float between 0 and 1."
        )
        self.correlation_percent = correlation_percent

        assert int(n_bins) == n_bins and n_bins >= 1, (
            f"Wrong value for n_bins: {n_bins}. " "Should be int >= 1."
        )
        self.n_bins = n_bins

        self.correlation_selectors_settings = correlation_selectors_settings
        self.outlier_detector_settings = outlier_detector_settings
        self.imputer_settings = imputer_settings
        self.umap_components = umap_components

        assert (
            max_unique_values_classification >= 0
        ), "max_unique_values_classification should be positive integer."
        self.max_unique_values_classification = max_unique_values_classification

        self.regression_pipeline_scoring_model = regression_pipeline_scoring_model
        self.classification_pipeline_scoring_model = (
            classification_pipeline_scoring_model
        )

        # Each scoring func is a (metric, direction) tuple. The parameter name
        # is reported because the tuple itself has no __name__ attribute.
        scoring_funcs = {
            "regression_pipeline_scoring_func": regression_pipeline_scoring_func,
            "classification_pipeline_scoring_func_bin": (
                classification_pipeline_scoring_func_bin
            ),
            "classification_pipeline_scoring_func_multi": (
                classification_pipeline_scoring_func_multi
            ),
        }
        for param_name, func_ in scoring_funcs.items():
            assert func_[1] in (
                "min",
                "max",
            ), f"Unknown direction chosen for {param_name}: {func_[1]!r}"

        self.regression_pipeline_scoring_func = regression_pipeline_scoring_func
        self.classification_pipeline_scoring_func = (
            classification_pipeline_scoring_func_bin
        )
        self.classification_pipeline_scoring_func_multi = (
            classification_pipeline_scoring_func_multi
        )

        # The message lists exactly the values the check accepts.
        allowed_outlier_methods = ("zscore", "iqr", "isolation_forest")
        assert outlier_detector_method in allowed_outlier_methods, (
            f"Invalid value for outlier_detector_method: {outlier_detector_method}. "
            f"Should be one of {list(allowed_outlier_methods)}."
        )
        self.outlier_detector_method = outlier_detector_method

        self.max_workers = max_workers
        self.tuning_params = tuning_params

        assert max_models > 0, "Invalid value"
        self.max_models = max_models

    def update(self, **kwargs):
        """Updates config's data with kwargs.

        Each keyword is assigned as an attribute as-is. None of the checks in
        :meth:`set` are applied, and derived attributes (``raport_path``,
        ``charts_dir``, ``pipelines_dir``) are not recomputed when
        ``root_dir`` or ``raport_name`` are changed here.
        """
        for key, value in kwargs.items():
            setattr(self, key, value)

    def prepare_dir(self):
        """Clears and creates all necessary directories.

        If ``root_dir`` exists it is deleted together with its contents;
        then ``root_dir``, ``charts_dir`` and ``pipelines_dir`` are created.
        """
        if os.path.exists(self.root_dir):
            shutil.rmtree(self.root_dir)
        os.makedirs(self.root_dir, exist_ok=True)
        os.makedirs(self.charts_dir, exist_ok=True)
        os.makedirs(self.pipelines_dir, exist_ok=True)
# Module-level shared configuration object. GlobalConfig.__new__ returns the
# same instance on every construction, so later GlobalConfig() calls yield
# this very object, already populated with the defaults of GlobalConfig.set().
config = GlobalConfig()