Source code for auto_prep.utils.config

import logging
import os
import shutil
import warnings
from typing import Union

import numpy as np
from pylatex import NoEscape
from sklearn import set_config
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, roc_auc_score

# Import-time process setup. OMP_DISPLAY_ENV is an OpenMP runtime switch;
# forcing it to "FALSE" is presumably meant to keep OpenMP-backed libraries
# from printing their environment report - NOTE(review): confirm intent.
os.environ.update(OMP_DISPLAY_ENV="FALSE")

# Suppress UserWarnings and RuntimeWarnings (same order as before, so the
# resulting warning-filter list is unchanged).
for _ignored_category in (UserWarning, RuntimeWarning):
    warnings.simplefilter("ignore", category=_ignored_category)
del _ignored_category  # do not leak the loop variable into the module namespace


# Ask scikit-learn to use "pandas" as the transform output container.
set_config(transform_output="pandas")

# ANSI color codes, keyed by log level name. "RESET" switches the terminal
# back to its default colors.
COLORS: dict = {
    "DEBUG": "\033[36m",  # Cyan
    "INFO": "\033[32m",  # Green
    "WARNING": "\033[33m",  # Yellow
    "ERROR": "\033[31m",  # Red
    "CRITICAL": "\033[41m",  # Red background
    "RESET": "\033[0m",  # Reset color
}

# Record and timestamp layouts in the %-style syntax of the ``logging`` module.
LOG_FORMAT: str = "%(asctime)s %(levelname)s %(name)s: %(message)s"
LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
# ``logging.CRITICAL`` is an integer level constant, so the annotation is
# ``int`` (it was previously, and incorrectly, ``str``).
LOG_LEVEL: int = logging.CRITICAL

# Default value of the ``tex_geomatry`` setting: page margins and
# header/footer spacing, expressed as LaTeX lengths.
DEFAULT_TEX_GEOMETRY: dict = dict(
    margin="0.5in",
    headheight="10pt",
    footskip="0.2in",
    tmargin="0.5in",
    bmargin="0.5in",
)

# Default value of the ``raport_abstract`` setting: a LaTeX ``abstract``
# environment. It is a raw string (backslashes are kept literally) wrapped in
# pylatex's NoEscape, which marks text that should not be escaped.
DEFAULT_ABSTRACT: str = NoEscape(
    r"""
    \begin{abstract}
    This raport has been generated with AutoPrep.
    \end{abstract}
    """
)

# Default value of the ``chart_settings`` setting (chart appearance options).
DEFAULT_CHARTS_SETTINGS: dict = dict(
    theme="white",
    title_fontsize=18,
    title_fontweight="bold",
    xlabel_fontsize=15,
    ylabel_fontsize=15,
    tick_label_rotation=45,
    palette="pastel",
    plot_width=20,
    plot_height_per_row=8,
    heatmap_cmap="coolwarm",
    heatmap_fmt=".2f",
)

# Default value of the ``correlation_selectors_settings`` setting.
DEFAULT_CORRELATION_SELECTOR_SETTINGS: dict = dict(
    threshold=0.8,
    k=10,
)

# Default value of the ``outlier_detector_settings`` setting.
DEFAULT_OUTLIER_DETECTOR_SETTINGS: dict = dict(
    zscore_threshold=3,
    isol_forest_n_estimators=100,
    cook_threshold=1,
)

# Default value of the ``imputer_settings`` setting.
DEFAULT_IMPUTTER_SETTINGS: dict = dict(
    categorical_strategy="most_frequent",
    numerical_strategy="mean",
    n_iter=10,
)

# Default value of the ``tuning_params`` setting.
DEFAULT_TUNING_PARAMS: dict = dict(
    cv=3,
    verbose=0,
    n_jobs=-1,
    random_state=42,
    n_iter=10,
)



[docs]
class GlobalConfig:
    """Global config class.

    Implemented as a singleton: the first instantiation creates the shared
    instance and fills it with the defaults of :meth:`set`; every later
    ``GlobalConfig(...)`` call returns that same object. Arguments passed to
    the constructor are ignored - use :meth:`set` or :meth:`update` to change
    settings.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            # ``object.__new__`` raises TypeError when it receives extra
            # arguments from an overridden ``__new__``, so they are not
            # forwarded.
            instance = super().__new__(cls)
            # Cache the instance only once it is fully populated, so a failed
            # ``set()`` does not leave a half-initialised singleton behind.
            instance.set()
            cls._instance = instance
        return cls._instance

    def set(
        self,
        raport_name: str = "raport",
        raport_title: str = "ML Raport",
        raport_author: str = "AutoPrep",
        raport_abstract: str = DEFAULT_ABSTRACT,
        root_dir: str = "raport",
        return_tex_: bool = True,
        logger_colors_map: dict = COLORS,
        log_format: str = LOG_FORMAT,
        log_date_format: str = LOG_DATE_FORMAT,
        log_level: Union[int, str] = LOG_LEVEL,
        log_dir: str = None,
        max_log_file_size_in_mb: int = 5,
        tex_geomatry: dict = DEFAULT_TEX_GEOMETRY,
        train_size: float = 0.8,
        test_size: float = 0.1,
        valid_size: float = 0.1,
        random_state: int = 42,
        max_datasets_after_preprocessing: int = 3,
        perform_only_required_: bool = False,
        raport_decimal_precision: int = 4,
        chart_settings: dict = DEFAULT_CHARTS_SETTINGS,
        correlation_selectors_settings: dict = DEFAULT_CORRELATION_SELECTOR_SETTINGS,
        outlier_detector_settings: dict = DEFAULT_OUTLIER_DETECTOR_SETTINGS,
        imputer_settings: dict = DEFAULT_IMPUTTER_SETTINGS,
        umap_components: int = 50,
        correlation_threshold: float = 0.8,
        correlation_percent: float = 0.7,
        n_bins: int = 4,
        outlier_detector_method: str = "zscore",
        max_unique_values_classification: int = 20,
        regression_pipeline_scoring_model: BaseEstimator = RandomForestRegressor(
            n_estimators=100, random_state=42, max_depth=5, n_jobs=-1, warm_start=True
        ),
        classification_pipeline_scoring_model: BaseEstimator = RandomForestClassifier(
            n_estimators=100, random_state=42, max_depth=5, n_jobs=-1, warm_start=True
        ),
        regression_pipeline_scoring_func: Union[callable, str] = (
            mean_squared_error,
            "min",
        ),
        classification_pipeline_scoring_func_bin: Union[callable, str] = (
            roc_auc_score,
            "max",
        ),
        classification_pipeline_scoring_func_multi: Union[callable, str] = (
            accuracy_score,
            "max",
        ),
        max_workers: int = None,
        tuning_params: dict = DEFAULT_TUNING_PARAMS,
        max_models: int = 3,
    ):
        """Validate the given settings and store them on the instance.

        Every call overwrites every setting: arguments that are not passed are
        reset to the defaults listed below.

        Side effects: creates ``log_dir`` (unless it is -1) and seeds NumPy's
        global random generator with ``random_state``.

        Besides the arguments, the following derived attributes are set:
        ``raport_path`` (absolute ``root_dir/raport_name``), ``charts_dir`` and
        ``pipelines_dir`` (sub-directories of ``raport_path``) and
        ``root_project_dir``.

        Args:
            raport_name (str) - Raport name, must be a non-empty string.
                Defaults to "raport".
            raport_title (str) - Raport title. Defaults to "ML Raport".
            raport_author (str) - Raport author. Defaults to "AutoPrep".
            raport_abstract (str) - Raport abstract section. Can be set to "".
                Defaults to :obj:`DEFAULT_ABSTRACT`.
            root_dir (str) - Root directory. Here raport will be
                stored and all cache. Defaults to "raport".
            return_tex_ (bool) - If true it will create .tex file
                alongside the pdf. Defaults to True.
            logger_colors_map (dict) - Color map for the loggers.
                Defaults to :obj:`COLORS`.
            log_format (str) - Log format for logging library.
                Defaults to :obj:`LOG_FORMAT`.
            log_date_format (str) - Log date format for logging library.
                Defaults to :obj:`LOG_DATE_FORMAT`.
            log_level (int | str) - Log level for logging library.
                Defaults to :obj:`LOG_LEVEL`.
            log_dir (str) - Log directory for storing the logs.
                If None provided, will default to "logs" in directory from
                which program was called. -1 means no logging to file (the
                directory is then not created).
            max_log_file_size_in_mb (int) - Maximum file size in mb for
                each logger, integer >= 1. Defaults to 5.
            tex_geomatry (dict) - Geometry for pylatex.
                Defaults to :obj:`DEFAULT_TEX_GEOMETRY`.
            train_size (float) - Size of the training set. Defaults to 0.8.
            test_size (float) - Size of the test set. Defaults to 0.1.
            valid_size (float) - Size of the validation set. Defaults to 0.1.
            random_state (int) - Random state for sklearn; also used to seed
                ``np.random``. Defaults to 42.
            max_datasets_after_preprocessing (int) - Maximum number of datasets
                that will be left after preprocessing steps. On them further
                models will be trained. Strongly affects performance.
                Must be > 0. Defaults to 3.
            perform_only_required_ (bool) - whether or not to perform only
                required steps. Affects entire process. Defaults to False.
            raport_decimal_precision (int) - Decimal precision for all floats
                in raport. Will use standard python rounding. Defaults to 4.
            chart_settings (dict) - Settings for customizing chart appearance.
                Defaults to :obj:`DEFAULT_CHARTS_SETTINGS`.
            correlation_selectors_settings (dict) - Settings for correlation
                selectors. Defaults to
                :obj:`DEFAULT_CORRELATION_SELECTOR_SETTINGS`.
            outlier_detector_settings (dict) - Settings for outlier detectors.
                Defaults to :obj:`DEFAULT_OUTLIER_DETECTOR_SETTINGS`.
            imputer_settings (dict) - Settings for imputers.
                Defaults to :obj:`DEFAULT_IMPUTTER_SETTINGS`.
            umap_components (int) - Number of components for UMAP.
                Defaults to 50.
            correlation_threshold (float) - threshold used for detecting highly
                correlated features, between 0 and 1. Defaults to 0.8.
            correlation_percent (float) - % of selected features based on their
                correlation with the target, between 0 and 1. Defaults to 0.7.
            n_bins (int) - number of bins to create while binning numerical
                features, integer >= 1. Defaults to 4.
            outlier_detector_method (str) - method used for outlier detection,
                one of "zscore", "iqr", "isolation_forest".
                Defaults to "zscore".
            max_unique_values_classification (int) - in case of target column
                being of non numerical dtype, it will calculate number of
                unique values (in task "auto"). If this number will be lower
                than that value, it'll perform classification. Must be >= 0.
                Defaults to 20.
            regression_pipeline_scoring_model (BaseEstimator) - model used for
                scoring processing pipelines in regression task.
            classification_pipeline_scoring_model (BaseEstimator) - model used
                for scoring processing pipelines in classification task.
            regression_pipeline_scoring_func (tuple) - pair (metric, direction)
                for scoring :obj:`regression_pipeline_scoring_model` output.
                Available directions are ['max', 'min'].
            classification_pipeline_scoring_func_bin (tuple) - pair
                (metric, direction) for scoring
                :obj:`classification_pipeline_scoring_model` output.
                Available directions are ['max', 'min']. Stored as
                ``self.classification_pipeline_scoring_func``.
            classification_pipeline_scoring_func_multi (tuple) - pair
                (metric, direction) for scoring
                :obj:`classification_pipeline_scoring_model` output.
                Available directions are ['max', 'min'].
            max_workers (int) - maximum number of cores to evaluate on.
            tuning_params (dict) - Tuning params for RandomizedSearchCV.
                Defaults to :obj:`DEFAULT_TUNING_PARAMS`.
            max_models (int) - Maximum number of final models to save and
                raport. Must be > 0. Defaults to 3.

        Raises:
            AssertionError - if one of the validated arguments is rejected.
        """
        # --- raport identity and output locations ---
        assert (
            isinstance(raport_name, str) and raport_name != ""
        ), "raport_name should not be empty"
        self.raport_name = raport_name
        self.raport_title = raport_title
        self.raport_author = raport_author
        self.raport_abstract = raport_abstract

        self.root_dir = root_dir
        self.raport_path = os.path.abspath(os.path.join(root_dir, raport_name))
        self.charts_dir = os.path.join(self.raport_path, "charts")
        self.pipelines_dir = os.path.join(self.raport_path, "pipelines")

        self.return_tex_ = return_tex_

        # --- logging ---
        self.logger_colors_map = logger_colors_map
        self.log_format = log_format
        self.log_date_format = log_date_format
        self.log_level = log_level

        # The whole message is parenthesised: a string literal placed on the
        # line after an ``assert`` is a separate no-op statement and would be
        # dropped from the message.
        assert (
            int(max_log_file_size_in_mb) == max_log_file_size_in_mb
            and max_log_file_size_in_mb >= 1
        ), (
            f"Wrong value for max_log_file_size_in_mb: {max_log_file_size_in_mb}. "
            "Should be int >= 1."
        )
        self.max_log_file_size_in_mb = max_log_file_size_in_mb

        if log_dir is None:
            log_dir = os.path.abspath("logs")
        # -1 is the "do not log to a file" marker, so no directory is created.
        if log_dir != -1:
            os.makedirs(log_dir, exist_ok=True)
        self.log_dir = log_dir

        self.tex_geomatry = tex_geomatry

        # --- data split and reproducibility ---
        self.train_size = train_size
        self.test_size = test_size
        self.valid_size = valid_size

        self.random_state = random_state
        np.random.seed(random_state)

        self.chart_settings = chart_settings

        # --- preprocessing ---
        assert (
            max_datasets_after_preprocessing > 0
        ), "Values smaller than 1 are forbidden."
        self.max_datasets_after_preprocessing = max_datasets_after_preprocessing
        self.perform_only_required_ = perform_only_required_

        self.raport_decimal_precision = raport_decimal_precision

        # Resolves to the directory two levels above the one holding this file.
        self.root_project_dir = os.path.abspath(
            os.path.join(__file__, "..", "..", "..")
        )

        assert 0 <= correlation_threshold <= 1, (
            f"Invalid value for correlation_threshold: {correlation_threshold}. "
            "It must be a float between 0 and 1."
        )
        self.correlation_threshold = correlation_threshold

        assert 0 <= correlation_percent <= 1, (
            f"Invalid value for correlation_selector_percent: {correlation_percent}. "
            "It must be a float between 0 and 1."
        )
        self.correlation_percent = correlation_percent

        assert int(n_bins) == n_bins and n_bins >= 1, (
            f"Wrong value for n_bins: {n_bins}. " "Should be int >= 1."
        )
        self.n_bins = n_bins

        self.correlation_selectors_settings = correlation_selectors_settings
        self.outlier_detector_settings = outlier_detector_settings
        self.imputer_settings = imputer_settings

        self.umap_components = umap_components

        # --- pipeline scoring ---
        assert (
            max_unique_values_classification >= 0
        ), "max_unique_values_classification should be positive integer."
        self.max_unique_values_classification = max_unique_values_classification
        self.regression_pipeline_scoring_model = regression_pipeline_scoring_model
        self.classification_pipeline_scoring_model = (
            classification_pipeline_scoring_model
        )

        # Each scoring setting is a (metric, direction) pair. The setting name
        # is carried along for the error message because the pair itself is a
        # tuple and has no ``__name__``.
        scoring_settings = {
            "regression_pipeline_scoring_func": regression_pipeline_scoring_func,
            "classification_pipeline_scoring_func_bin": (
                classification_pipeline_scoring_func_bin
            ),
            "classification_pipeline_scoring_func_multi": (
                classification_pipeline_scoring_func_multi
            ),
        }
        for setting_name, func_ in scoring_settings.items():
            assert func_[1] in ("min", "max"), (
                f"Unknown direction chosen for {setting_name}: {func_[1]!r}. "
                "Should be 'min' or 'max'."
            )
        self.regression_pipeline_scoring_func = regression_pipeline_scoring_func
        # NOTE: the binary-classification pair is stored without the "_bin"
        # suffix.
        self.classification_pipeline_scoring_func = (
            classification_pipeline_scoring_func_bin
        )
        self.classification_pipeline_scoring_func_multi = (
            classification_pipeline_scoring_func_multi
        )

        # NOTE(review): the previous (never displayed) message also listed
        # 'cooks_distance', which this check has never accepted - confirm
        # whether that method should be supported.
        allowed_outlier_methods = ("zscore", "iqr", "isolation_forest")
        assert outlier_detector_method in allowed_outlier_methods, (
            f"Invalid value for outlier_detector_method: {outlier_detector_method}. "
            f"Should be one of {list(allowed_outlier_methods)}."
        )
        self.outlier_detector_method = outlier_detector_method

        # --- model training ---
        self.max_workers = max_workers
        self.tuning_params = tuning_params

        assert max_models > 0, "Invalid value"
        self.max_models = max_models

    def update(self, **kwargs):
        """Updates config's data with kwargs.

        Each keyword is assigned directly as an attribute. No validation is
        performed and derived attributes (``raport_path``, ``charts_dir``,
        ``pipelines_dir``) are not recomputed; call :meth:`set` when that is
        needed.
        """
        for key, value in kwargs.items():
            setattr(self, key, value)

    def prepare_dir(self):
        """Clears and creates all necessary directories.

        Removes the whole ``root_dir`` tree if it exists, then recreates
        ``root_dir``, ``charts_dir`` and ``pipelines_dir``.
        """
        if os.path.exists(self.root_dir):
            shutil.rmtree(self.root_dir)
        os.makedirs(self.root_dir, exist_ok=True)
        os.makedirs(self.charts_dir, exist_ok=True)
        os.makedirs(self.pipelines_dir, exist_ok=True)




# Shared module-level configuration object, populated with the defaults of
# GlobalConfig.set(). GlobalConfig.__new__ caches the instance, so later
# GlobalConfig() calls return this same object.
config = GlobalConfig()