# Source code for auto_prep.raporting.overview

import humanize
import numpy as np
import pandas as pd

from ..utils.logging_config import setup_logger
from ..utils.system import get_system_info
from .raport import Raport

# Module-level logger shared by everything in this module; it is built by the
# project's ``setup_logger`` helper and keyed on this module's name.
logger = setup_logger(__name__)


class OverviewRaport:
    """Gathers overview statistics of a dataset and writes them to a raport.

    Call :meth:`run` first to compute the statistics, then
    :meth:`write_to_raport` to emit the "Overview" section.
    """

    def __init__(self):
        # Mapping of summary label -> count, filled by ``run``.
        self.dataset_summary: dict = {}
        # Rows of (class, count, fraction); only filled for classification.
        # NOTE: the attribute name is misspelled but kept as-is, since other
        # code may already read it under this name.
        self.target_distibution: list = []
        # Rows of (feature, missing count, missing fraction).
        self.missing_values: list = []
        # Rows of (feature, "numerical"/"categorical", dtype, memory usage).
        self.features_details: list = []
        # Whatever ``get_system_info`` returns; passed straight to the raport.
        self.system_info: dict = {}
        # ``describe()`` output per feature; stays None when the dataset has
        # no features of the given kind (or before ``run`` was called).
        self.descr_num: pd.DataFrame = None
        self.descr_cat: pd.DataFrame = None
        # Task name given to ``run`` ("classification" / "regression").
        self.task: str = None

    def run(self, X: pd.DataFrame, y: pd.Series, task: str):
        """Performs dataset overview.

        Args:
            X: Feature table to summarise.
            y: Target values; only inspected when ``task`` is
                ``"classification"``.
            task: Name of the task, stored on the instance and later used by
                :meth:`write_to_raport`.

        Raises:
            Exception: Any error raised while gathering system information
                or statistics is logged and re-raised unchanged.
        """
        logger.start_operation("Overview.")
        self.task = task

        # Drop results of a previous run so that e.g. a regression run that
        # follows a classification run does not report a stale class table.
        self.target_distibution = []
        self.descr_num = None
        self.descr_cat = None

        try:
            self.system_info = get_system_info()
        except Exception as e:
            logger.error(f"Failed to gather system informations: {str(e)}")
            raise

        try:
            numeric_features = X.select_dtypes(include=[np.number]).columns
            categorical_features = X.select_dtypes(exclude=[np.number]).columns

            # Basic statistics
            self.dataset_summary = {
                "Number of samples": len(X),
                "Number of features": len(X.columns),
                "Number of numerical features": len(numeric_features),
                "Number of categorical features": len(categorical_features),
            }

            if task == "classification":
                class_counts = y.value_counts()
                # Derive fractions from the same Series so that counts and
                # fractions are guaranteed to be aligned row by row.
                class_fractions = class_counts / class_counts.sum()
                self.target_distibution = list(
                    zip(
                        class_counts.index,
                        class_counts.values,
                        class_fractions.values,
                    )
                )

            # Compute the null mask only once and reuse it for the fractions.
            missing_counts = X.isnull().sum()
            missing_fractions = missing_counts / len(X)
            self.missing_values = list(
                zip(
                    missing_counts.index,
                    missing_counts.values,
                    missing_fractions.values,
                )
            )

            self.features_details = [
                (
                    feature,
                    "numerical" if feature in numeric_features else "categorical",
                    X[feature].dtype,
                    humanize.naturalsize(X[feature].memory_usage(deep=True)),
                )
                for feature in X.columns
            ]

            logger.info(
                f"Found {len(numeric_features)} numeric and "
                f"{len(categorical_features)} categorical features"
            )

            if len(numeric_features) > 0:
                self.descr_num = X[numeric_features].describe().T.reset_index()

            if len(categorical_features) > 0:
                self.descr_cat = (
                    X[categorical_features]
                    .describe(exclude=["number"])
                    .T.reset_index()
                )
        except Exception as e:
            logger.error(f"Failed to gather overview statistics: {str(e)}")
            raise

        logger.end_operation()

    @staticmethod
    def _description_header(description: pd.DataFrame) -> list:
        """Builds a LaTeX-safe table header for a ``describe()`` frame."""
        # Percent signs (e.g. "25%") start a comment in LaTeX, so escape them.
        header = [str(col).replace("%", "\\%") for col in description.columns]
        if header:
            # The first column holds feature names (it comes from
            # ``reset_index``), so give it a readable title.
            header[0] = "feature"
        return header

    def _write_feature_descriptions(self, raport: "Raport") -> None:
        """Writes the numerical / categorical description tables, if any."""
        has_num = self.descr_num is not None
        has_cat = self.descr_cat is not None

        if not has_num and not has_cat:
            raport.add_text(
                "No numerical or categorical features descriptions are available in the dataset."
            )
            return

        # Introductory sentence; its wording depends on which tables follow.
        if has_num and has_cat:
            raport.add_reference(label="tab:numerical_features", add_space=True)
            raport.add_text("and ")
            raport.add_reference(label="tab:categorical_features", add_space=True)
            raport.add_text(
                "present the description of numerical and categorical features in the dataset."
            )
        elif has_num:
            raport.add_reference(label="tab:numerical_features", add_space=True)
            raport.add_text(
                "presents the description of numerical features in the dataset."
            )
        else:
            raport.add_reference(label="tab:categorical_features", add_space=True)
            raport.add_text(
                "presents the description of categorical features in the dataset."
            )

        if has_num:
            raport.add_table(
                self.descr_num.values.tolist(),
                header=self._description_header(self.descr_num),
                caption="Numerical features description.",
                label="tab:numerical_features",
            )
        if has_cat:
            raport.add_table(
                self.descr_cat.values.tolist(),
                header=self._description_header(self.descr_cat),
                caption="Categorical features description.",
                label="tab:categorical_features",
            )

    def write_to_raport(self, raport: "Raport"):
        """Writes overview section to a raport.

        Args:
            raport: Raport object the section, texts, references and tables
                are added to.

        Returns:
            The same ``raport`` object that was passed in.
        """
        raport.add_section("Overview")

        raport.add_subsection("System")
        raport.add_table(
            self.system_info,
            header=None,
            caption="System overview.",
        )

        raport.add_subsection("Dataset")
        if self.task == "classification":
            # More than two observed classes means a multiclass problem.
            if len(self.target_distibution) > 2:
                raport.add_text(
                    "Task detected for the dataset: multiclass classification. \\newline"
                )
            else:
                raport.add_text(
                    "Task detected for the dataset: binary classification. \\newline"
                )
        elif self.task == "regression":
            raport.add_text("Task detected for the dataset: regression. \\newline")

        raport.add_reference(label="tab:dataset_summary", add_space=True)
        raport.add_text(
            "presents an overview of the dataset including the number of samples, features, and their types."
        )
        raport.add_table(
            self.dataset_summary,
            header=None,
            caption="Dataset Summary.",
            label="tab:dataset_summary",
        )

        # The class distribution is only available for classification tasks.
        if len(self.target_distibution) > 0:
            raport.add_text(
                "Distribution of the target classes in terms of the number of observations and their percentages is presented in "
            )
            raport.add_reference(label="tab:target_distribution", add_space=False)
            raport.add_table(
                self.target_distibution,
                caption="Target class distribution.",
                header=["class", "number of observations", "fraction"],
                label="tab:target_distribution",
            )

        raport.add_reference(label="tab:missing_values", add_space=True)
        raport.add_text("presents the distribution of missing values in the dataset.")
        raport.add_table(
            self.missing_values,
            caption="Missing values distribution.",
            header=["feature", "number of observations", "fraction"],
            label="tab:missing_values",
        )

        raport.add_reference(label="tab:features_dtypes", add_space=True)
        raport.add_text("presents the description of features in the dataset.")
        raport.add_table(
            self.features_details,
            header=["feature", "type", "dtype", "space usage"],
            caption="Features dtypes description.",
            label="tab:features_dtypes",
        )

        self._write_feature_descriptions(raport)
        return raport