Source code for auto_prep.raporting.eda

from typing import Dict, List, Tuple

import pandas as pd

from ..utils.logging_config import setup_logger
from ..visualization.categorical import CategoricalVisualizer
from ..visualization.eda import EdaVisualizer
from ..visualization.numerical import NumericalVisualizer
from .raport import Raport

logger = setup_logger(__name__)



[docs]
class EdaRaport:
    """Gathers exploratory-data-analysis charts and writes them into a raport.

    ``run`` calls the plotting methods of every class listed in
    ``visualizers`` and stores what they return in ``charts_dt``;
    ``write_to_raport`` then emits those charts, with short introductory
    texts, into a ``Raport``.
    """

    # Visualizer classes processed by ``run``, in this order.
    visualizers: list = [
        EdaVisualizer,
        CategoricalVisualizer,
        NumericalVisualizer,
    ]

    def __init__(self):
        """Create an instance with no charts collected yet."""
        # Visualizer class name -> list of (path, caption) pairs for its charts.
        self.charts_dt: Dict[str, List[Tuple[str, str]]] = {}


[docs]
    def run(self, X: pd.DataFrame, y: pd.Series, task: str):
        """Run every registered visualizer and collect the charts it returns.

        For each class in ``EdaRaport.visualizers`` the method names listed in
        its ``order`` attribute are looked up on the class and called with
        ``X`` and ``y``. ``task`` is forwarded as a keyword argument only to
        methods whose signature accepts it. Results are stored in
        ``self.charts_dt`` under the visualizer's class name; any previous
        entry for that visualizer is replaced.

        Args:
            X: Feature data passed to each plotting method.
            y: Target data passed to each plotting method.
            task: Task name (classification or regression) forwarded to the
                plotting methods that take a ``task`` argument.

        Raises:
            Exception: Any error raised while plotting is logged and
                re-raised unchanged.
        """

        logger.start_operation("EDA.")

        try:
            for visualizer_cls in EdaRaport.visualizers:
                visualizer_name = visualizer_cls.__name__
                logger.start_operation(f"{visualizer_name} plot generation.")
                logger.debug(
                    f"Will call plots in the following order: {visualizer_cls.order}"
                )
                collected: List[Tuple[str, str]] = []
                self.charts_dt[visualizer_name] = collected

                for method_name in visualizer_cls.order:
                    method = getattr(visualizer_cls, method_name)
                    chart_dt = self._call_plot_method(method, X, y, task)

                    if isinstance(chart_dt, list):
                        # A list holds several charts; an empty one adds nothing.
                        collected.extend(chart_dt)
                    elif (
                        isinstance(chart_dt, tuple)
                        and chart_dt  # guard: an empty tuple has no [0]
                        and chart_dt[0] != ""
                    ):
                        # A single chart; an empty first element means "no chart".
                        collected.append(chart_dt)

                # NOTE(review): this call is skipped when a plot method raises,
                # so only the outer ``finally`` closes an operation on failure.
                # Confirm that this matches the logger's expectations.
                logger.end_operation()

        except Exception as e:
            logger.error(f"Failed to perform EDA analysis: {str(e)}")
            raise
        finally:
            logger.end_operation()

    @staticmethod
    def _call_plot_method(method, X, y, task):
        """Call ``method(X, y)``, passing ``task=task`` only if it is accepted.

        Inspecting the signature up front avoids using ``except TypeError`` as
        a signature probe: that approach also swallowed ``TypeError`` raised
        inside the plotting method and then ran the method a second time.
        """
        import inspect

        try:
            parameters = inspect.signature(method).parameters
        except (TypeError, ValueError):
            # Signature unavailable for this callable: fall back to the
            # call-and-retry behaviour.
            try:
                return method(X, y, task=task)
            except TypeError:
                return method(X, y)

        task_parameter = parameters.get("task")
        accepts_task = (
            task_parameter is not None
            and task_parameter.kind is not inspect.Parameter.POSITIONAL_ONLY
        ) or any(
            parameter.kind is inspect.Parameter.VAR_KEYWORD
            for parameter in parameters.values()
        )

        if accepts_task:
            return method(X, y, task=task)
        return method(X, y)



[docs]
    def write_to_raport(self, raport: Raport):
        """Emit the collected EDA charts into ``raport`` and return it.

        Adds an "Eda" section with a short description, then, for every
        visualizer stored in ``self.charts_dt``, an optional subsection
        heading followed by its figures. Selected captions are preceded by an
        introductory text, some of them with a figure reference in front.
        """
        # Visualizer name -> (subsection title, emit even when it has no charts).
        subsection_by_visualizer = {
            "EdaVisualizer": ("Target variable and missing values", True),
            "CategoricalVisualizer": ("EDA for categorical features", False),
            "NumericalVisualizer": ("EDA for numerical features", False),
        }
        # Caption -> (put a figure reference before the text, introductory text).
        intro_by_caption = {
            "Target distribution.": (
                True,
                " shows the distribution of the target variable.",
            ),
            "Missing values.": (
                True,
                " shows the distribution of missing values in the dataset.",
            ),
            "Numerical Features Distribution - Page 1": (
                False,
                "The distribution of numerical features is presented on histogram(s) below.",
            ),
            "Categorical Features Distribution - Page 1": (
                False,
                "The distribution of categorical features is presented on barplot(s) below.",
            ),
            "Correlation heatmap.": (
                True,
                " shows the correlation between features.",
            ),
            "Boxplot page 1": (
                False,
                "The boxplot of numerical features is presented on chart(s) below.",
            ),
        }

        raport.add_section("Eda")
        raport.add_text(
            "This part of the report provides basic insides to the data and the informations it holds.."
        )

        for visualizer_name, charts in self.charts_dt.items():
            heading = subsection_by_visualizer.get(visualizer_name)
            if heading is not None:
                title, emit_when_empty = heading
                if emit_when_empty or charts:
                    raport.add_subsection(title)

            for figure_path, figure_caption in charts:
                intro = intro_by_caption.get(figure_caption)
                if intro is not None:
                    with_reference, intro_text = intro
                    if with_reference:
                        raport.add_reference(
                            label=figure_caption, prefix="Figure", add_space=True
                        )
                    raport.add_text(intro_text)
                raport.add_figure(
                    path=figure_path, caption=figure_caption, label=figure_caption
                )

        return raport