Source code for utils.data_preparation.optimize

from typing import List, Optional

import numpy as np
import pandas as pd

from .constants import THRESHOLD


def optimize_floats(df: pd.DataFrame) -> None:
    """
    Optimizes data space usage by downcasting float64 columns in place

    Each selected column is passed through ``pd.to_numeric`` with
    ``downcast="float"``; columns of any other dtype are left untouched.

    :param df: DataFrame holding data, modified in place
    """
    # np.float64 is the type the np.float_ alias referred to; the alias itself
    # no longer exists in NumPy 2.0, so the concrete type is used directly.
    cols = df.select_dtypes(include=np.float64).columns
    for col in cols:
        # Assigning column by column replaces each column together with its
        # dtype and is a plain no-op when no float64 column is present.
        df[col] = pd.to_numeric(df[col], downcast="float")


def optimize_ints(df: pd.DataFrame) -> None:
    """
    Optimizes data space usage by downcasting integer columns in place

    A column whose minimum is non-negative is downcast with
    ``downcast="unsigned"``; every other integer column is downcast with
    ``downcast="signed"``. Non-integer columns are left untouched.

    :param df: DataFrame holding data, modified in place
    """
    for col in df.select_dtypes(include=np.integer).columns:
        # A column without rows has a NaN minimum; NaN >= 0 is False, so such
        # a column takes the "signed" path.
        downcast = "unsigned" if df[col].min() >= 0 else "signed"
        df[col] = pd.to_numeric(df[col], downcast=downcast)


def optimize_objects(
    df: pd.DataFrame, datetime_features: List[str], threshold: int = THRESHOLD
) -> None:
    """
    Optimizes data space usage of object columns in place

    Object columns listed in ``datetime_features`` are converted with
    ``pd.to_datetime``. Any other object column is cast to ``category`` when
    its share of unique entries is less than or equal to ``threshold`` percent.
    Columns whose first entry is a list, and columns without rows, are left
    as they are.

    :param df: DataFrame holding data, modified in place
    :param datetime_features: List of columns that can be cast to datetime, which significantly reduces space usage
    :param threshold: int from 0 to 100
    """
    for col in df.select_dtypes(include=np.object_):
        if col in datetime_features:
            df[col] = pd.to_datetime(df[col])
            continue

        column = df[col]
        num_total_values = len(column)
        if num_total_values == 0:
            # No rows: the unique ratio is undefined and there is nothing to save.
            continue

        # Positional access: the index is not guaranteed to contain label 0
        # (e.g. after filtering rows).
        if isinstance(column.iloc[0], list):
            continue

        num_unique_values = len(column.unique())
        if num_unique_values / num_total_values <= threshold / 100.0:
            df[col] = column.astype("category")


def convert_to_hhmm(df: pd.DataFrame) -> pd.DataFrame:
    """
    Builds a copy of df in which every column holds 4-character hhmm strings

    Missing values are treated as 0, values are cast to int16 and left-padded
    with zeros; strings that compare >= "2400" have 2400 subtracted once.

    :param df: DataFrame whose columns hold numeric hhmm values
    :returns: modified df
    """
    hhmm = df.fillna(0).astype(np.int16).astype(str)

    for name in hhmm.columns:
        padded = hhmm[name].str.zfill(4)
        hhmm[name] = padded

        # String comparison is enough here because the values are zero-padded.
        overflow = padded >= "2400"
        wrapped = padded[overflow].astype(np.int16) - 2400
        hhmm.loc[overflow, name] = wrapped.astype(str).str.zfill(4)

    return hhmm


def optimize(
    df: pd.DataFrame,
    datetime_features: Optional[List[str]] = None,
    flights_data: bool = False,
) -> None:
    """
    Optimizes data space usage

    Runs the integer, float and object optimizations on ``df`` in place. With
    ``flights_data`` set, the hhmm time columns are additionally combined with
    ``Year``/``Month``/``DayofMonth`` into datetime columns, and the source
    columns are dropped.

    :param df: DataFrame holding data, modified in place
    :param datetime_features: List of columns that can be cast to datetime, which significantly reduces space usage; None means no such columns
    :param flights_data: special flag that triggers additional data conversions only for flights data
    """
    if datetime_features is None:
        datetime_features = []

    optimize_ints(df)
    optimize_floats(df)
    optimize_objects(df, datetime_features)

    if not flights_data:
        return

    # Source hhmm column -> name of the datetime column built from it.
    dates = {
        "DepTime": "Departure",
        "CRSDepTime": "CRSDeparture",
        "ArrTime": "Arrival",
        "CRSArrTime": "CRSArrival",
    }
    time_cols = list(dates)

    # The hhmm strings are kept in a separate frame instead of being written
    # back into the numeric columns of df: those columns are dropped below, and
    # assigning strings into numeric columns through .loc is an
    # incompatible-dtype assignment.
    hhmm = convert_to_hhmm(df[time_cols])

    for original_name, new_name in dates.items():
        df[new_name] = pd.to_datetime(
            dict(
                year=df["Year"],
                month=df["Month"],
                day=df["DayofMonth"],
                hour=hhmm[original_name].str[:2],
                minute=hhmm[original_name].str[2:],
            )
        )

    df.drop(
        columns=["Year", "Month", "DayofMonth"] + time_cols,
        inplace=True,
    )


def concatenate(dfs: List[pd.DataFrame], threshold: int = THRESHOLD) -> pd.DataFrame:
    """
    Concatenate while preserving categorical columns.

    For every categorical column of the last frame, the categories of all
    frames are merged. If the merged category count is at most ``threshold``
    percent of the total row count, the column is re-encoded in every input
    frame (the frames in ``dfs`` are modified in place) so that the
    concatenation keeps the categorical dtype.

    :param dfs: list of DataFrames to concatenate; must not be empty
    :param threshold: target column will be left as categorical if unique values are at most threshold % of all values
    :returns: new DataFrame with the rows of all frames and a fresh index
    """
    assert len(dfs) >= 1, "dfs cannot be empty"
    target_size = sum(len(df) for df in dfs)

    for col in dfs[-1].select_dtypes(include="category").columns:
        # if not category then it must have been all empty
        uc = pd.api.types.union_categoricals(
            [df[col] for df in dfs if df[col].dtype == "category"]
        )
        # With zero rows in total the ratio is undefined; the categories are
        # still aligned so that the (empty) result keeps its categorical dtype.
        within_threshold = (
            target_size == 0
            or len(uc.categories) / target_size <= threshold / 100.0
        )
        if within_threshold:
            for df in dfs:
                df[col] = pd.Categorical(df[col].values, categories=uc.categories)
    return pd.concat(dfs, ignore_index=True)