Source code for utils.data_preparation.optimize

import pandas as pd
import numpy as np
from typing import List

from .constants import THRESHOLD


[docs]def optimize_floats(df: pd.DataFrame) -> None: """ Optimizes data space usage by casting float columns to smallest possible size :param df: DataFrame holding data """ cols = df.select_dtypes(include=np.float_).columns.tolist() df[cols] = df[cols].apply(pd.to_numeric, downcast="float")
[docs]def optimize_ints(df: pd.DataFrame) -> None: """ Optimizes data space usage by casting integer columns to smallest possible size :param df: DataFrame holding data """ cols = df.select_dtypes(include=np.integer).columns min_vals = df[cols].min(axis=0) unsigned_idxs = np.where(min_vals >= 0)[0] signed_idxs = [i for i in range(len(cols)) if i not in unsigned_idxs] df[cols[unsigned_idxs]] = df[cols[unsigned_idxs]].apply( pd.to_numeric, downcast="unsigned" ) df[cols[signed_idxs]] = df[cols[signed_idxs]].apply( pd.to_numeric, downcast="signed" )
[docs]def optimize_objects( df: pd.DataFrame, datetime_features: List[str], threshold: int = THRESHOLD ) -> None: """ Optimizes data space usage by casting object columns to pd.category if less or equal to threshold % of entries are unique, and datetime_features to pd.datetime :param df: DataFrame holding data :param datetime_features: List of columns that can be casted to datetime, which significantly reduces space usage :param threshold: int from 0 to 100 """ for col in df.select_dtypes(include=np.object_): if col not in datetime_features: if not (type(df[col][0]) == list): num_unique_values = len(df[col].unique()) num_total_values = len(df[col]) if num_unique_values / num_total_values <= threshold / 100.0: df[col] = df[col].astype("category") else: df[col] = pd.to_datetime(df[col])
[docs]def convert_to_hhmm(df: pd.DataFrame) -> pd.DataFrame: """ Converts every column of df into a hhmm string format :returns: modified df """ df = df.fillna(0).astype(np.int16).astype(str) for col in df.columns: df[col] = df[col].str.zfill(4) bad_idxs = df[col] >= "2400" df.loc[bad_idxs, col] = ( (df.loc[bad_idxs, col].astype(np.int16) - 2400).astype(str).str.zfill(4) ) return df
[docs]def optimize( df: pd.DataFrame, datetime_features: List[str] = [], flights_data: bool = False ) -> None: """ Optimizes data space usage :param df: DataFrame holding data :param datetime_features: List of columns that can be casted to datetime, which significantly reduces space usage :param flights_data: special flag that triggers additional data conversions only for flights data """ optimize_ints(df) optimize_floats(df) optimize_objects(df, datetime_features) if flights_data: dates = { "DepTime": "Departure", "CRSDepTime": "CRSDeparture", "ArrTime": "Arrival", "CRSArrTime": "CRSArrival", } df.loc[:, list(dates.keys())] = convert_to_hhmm(df.loc[:, list(dates.keys())]) for original_name, new_name in dates.items(): df[new_name] = pd.to_datetime( dict( year=df["Year"], month=df["Month"], day=df["DayofMonth"], hour=df[original_name].str[:2], minute=df[original_name].str[2:], ) ) df.drop( columns=["Year", "Month", "DayofMonth"] + list(dates.keys()), inplace=True, )
[docs]def concatenate(dfs: List[pd.DataFrame], threshold: int = THRESHOLD) -> pd.DataFrame: """ Concatenate while preserving categorical columns. :param dfs: list of DataFrames to concatenate :param threshold: target column will be left as categorical if unique values are less threshold % of all values """ assert len(dfs) >= 1, "dfs cannot be empty" target_size = sum([df.shape[0] for df in dfs]) for col in dfs[-1].select_dtypes(include="category").columns: # if not category than it must have been all empty uc = pd.api.types.union_categoricals( [df[col] for df in dfs if df[col].dtype == "category"] ) if len(uc.categories) / target_size <= threshold / 100.0: for df in dfs: df[col] = pd.Categorical(df[col].values, categories=uc.categories) return pd.concat(dfs, ignore_index=True)