Source code for utils.charts.generate_charts

import os
import warnings
import inspect
import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.ndimage.filters import uniform_filter1d

from ..data_preparation.load_data import load_flights, load_airports
from .helpers import save_fig, finish
from .constants import REQUIRE, MONTHS, WEEK_DAYS, PLOTS_DIR


plt.set_loglevel("WARNING")
sns.set_style("whitegrid")
warnings.simplefilter(action="ignore")
np.random.seed(42)


[docs]def generate_charts(years: str | list = "all", dir: str = None): """ Function that wraps all eda_Pawel code and generates its charts for given year. Charts are saved to dir. :param years: choice of years that will be passed to utils.load_flights() :param dir: directory to save charts. If None, the chart will be saved to "plots/{{year}}" """ if dir is None: if isinstance(years, str): dir = os.path.join(PLOTS_DIR, years) else: dir = os.path.join(PLOTS_DIR, "_".join(years)) os.makedirs(dir, exist_ok=True) flights = load_flights(years, cols=REQUIRE) for item in list(globals().keys()): if item.startswith("chart_"): globals()[item](flights, dir)
[docs]def chart_1(flights: pd.DataFrame, dir: str): """ "Total Planned Flight Time for each Carrier" chart""" title = "Total Planned Flight Time for each Carrier" dt = flights.groupby("UniqueCarrier")["CRSElapsedTime"].sum() dt = np.c_[dt.index, dt / (60 * 1000)] dt = pd.DataFrame( dt, columns=["UniqueCarrier", "Total CRSElapsedTime [hours * 10^3]"] ) dt = dt.sort_values( by="Total CRSElapsedTime [hours * 10^3]", axis=0, ascending=False ).reset_index() if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan w = 6 * np.ceil(len(dt["UniqueCarrier"].unique()) / 20) plt.figure(figsize=(w, 6)) ax = sns.barplot( dt, x="UniqueCarrier", y="Total CRSElapsedTime [hours * 10^3]", color="lightblue", ) for i in range(len(dt)): ax.text( i, 10, round(dt.loc[i, "Total CRSElapsedTime [hours * 10^3]"], 1), color="#0f540f", ha="center", rotation=90, size=8, ) finish(ax, title, plot=False, dir=dir)
[docs]def chart_2(flights: pd.DataFrame, dir: str): """ "Max Departure and Arrival Delay for each Carrier" chart""" title = "Max Departure and Arrival Delay for each Carrier" dt = flights.groupby("UniqueCarrier")[["DepDelay", "ArrDelay"]].max() dt_DepDelay = np.c_[ dt.index, dt["DepDelay"] / 60, np.full(dt.index.shape, "DepDelay") ] dt_ArrDelay = np.c_[ dt.index, dt["ArrDelay"] / 60, np.full(dt.index.shape, "ArrDelay") ] dt = np.r_[dt_ArrDelay, dt_DepDelay] dt = pd.DataFrame(dt, columns=["UniqueCarrier", "Delay [hours]", "Type"]) if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan w = 6 * np.ceil(len(dt["UniqueCarrier"].unique()) / 20) plt.figure(figsize=(w, 6)) ax = sns.barplot( dt, x="UniqueCarrier", y="Delay [hours]", hue="Type", palette=sns.color_palette("ch:s=.25,rot=-.25", 2), ) finish(ax, title, plot=False, dir=dir)
[docs]def chart_3(flights: pd.DataFrame, dir: str): """ "Number of Aircrafts in fleet of each Carrier" chart""" title = "Number of Aircrafts in fleet of each Carrier" dt1 = flights.groupby(["UniqueCarrier"])["TailNum"].unique() dt2 = [len(dt1[i][~pd.isna(dt1[i])]) for i in range(len(dt1))] dt = np.c_[dt1.index, dt2] dt = pd.DataFrame(dt, columns=["UniqueCarrier", "Known Airplanes Count"]) dt = dt.sort_values( by="Known Airplanes Count", axis=0, ascending=False ).reset_index() if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan w = 6 * np.ceil(len(dt["UniqueCarrier"].unique()) / 20) plt.figure(figsize=(w, 6)) ax = sns.barplot( dt, x="UniqueCarrier", y="Known Airplanes Count", color="lightblue" ) for i in range(len(dt)): ax.text( i, 15, round(dt.loc[i, "Known Airplanes Count"], 1), color="#0f540f", ha="center", rotation=90, size=8, ) finish(ax, title, plot=False, dir=dir)
[docs]def chart_4(flights: pd.DataFrame, dir: str): """ "Cancelation Rate for each Carrier" chart""" title = "Cancelation Rate for each Carrier" dt1 = ( flights[~(flights["Cancelled"] == 0)] .groupby(["UniqueCarrier"])["Cancelled"] .count() ) dt2 = ( flights[~(flights["Cancelled"].isna())] .groupby(["UniqueCarrier"])["Cancelled"] .count() ) dt = np.c_[dt1.index, dt1, dt2, dt1 / dt2 * 100] dt = pd.DataFrame( dt, columns=[ "UniqueCarrier", "Cancelled flights", "All flights", "Cancelation Rate [%]", ], ) dt = dt.sort_values( by="Cancelation Rate [%]", axis=0, ascending=False ).reset_index() if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan w = 6 * np.ceil(len(dt["UniqueCarrier"].unique()) / 20) plt.figure(figsize=(w, 6)) ax = sns.barplot(dt, x="UniqueCarrier", y="Cancelation Rate [%]", color="lightblue") for i in range(len(dt)): ax.text( i, 0.1, round(dt.loc[i, "Cancelation Rate [%]"], 1), color="#0f540f", ha="center", rotation=90, size=8, ) finish(ax, title, plot=False, dir=dir)
[docs]def chart_5(flights: pd.DataFrame, dir: str): """ "Cancelation Causes" chart""" title = "Cancelation Causes" dt = ( flights[~(flights["Cancelled"] == 0)] .groupby(["CancellationCode"])["CancellationCode"] .count() ) dt.name = "Number" dt = pd.DataFrame(dt).reset_index() if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan plt.figure(figsize=(6, 6)) ax = sns.barplot(dt, x="CancellationCode", y="Number", color="lightblue") for i in range(len(dt)): ax.text( i, 200, round(dt.loc[i, "Number"], 1), color="#0f540f", ha="center", size=10 ) plt.xticks([0, 1, 2, 3], ["przewoźnik", "pogoda", "NAS", "bezpieczeńtwo"]) finish(ax, title, plot=False, dir=dir)
[docs]def chart_6(flights: pd.DataFrame, dir: str): """ "Planned Flights over Time" chart (x2)""" title = "Planned Flights over Time" bins = [0, 1, 2, 3, 4, 5, 6, np.inf] dt = flights.groupby([flights["Arrival"].dt.date, "DayOfWeek"])["DayOfWeek"].count() dt.name = "Number of flights" dt = pd.DataFrame(dt).reset_index() dt["DayOfWeek"] = pd.cut(dt["DayOfWeek"], bins, labels=WEEK_DAYS) if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan w = int(np.ceil(len(dt["Arrival"].unique()) / 300)) dt["Arrival"] = dt["Arrival"].astype(str) xmin, xmax = dt["Arrival"].min(), dt["Arrival"].max() dt["Smoothed"] = uniform_filter1d(dt["Number of flights"], w * 10) for nn in range(2): plt.figure(figsize=(20, 6)) if nn == 0: sns.lineplot( dt, x="Arrival", y="Number of flights", color="red", alpha=0.05 ) sns.lineplot( dt, x="Arrival", y="Smoothed", color="red", alpha=0.5, label="Average number of flights", ) ax = sns.pointplot( dt, x="Arrival", y="Number of flights", hue="DayOfWeek", scale=0.3, palette=sns.color_palette("husl", 7), ) else: ax = sns.lineplot( dt, x="Arrival", y="Number of flights", hue="DayOfWeek", palette=sns.color_palette("husl", 7), ) lgd = plt.legend( loc="upper left", bbox_to_anchor=(1, 1), title="Day of the week", title_fontsize="x-large", ) for handle in lgd.legendHandles: handle._sizes = [50] i = 5 * w - 1 for label in ax.xaxis.get_ticklabels(): i += 1 if i == 5 * w: i = 0 continue label.set_visible(False) plt.xticks(rotation=45) ax.xaxis.grid(False) sns.despine() plt.xlim([xmin, xmax]) ax.xaxis.label.set_size(20) ax.yaxis.label.set_size(20) plt.title(title, size=30) save_fig(f"{title}_{nn}", dir, dpi=max(min(w * 25, 400), 200))
[docs]def chart_7(flights: pd.DataFrame, dir: str): """ "Most popular routes" chart""" title = "Most popular routes" dt = ( flights.groupby(["Origin", "Dest", "Cancelled"])["TailNum"] .count() .reset_index() ) # treat flights from ABE to ATL and from ATL to ABE as same route dt1 = dt[dt["Origin"].astype("U3") < dt["Dest"].astype("U3")] dt2 = dt[dt["Origin"].astype("U3") > dt["Dest"].astype("U3")] dt2 = dt2.rename( columns={"Origin": "Dest", "Dest": "Origin", "TailNum": "Number of flights"} ) dt = pd.concat([dt1, dt2]) dt["Route"] = dt["Origin"].astype("U3") + " - " + dt["Dest"].astype("U3") dt1 = dt.groupby(["Route", "Cancelled"])["Number of flights"].sum().reset_index() dt2 = ( dt1.groupby(["Route"])["Number of flights"] .sum() .sort_values(ascending=False) .reset_index()[:15] ) keep = dt1["Route"].isin(dt2["Route"]) dt = dt1[keep].reset_index() if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan plt.figure(figsize=(8, 4)) ax = sns.barplot( dt, x="Route", y="Number of flights", hue="Cancelled", palette=["lightgreen", "#fa6666"], ) plt.legend( loc="upper left", bbox_to_anchor=(1, 1), title_fontsize="x-large", title="Cancelled", ) plt.xticks(rotation=45) finish(ax, title, plot=False, dir=dir)
[docs]def chart_8(flights: pd.DataFrame, dir: str): """ "Total Delay Time for Each Month" and "Delay Coefficient for Each Month" charts""" title = "Total Delay Time for Each Month" dt3 = flights.groupby(flights["Departure"].dt.month)["DepDelay"].count() / 1000 dt3.name = "All flights [x1000]" dt3 = dt3.reset_index() dt1 = flights.groupby(flights["Departure"].dt.month)["ArrDelay"].sum() dt1.name = "Delay [hr * 10^3]" dt1 = dt1.reset_index() dt1["type"] = "Arrival" dt1 = pd.merge(dt1, dt3) dt2 = flights.groupby(flights["Departure"].dt.month)["DepDelay"].sum() dt2.name = "Delay [hr * 10^3]" dt2 = dt2.reset_index() dt2["type"] = "Departure" dt2 = pd.merge(dt2, dt3) dt = pd.merge(dt1, dt2, how="outer") dt["Delay [hr * 10^3]"] = dt["Delay [hr * 10^3]"] / 60000 dt.rename(columns={"Departure": "Month"}, errors="raise", inplace=True) if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan fig = plt.figure() ax1 = fig.add_subplot(111) sns.barplot( dt, x="Month", y="Delay [hr * 10^3]", hue="type", palette=sns.color_palette("ch:s=.25,rot=-.25", 2), ) lines1, labels1 = ax1.get_legend_handles_labels() ax1.legend_.remove() ax2 = ax1.twinx() sns.pointplot( dt, x="Month", y="All flights [x1000]", color="red", linestyles="--", label="Number of flights", ) ax2.yaxis.label.set_size(13) ax2.set_yticklabels(ax2.get_yticks().astype(np.int64), size=9) ax2.grid(False) lines2, labels2 = ax2.get_legend_handles_labels() ax1.legend( lines1 + lines2, labels1 + labels2, loc="upper left", bbox_to_anchor=(1.1, 1), title_fontsize="x-large", title="Delay Type", ) pick = [int(text.get_text()) - 1 for text in ax2.get_xticklabels()] plt.xticks(list(range(len(pick))), MONTHS[pick]) plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45) # rotate xticks finish(ax1, title, plot=False, dir=dir) """ "Delay Coefficient for Each Month" chart""" title = "Delay Coefficient for Each Month" dt["Coefficient"] = dt["Delay [hr * 10^3]"] / dt["All flights [x1000]"] if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan fig = plt.figure() ax = sns.barplot( dt, x="Month", y="Coefficient", hue="type", palette=sns.color_palette("ch:s=.25,rot=-.25", 2), ) pick = [int(text.get_text()) - 1 for text in ax.get_xticklabels()] plt.xticks(list(range(len(pick))), MONTHS[pick], rotation=45) ax.legend( loc="upper left", bbox_to_anchor=(1, 1), title_fontsize="x-large", title="Delay Type", ) sns.despine() ax.xaxis.label.set_size(13) ax.yaxis.label.set_size(13) plt.title(title, size=20) # so the legend is put inside the box save_fig(title, dir, bbox_extra_artists=(ax.get_legend(),), bbox_inches="tight")
[docs]def chart_9(flights: pd.DataFrame, dir: str): """ "Number of Departures over hours" chart""" title = "Number of flights over hours" dts = [] for name in ["Departure", "Arrival"]: dt = ( flights[~pd.isna(flights[name])][name].dt.hour.value_counts().sort_index() / 1000 ).to_frame(name=name + "s") dt.index.name = "Hour" dt.reset_index(inplace=True) dts.append(dt) dt = pd.merge(*dts) dt.index = dt["Hour"] del dt["Hour"] if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan ax = dt.plot( kind="bar", stacked=True, color=sns.color_palette("ch:s=.25,rot=-.25", 2) ) plt.legend( loc="upper left", bbox_to_anchor=(1, 1), title_fontsize="x-large", title="", ) plt.xticks(rotation=45) plt.ylabel("Number of flights [x1000]") ax.xaxis.grid(False) finish(ax, title, plot=False, dir=dir)
[docs]def chart_10(flights: pd.DataFrame, dir: str): """ "Flights Count for each Carrier" chart""" title = "Flights Count for each Carrier chart" dt = flights["UniqueCarrier"].value_counts() dt = np.c_[dt.index, dt / (60 * 1000)] dt = pd.DataFrame(dt, columns=["UniqueCarrier", "Number of flights [* 10^3]"]) dt = dt.sort_values( by="Number of flights [* 10^3]", axis=0, ascending=False ).reset_index() if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan w = 6 * np.ceil(len(dt["UniqueCarrier"].unique()) / 20) plt.figure(figsize=(w, 6)) ax = sns.barplot( dt, x="UniqueCarrier", y="Number of flights [* 10^3]", color="lightblue", ) for i in range(len(dt)): ax.text( i, 0.5, round(dt.loc[i, "Number of flights [* 10^3]"], 1), color="#0f540f", ha="center", rotation=90, size=8, ) finish(ax, title, plot=False, dir=dir)
[docs]def chart_11(flights: pd.DataFrame, dir: str): """ "10 most popular Airports" and ""Airports and their popularity" charts""" title = "10 most popular Airports" dt1 = flights["Dest"].value_counts() dt2 = flights["Origin"].value_counts() dt1 = pd.DataFrame( np.c_[dt1.index, dt1 / 1000], columns=["Airport", "Number of flights [* 10^3]"] ) dt2 = pd.DataFrame( np.c_[dt2.index, dt2 / 1000], columns=["Airport", "Number of flights [* 10^3]"] ) dt1["Type"] = "Arrivals" dt2["Type"] = "Departures" dt = pd.merge(dt1, dt2, how="outer") dt.fillna(0.0, inplace=True) combined = dt.groupby(["Airport"])["Number of flights [* 10^3]"].sum().reset_index() combined.rename(columns={"Number of flights [* 10^3]": "Combined"}, inplace=True) dt = pd.merge(dt, combined).sort_values("Combined", ascending=False) dt_copy = dt.copy() if dt.shape[0] > 20: dt = dt.iloc[:20, :] if dt.empty: warnings.warn(f"Empty final data set: {inspect.currentframe().f_code.co_name}") return # all values were nan plt.figure(figsize=(10, 6)) ax = sns.barplot( dt, x="Airport", y="Number of flights [* 10^3]", hue="Type", color="lightblue", palette=sns.color_palette("ch:s=.25,rot=-.25", 2), ) ax.legend( loc="upper left", bbox_to_anchor=(1, 1), title_fontsize="x-large", title="Connection Type", ) sns.despine() ax.xaxis.label.set_size(13) ax.yaxis.label.set_size(13) plt.title(title, size=20) save_fig(title, dir, bbox_extra_artists=(ax.get_legend(),), bbox_inches="tight") """ "Airports and their popularity" chart """ title = "Airports and their popularity" airports = load_airports() dt = pd.merge(dt_copy, airports, left_on="Airport", right_on="iata") dt.drop_duplicates(subset=["Airport"], inplace=True) fig, ax = plt.subplots(figsize=(15, 15)) world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")) usa = world[world["name"] == "United States of America"] usa.plot(ax=ax, color="white", edgecolor="black", alpha=0.5) points = gpd.GeoDataFrame(dt, geometry=gpd.points_from_xy(dt["long"], dt["lat"])) w = int(np.ceil(points["Combined"].max() / 300)) points.plot( ax=ax, markersize=points["Combined"].astype(np.float32) / w, color="red", alpha=0.5, edgecolor="black", linewidth=0.5, ) ax.set_xlabel("longitude") ax.set_ylabel("latitude") ax.xaxis.label.set_size(20) ax.yaxis.label.set_size(20) ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) ax.grid(False) # set only mainland for visibility ax.set_xlim([-130, -65]) ax.set_ylim([24, 50]) i = 0 for x, y, label in zip(points["long"], points["lat"], points["city"]): if label == "Newark": # it overlaps with more important NewYork ax.annotate( label, xy=(x, y), xytext=(0, 70), textcoords="offset points", arrowprops=dict( facecolor="black", shrink=0.05, width=1, headwidth=4, edgecolor="black", ), ) else: ax.annotate( label, xy=(x, y), xytext=(4, -4), textcoords="offset points", ) i += 1 if i == 20: break finish(ax, title, plot=False, dir=dir)
[docs]def main(): generate_charts(["1989", "2007"]) generate_charts(["2000", "2001", "2002"]) generate_charts(["1990", "1995", "2000", "2005"])
if __name__ == "__main__": main()