import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import seaborn.objects as so
from tqdm import tqdm
import statsmodels.api as sm
import statsmodels.formula.api as smf


data_2022 = pd.read_parquet("Combined_Flights_2022.parquet")


data_2022.head()


data_2022.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4078318 entries, 0 to 590541
Data columns (total 61 columns):
 #   Column                                   Dtype         
---  ------                                   -----         
 0   FlightDate                               datetime64[ns]
 1   Airline                                  object        
 2   Origin                                   object        
 3   Dest                                     object        
 4   Cancelled                                bool          
 5   Diverted                                 bool          
 6   CRSDepTime                               int64         
 7   DepTime                                  float64       
 8   DepDelayMinutes                          float64       
 9   DepDelay                                 float64       
 10  ArrTime                                  float64       
 11  ArrDelayMinutes                          float64       
 12  AirTime                                  float64       
 13  CRSElapsedTime                           float64       
 14  ActualElapsedTime                        float64       
 15  Distance                                 float64       
 16  Year                                     int64         
 17  Quarter                                  int64         
 18  Month                                    int64         
 19  DayofMonth                               int64         
 20  DayOfWeek                                int64         
 21  Marketing_Airline_Network                object        
 22  Operated_or_Branded_Code_Share_Partners  object        
 23  DOT_ID_Marketing_Airline                 int64         
 24  IATA_Code_Marketing_Airline              object        
 25  Flight_Number_Marketing_Airline          int64         
 26  Operating_Airline                        object        
 27  DOT_ID_Operating_Airline                 int64         
 28  IATA_Code_Operating_Airline              object        
 29  Tail_Number                              object        
 30  Flight_Number_Operating_Airline          int64         
 31  OriginAirportID                          int64         
 32  OriginAirportSeqID                       int64         
 33  OriginCityMarketID                       int64         
 34  OriginCityName                           object        
 35  OriginState                              object        
 36  OriginStateFips                          int64         
 37  OriginStateName                          object        
 38  OriginWac                                int64         
 39  DestAirportID                            int64         
 40  DestAirportSeqID                         int64         
 41  DestCityMarketID                         int64         
 42  DestCityName                             object        
 43  DestState                                object        
 44  DestStateFips                            int64         
 45  DestStateName                            object        
 46  DestWac                                  int64         
 47  DepDel15                                 float64       
 48  DepartureDelayGroups                     float64       
 49  DepTimeBlk                               object        
 50  TaxiOut                                  float64       
 51  WheelsOff                                float64       
 52  WheelsOn                                 float64       
 53  TaxiIn                                   float64       
 54  CRSArrTime                               int64         
 55  ArrDelay                                 float64       
 56  ArrDel15                                 float64       
 57  ArrivalDelayGroups                       float64       
 58  ArrTimeBlk                               object        
 59  DistanceGroup                            int64         
 60  DivAirportLandings                       int64         
dtypes: bool(2), datetime64[ns](1), float64(18), int64(23), object(17)
memory usage: 1.8+ GB


column_subset = [
    "FlightDate",
    "Airline",
    "Flight_Number_Marketing_Airline",
    "Origin",
    "Dest",
    "Cancelled",
    "Diverted",
    "CRSDepTime",
    "DepTime",
    "DepDelayMinutes",
    "OriginAirportID",
    "OriginCityName",
    "OriginStateName",
    "DestAirportID",
    "DestCityName",
    "DestStateName",
    "TaxiOut",
    "TaxiIn",
    "CRSArrTime",
    "ArrTime",
    "ArrDelayMinutes",
    "Distance",
    "AirTime"
]
for i in tqdm(range(1)):
    df_2018 = pd.read_parquet("Combined_Flights_2018.parquet", 'pyarrow', columns=column_subset)
    df_2019 = pd.read_parquet("Combined_Flights_2019.parquet", 'pyarrow', columns=column_subset)
    df_2020 = pd.read_parquet("Combined_Flights_2020.parquet", 'pyarrow', columns=column_subset)
    df_2021 = pd.read_parquet("Combined_Flights_2021.parquet", 'pyarrow', columns=column_subset)
    df_2022 = pd.read_parquet("Combined_Flights_2022.parquet", 'pyarrow', columns=column_subset)

    dfs = [df_2018, df_2019, df_2020, df_2021, df_2022]
    df = pd.concat(dfs).reset_index(drop=True)
print("Done Reading!")
cat_cols = ["Airline", "Origin", "Dest", "OriginStateName", "DestStateName"]
for c in cat_cols:
    df[c] = df[c].astype("category")

100%|█████████████████████████████████████████████████████| 1/1 [00:29<00:00, 29.77s/it]

Done Reading!


df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29193782 entries, 0 to 29193781
Data columns (total 23 columns):
 #   Column                           Dtype         
---  ------                           -----         
 0   FlightDate                       datetime64[ns]
 1   Airline                          category      
 2   Flight_Number_Marketing_Airline  int64         
 3   Origin                           category      
 4   Dest                             category      
 5   Cancelled                        bool          
 6   Diverted                         bool          
 7   CRSDepTime                       int64         
 8   DepTime                          float64       
 9   DepDelayMinutes                  float64       
 10  OriginAirportID                  int64         
 11  OriginCityName                   object        
 12  OriginStateName                  category      
 13  DestAirportID                    int64         
 14  DestCityName                     object        
 15  DestStateName                    category      
 16  TaxiOut                          float64       
 17  TaxiIn                           float64       
 18  CRSArrTime                       int64         
 19  ArrTime                          float64       
 20  ArrDelayMinutes                  float64       
 21  Distance                         float64       
 22  AirTime                          float64       
dtypes: bool(2), category(5), datetime64[ns](1), float64(8), int64(5), object(2)
memory usage: 3.7+ GB


# Barplot
plt.figure(figsize=(10,8))
sns.countplot(data=df, y="Airline", order=df["Airline"].value_counts().index)
plt.ylabel('Airline', fontsize=15)
plt.xlabel('Count of flights', fontsize=15)
plt.title('Flight count by airline', fontsize=15)

Text(0.5, 1.0, 'Flight count by airline')


# Line chart for the number of flights over time
plt.figure(figsize=(15,10))
df['FlightDate'] = pd.to_datetime(df['FlightDate'])
flight_counts = df.groupby('FlightDate')['Airline'].count()
flight_counts.plot(kind='line')
plt.title('Number of Flights Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Flights')
plt.show()


# Calculate the average flight distance by airline
avg_distance_by_airline = df.groupby('Airline')['Distance'].mean().sort_values(ascending=False)

# Plot the average flight distance by airline
plt.figure(figsize=(10,6))
sns.barplot(y=avg_distance_by_airline.index, x=avg_distance_by_airline.values, order=avg_distance_by_airline.index)
plt.title('Average Flight Distance by Airline')
plt.xlabel('Average Flight Distance (miles)')
plt.ylabel('Airline')
plt.show()


# Convert the 'FlightDate' column to a datetime object
df['FlightDate'] = pd.to_datetime(df['FlightDate'])

# Create a new column for the day of the week
df['DayOfWeek'] = df['FlightDate'].dt.day_name()

# Create a new column for the hour of the day
df['HourOfDay'] = df['CRSDepTime'] // 100

# Pivot the data to create a table of flight volume by day and hour
flight_vol = pd.pivot_table(df, values='Flight_Number_Marketing_Airline', index='DayOfWeek', columns='HourOfDay', aggfunc='count')

# Create a heatmap of the flight volume
plt.figure(figsize=(12, 8))
sns.heatmap(flight_vol, cmap='Blues', annot=False, fmt='.0f')
plt.title('Busiest Days and Times for Flights')
plt.xlabel('Hour of Day')
plt.ylabel('Day of Week')
plt.show()


plt.figure(figsize=(10,8))
sns.histplot(df['DepDelayMinutes'], kde=True, bins=500)
plt.title('Distribution of Departure Delays')
plt.xlabel('Delay (Minutes)')
plt.ylabel('Count')
plt.xlim(0,800)
plt.show()

print("Skewness: %f" % df['DepDelayMinutes'].skew())
print("Kurtosis: %f" % df['DepDelayMinutes'].kurt())

Skewness: 11.745739
Kurtosis: 266.145135


avg_delays = df.groupby('Airline')['DepDelayMinutes'].mean().sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(y=avg_delays.index, x=avg_delays.values, order=avg_delays.index)
plt.title('Average Delay Time by Airlines')
plt.xlabel('Average Delay Time (minutes)')
plt.ylabel('Airline')
plt.show()


# Bin the delay times into different time ranges
bins = [0, 5, 15, 30, np.inf]
labels = ['<5 min', '5-15 min', '15-30 min', '>30 min']
df['DelayRange'] = pd.cut(df['DepDelayMinutes'], bins=bins, labels=labels).sort_values(ascending=True)

# Group the data by airline and delay time range, and count the number of flights in each group
grouped = df.groupby(['Airline', 'DelayRange'])['Airline'].count()

# Pivot the data so that each airline is a row and each delay time range is a column
stacked = grouped.unstack()

stacked.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Distribution of Delay Times by Airline')
plt.xlabel('Airline')
plt.ylabel('Number of Flights')
plt.show()


plt.figure(figsize=(20,20))
# Convert the FlightDate column to a datetime object
df['FlightDate'] = pd.to_datetime(df['FlightDate'])

# Group the data by FlightDate and calculate the mean of DepDelayMinutes
grouped_data = df.groupby('FlightDate')['DepDelayMinutes'].mean()

# Create a line plot
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(grouped_data.index, grouped_data.values)

# Set the title and axis labels
ax.set_title('Mean Departure Delay over Time')
ax.set_xlabel('Flight Date')
ax.set_ylabel('Mean Departure Delay (minutes)')

# Show the plot
plt.show()

<Figure size 2000x2000 with 0 Axes>


# Create a new column to indicate whether the flight is longer or shorter than the median distance
mean_distance = df['Distance'].mean()
for i in tqdm(range(1)):
    df['DistanceCategory'] = ['Long' if d >= mean_distance else 'Short' for d in df['Distance']]

# Create a scatterplot with hue based on the distance category
sns.scatterplot(x='Distance', y='DepDelayMinutes', hue='DistanceCategory', data=df)

# Add axis labels and a title
plt.xlabel('Flight Distance')
plt.ylabel('Departure Delay (Minutes)')
plt.title('Relationship Between Flight Distance and Departure Delay')
plt.show()

100%|█████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.60s/it]


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

df = df.dropna()

# select the relevant variables
X = df[['CRSDepTime', 'OriginAirportID', 'DestAirportID', 'Distance', 'AirTime']]
y = df['DepDelayMinutes']

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# make predictions on the testing set
y_pred = model.predict(X_test)

# print the R-squared score
print('R-squared score:', model.score(X_test, y_test))

# create a scatter plot of the predicted vs actual values
plt.scatter(y_test, y_pred)
plt.xlabel('Actual DepDelayMinutes')
plt.ylabel('Predicted DepDelayMinutes')
plt.title('Linear Regression Model')
plt.show()

# create a residual plot to check for heteroscedasticity and linearity
residuals = y_test - y_pred
plt.scatter(y_pred, residuals)
plt.xlabel('Predicted DepDelayMinutes')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

R-squared score: 0.0028288496619960357


# Create scatter plot
sns.scatterplot(x='Distance', y='DepDelayMinutes', data=df)

# Fit regression line
reg = np.polyfit(df['Distance'], df['DepDelayMinutes'], 1)
ry = np.polyval(reg, df['Distance'])

# Plot regression line
plt.plot(df['Distance'], ry, 'r', linewidth=2)

# Set plot labels and title
plt.xlabel('Distance')
plt.ylabel('DepDelayMinutes')
plt.title('Regression Analysis: DepDelayMinutes vs Distance')

# Show plot
plt.show()


plt.figure(figsize=(15, 10))

# calculate standard deviation of departure delay time for each airline
delay_std = df.groupby('Airline')['DepDelayMinutes'].std().sort_values(ascending=False)

sns.barplot(y=delay_std.index, x=delay_std.values, order=delay_std.index)

plt.xlabel('Airline')
plt.ylabel('Standard Deviation of Departure Delay Time (minutes)')
plt.title('Departure Delay Time Standard Deviation by Airline')

# display the plot
plt.show()


plt.figure(figsize=(15, 10))
cancel_counts = df.groupby('Airline')['Cancelled'].sum().sort_values(ascending=False)
sns.barplot(y=cancel_counts.index, x=cancel_counts.values, order=cancel_counts.index)
plt.title('Airline Cancellation Counts')
plt.xlabel('Cancellation Count')
plt.ylabel('Airline')
plt.show()


# Define the columns to include in the analysis
column_subset = ["FlightDate", "Cancelled"]
# Calculate the number of cancelled and not cancelled flights for each year
cancelled = []
not_cancelled = []
for df in dfs:
    cancelled_flights = df['Cancelled'].sum()
    total_flights = len(df)
    cancelled_percent = cancelled_flights / total_flights * 100
    cancelled.append(cancelled_percent)
    not_cancelled.append(100 - cancelled_percent)

plt.bar(years, not_cancelled, color='#ff9999', label='Not_Cancelled')
plt.bar(years, cancelled, bottom=not_cancelled, color='#66b3ff', label='Cancelled')

plt.xlabel('Year')
plt.ylabel('Proportion of Flights')
plt.title('Proportion of Cancelled and Not Cancelled Flights Over 5 Years')
plt.legend()

plt.show()


cancelled = []
not_cancelled = []
for airline in df['Airline'].unique():
    airline_df = df[df['Airline'] == airline]
    cancelled_flights = airline_df['Cancelled'].sum()
    total_flights = len(airline_df)
    cancelled_percent = cancelled_flights / total_flights * 100
    cancelled.append(cancelled_percent)
    not_cancelled.append(100 - cancelled_percent)

# Create a bar chart showing the proportion of cancelled and not cancelled flights for each airline
plt.bar(df['Airline'].unique(), not_cancelled, color='#ff9999', label='Not_Cancelled')
plt.bar(df['Airline'].unique(), cancelled, bottom=not_cancelled, color='#66b3ff', label='Cancelled')
plt.xticks(rotation=90, ha='center')
plt.xlabel('Airline')
plt.ylabel('Proportion of Flights')
plt.title('Proportion of Cancelled and Not Cancelled Flights by Airline')
plt.legend()

plt.show()


years = ['2018', '2019', '2020', '2021', '2022']
dfs = []

cancelled_by_airline = df[df['Cancelled'] == True].groupby(['Airline', 'FlightDate'])['Cancelled'].count().reset_index()

# create a line chart for each airline
airlines = cancelled_by_airline['Airline'].unique()
fig = plt.figure(figsize=(15, 5))  # set the size of the plot
for airline in airlines:
    data = cancelled_by_airline[cancelled_by_airline['Airline'] == airline]
    plt.plot(data['FlightDate'], data['Cancelled'], label=airline)

# add labels and legend
plt.xlabel('Flight Date')
plt.ylabel('Cancelled Flights')
plt.title('Cancelled Flights Over Time by Airline')
plt.legend(fontsize='small', loc='upper right')

# set figure size
fig = plt.gcf()
fig.set_size_inches(15, 8)

# show the plot
plt.show()


subset = df[['Cancelled', 'DepDelayMinutes']]

delay_cancel_model = smf.ols('DepDelayMinutes ~  Cancelled', data=subset).fit()

delay_cancel_model.summary()


sns.boxplot(x='Cancelled', y='DepDelayMinutes', data=subset)

# Add legend
legend_labels = ['Not Cancelled', 'Cancelled']

plt.legend()
    
# Add axis labels and title
plt.xlabel('Cancelled')
plt.ylabel('Departure Delay Minutes')
plt.title('Distribution of Departure Delay Minutes by Cancellation Status')

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.

Text(0.5, 1.0, 'Distribution of Departure Delay Minutes by Cancellation Status')


# subset_full_model = df[['Origin', 'Airline', 'Dest', 'Distance', 'AirTime']]

# full_model = smf.ols('Airline ~ Origin + Dest + Distance + AirTime', data=subset_full_model)

# # Fit the model to the data
# results = full_model.fit()

# # Print the summary statistics
# results.summary()

	FlightDate	Airline	Origin	Dest	Cancelled	Diverted	CRSDepTime	DepTime	DepDelay	...	WheelsOff	WheelsOn	TaxiIn	CRSArrTime	ArrDelay	ArrivalDelayGroups	ArrTimeBlk	DistanceGroup
0	2022-04-04	Commutair Aka Champlain Enterprises, Inc.	GJT	DEN	False	False	1133	1123.0	-10.0	...	1140.0	1220.0	8.0	1245	-17.0	-2.0	1200-1259	1
1	2022-04-04	Commutair Aka Champlain Enterprises, Inc.	HRL	IAH	False	False	732	728.0	-4.0	...	744.0	839.0	9.0	849	-1.0	-1.0	0800-0859	2
2	2022-04-04	Commutair Aka Champlain Enterprises, Inc.	DRO	DEN	False	False	1529	1514.0	-15.0	...	1535.0	1622.0	14.0	1639	-3.0	-1.0	1600-1659	2
3	2022-04-04	Commutair Aka Champlain Enterprises, Inc.	IAH	GPT	False	False	1435	1430.0	-5.0	...	1446.0	1543.0	4.0	1605	-18.0	-2.0	1600-1659	2
4	2022-04-04	Commutair Aka Champlain Enterprises, Inc.	DRO	DEN	False	False	1135	1135.0	0.0	...	1154.0	1243.0	8.0	1245	6.0	0.0	1200-1259	2

	FlightDate	Airline	Flight_Number_Marketing_Airline	Origin	Dest	Cancelled	Diverted	CRSDepTime	DepTime	...	DestAirportID	DestCityName	DestStateName	TaxiOut	TaxiIn	CRSArrTime	ArrTime	Distance	AirTime
0	2018-01-23	Endeavor Air Inc.	3298	ABY	ATL	False	False	1202	1157.0	...	10397	Atlanta, GA	Georgia	14.0	7.0	1304	1256.0	145.0	38.0
1	2018-01-24	Endeavor Air Inc.	3298	ABY	ATL	False	False	1202	1157.0	...	10397	Atlanta, GA	Georgia	13.0	12.0	1304	1258.0	145.0	36.0
2	2018-01-25	Endeavor Air Inc.	3298	ABY	ATL	False	False	1202	1153.0	...	10397	Atlanta, GA	Georgia	18.0	11.0	1304	1302.0	145.0	40.0
3	2018-01-26	Endeavor Air Inc.	3298	ABY	ATL	False	False	1202	1150.0	...	10397	Atlanta, GA	Georgia	17.0	11.0	1304	1253.0	145.0	35.0
4	2018-01-27	Endeavor Air Inc.	3298	ABY	ATL	False	False	1400	1355.0	...	10397	Atlanta, GA	Georgia	17.0	11.0	1500	1459.0	145.0	36.0

Dep. Variable:	DepDelayMinutes	R-squared:	0.001
Model:	OLS	Adj. R-squared:	0.001
Method:	Least Squares	F-statistic:	1.676e+04
Date:	Mon, 03 Apr 2023	Prob (F-statistic):	0.00
Time:	00:05:45	Log-Likelihood:	-1.4929e+08
No. Observations:	28430698	AIC:	2.986e+08
Df Residuals:	28430696	BIC:	2.986e+08
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	12.7571	0.009	1473.212	0.000	12.740	12.774
Cancelled[T.True]	48.3347	0.373	129.469	0.000	47.603	49.066

Omnibus:	48711687.801	Durbin-Watson:	1.802
Prob(Omnibus):	0.000	Jarque-Bera (JB):	84679398712.993
Skew:	11.748	Prob(JB):	0.00
Kurtosis:	269.328	Cond. No.	43.1

Flight Delays¶

Importing of packages¶

Data Processing¶

General Trends¶

Number of flights per airline¶

Number of flights over time¶

Average Distance covered by Airline¶

Understanding Delays¶

Histogram of departure delays¶

Average Delay Time by Airlines¶

Stacked Barplot for the distribution fo delay times¶

Average Departure delay over time¶

Short vs Long flights when it comes to delay times¶

Linear Model¶

Risk of delays¶

Understanding Cancellations¶

Number of cancelled flights by Airline¶

Proportion of cancelled vs not cancelled flights by Year¶

Proportion of cancelled vs nont cancelled flights by Airline¶

Cancelled flights over time by Airline¶

Models¶

Linear Model for Delay vs Cancellations¶

Meant to be a linear model for the airlines, keeps crashing¶