MTA Project
MTA Project
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
In [2]: df = pd.read_csv('MTA_Subway_Customer_Journey-Focused_Metrics__Beginning_2020_20240331.csv')
In [3]: df.head()
0 2024-02 A DIVISION 1 offpeak 3867551.1 1.085337 0.683453 4197597.991 2.643288e+06 484512.9 0.125276 0.874724
1 2024-02 A DIVISION 1 peak 4363445.4 1.443297 0.595086 6297748.947 2.596627e+06 463635.2 0.106254 0.893746
2 2024-02 A DIVISION 2 offpeak 2336734.5 1.522652 0.613494 3558034.457 1.433574e+06 416612.5 0.178288 0.821712
3 2024-02 A DIVISION 2 peak 2815864.7 1.202949 0.295036 3387341.608 8.307818e+05 361184.5 0.128268 0.871732
4 2024-02 A DIVISION 3 offpeak 1726217.0 0.923262 0.264843 1593750.537 4.571768e+05 208555.9 0.120817 0.879183
In [4]: df.describe()
Out[4]: num_passengers additional platform time additional train time total_apt total_att over_five_mins over_five_mins_perc customer journey time performance
In [5]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2426 entries, 0 to 2425
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 month 2426 non-null object
1 division 2426 non-null object
2 line 2426 non-null object
3 period 2426 non-null object
4 num_passengers 2426 non-null float64
5 additional platform time 2426 non-null float64
6 additional train time 2426 non-null float64
7 total_apt 2426 non-null float64
8 total_att 2426 non-null float64
9 over_five_mins 2426 non-null float64
10 over_five_mins_perc 2426 non-null float64
11 customer journey time performance 2426 non-null float64
dtypes: float64(8), object(4)
memory usage: 227.6+ KB
In [7]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2426 entries, 0 to 2425
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 month 2426 non-null object
1 division 2426 non-null object
2 line 2426 non-null object
3 period 2426 non-null object
4 num_passengers 2426 non-null float64
5 apt 2426 non-null float64
6 att 2426 non-null float64
7 total_apt 2426 non-null float64
8 total_att 2426 non-null float64
9 over_five_mins 2426 non-null float64
10 over_five_mins_perc 2426 non-null float64
11 jtp 2426 non-null float64
dtypes: float64(8), object(4)
memory usage: 227.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2426 entries, 0 to 2425
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 month 2426 non-null int64
1 division 2426 non-null object
2 line 2426 non-null object
3 period 2426 non-null object
4 num_passengers 2426 non-null float64
5 apt 2426 non-null float64
6 att 2426 non-null float64
7 total_apt 2426 non-null float64
8 total_att 2426 non-null float64
9 over_five_mins 2426 non-null float64
10 over_five_mins_perc 2426 non-null float64
11 jtp 2426 non-null float64
12 year 2426 non-null int64
dtypes: float64(8), int64(2), object(3)
memory usage: 246.5+ KB
In [9]: new_index_order = ['month', 'year', 'division', 'line', 'period', 'num_passengers','apt', 'att','total_apt', 'total_att', 'over_five_mins','over_five_mins_perc', 'jtp']
df = df.reindex(columns=new_index_order)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2426 entries, 0 to 2425
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 month 2426 non-null int64
1 year 2426 non-null int64
2 division 2426 non-null object
3 line 2426 non-null object
4 period 2426 non-null object
5 num_passengers 2426 non-null float64
6 apt 2426 non-null float64
7 att 2426 non-null float64
8 total_apt 2426 non-null float64
9 total_att 2426 non-null float64
10 over_five_mins 2426 non-null float64
11 over_five_mins_perc 2426 non-null float64
12 jtp 2426 non-null float64
dtypes: float64(8), int64(2), object(3)
memory usage: 246.5+ KB
In [11]: # Plot a scatterplot to see whether an increase in apt (additional platform time) goes along with an increase in the number of passengers
/opt/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert
inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
/opt/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert
inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
In [16]: # Filter data for the Q line during offpeak and peak periods in 2023
q_line_offpeak_2023 = df[(df['line'] == 'Q') & (df['period'] == 'offpeak') & (df['year'] == 2023)]
q_line_peak_2023 = df[(df['line'] == 'Q') & (df['period'] == 'peak') & (df['year'] == 2023)]
# Plotting
plt.figure(figsize=(10, 6))
plt.show()
In [17]: # Filter data for the Q line during offpeak and peak periods in 2023
# NOTE(review): these two filters are a verbatim repeat of the ones in In [16];
# the frames could be reused instead of being recomputed.
q_line_offpeak_2023 = df[(df['line'] == 'Q') & (df['period'] == 'offpeak') & (df['year'] == 2023)]
q_line_peak_2023 = df[(df['line'] == 'Q') & (df['period'] == 'peak') & (df['year'] == 2023)]
# NOTE(review): no plotting call is visible before show() in this view.
plt.show()
In [18]: # Filter data for the Q line during offpeak and peak periods in 2023
# NOTE(review): these two filters are a verbatim repeat of the ones in In [16] and In [17];
# the frames could be reused instead of being recomputed.
q_line_offpeak_2023 = df[(df['line'] == 'Q') & (df['period'] == 'offpeak') & (df['year'] == 2023)]
q_line_peak_2023 = df[(df['line'] == 'Q') & (df['period'] == 'peak') & (df['year'] == 2023)]
# NOTE(review): no plotting call is visible before show() in this view.
plt.show()
# Group data by 'line' and calculate mean JTP for each line
# NOTE(review): df_2023 is not defined in the visible cells — presumably df
# restricted to year == 2023; confirm.
mean_jtp_by_line = df_2023.groupby('line')['jtp'].mean()
Line: S Fkln
Mean JTP: 0.9572767267916666
Line: L
Mean JTP: 0.9206239399166667
Line: S Rock
Mean JTP: 0.9041509411666667
Line: 1
Mean JTP: 0.88777615075
In [20]: ## What is the probability that I will have to wait extra for the Q train on the platform?
# Count the number of instances where additional platform time is greater than zero
# NOTE(review): q_train_data is not defined in the visible cells — presumably the
# Q-line rows of df; confirm.
# NOTE(review): if each row holds an aggregated average (as the df.head() output
# suggests), this counts rows whose average apt is positive rather than individual
# trips, which would explain the printed result of 1.0 — confirm the intended metric.
extra_wait_count = q_train_data[q_train_data['apt'] > 0]['apt'].count()
# NOTE(review): probability_extra_wait is computed outside the visible lines.
print("Probability of having to wait extra for the Q train on the platform:", probability_extra_wait)
Probability of having to wait extra for the Q train on the platform: 1.0
# Calculate the average additional platform time for each line in 2023
# NOTE(review): data_2023 is not defined in the visible cells; an earlier cell
# uses df_2023 for what looks like the same 2023 subset — confirm and unify the name.
average_additional_platform_time_per_line = data_2023.groupby('line')['apt'].mean()
# NOTE(review): most_affected_line and total_passengers_affected are computed
# outside the visible lines.
print(f"The train line that affects the most passengers is {most_affected_line} with a total of {total_passengers_affected} passengers affected.")
Average additional platform time for each line in 2023 (in minutes), descending order:
Line: B, Average Wait Time: 1.73 minutes
Line: M, Average Wait Time: 1.72 minutes
Line: C, Average Wait Time: 1.66 minutes
Line: D, Average Wait Time: 1.62 minutes
Line: R, Average Wait Time: 1.60 minutes
Line: N, Average Wait Time: 1.54 minutes
Line: JZ, Average Wait Time: 1.51 minutes
Line: F, Average Wait Time: 1.48 minutes
Line: Q, Average Wait Time: 1.44 minutes
Line: 2, Average Wait Time: 1.36 minutes
Line: A, Average Wait Time: 1.27 minutes
Line: 7, Average Wait Time: 1.23 minutes
Line: G, Average Wait Time: 1.18 minutes
Line: E, Average Wait Time: 1.16 minutes
Line: 6, Average Wait Time: 1.14 minutes
Line: 1, Average Wait Time: 1.11 minutes
Line: W, Average Wait Time: 1.10 minutes
Line: 3, Average Wait Time: 1.09 minutes
Line: 5, Average Wait Time: 1.09 minutes
Line: 4, Average Wait Time: 0.98 minutes
Line: S Rock, Average Wait Time: 0.90 minutes
Line: L, Average Wait Time: 0.89 minutes
Line: S Fkln, Average Wait Time: 0.59 minutes
Line: S 42nd, Average Wait Time: 0.47 minutes
The train line that affects the most passengers is F with a total of 58122287.0 passengers affected.
# Group f_line_data by (year, month) and sum the 'over_five_mins' column to get the
# total number of affected passengers for each calendar month
# NOTE(review): f_line_data is not defined in the visible cells — presumably the
# rows of df with line == 'F'; confirm.
total_passengers_affected_f_line_monthly = f_line_data.groupby(['year', 'month'])['over_five_mins'].sum()
# Group data by month and train line, and sum the 'over_five_mins' column
# (the result is a Series indexed by (month, line))
total_passengers_affected_monthly = data_2023.groupby(['month', 'line'])['over_five_mins'].sum()
# Find the train line that affects the most passengers for each month
# idxmax() yields the full (month, line) index label of each month's maximum,
# not just the line name.
most_affected_line_per_month = total_passengers_affected_monthly.groupby(level='month').idxmax()