0% found this document useful (0 votes)
21 views1 page

MTA Project

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
21 views1 page

MTA Project

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 1

In [1]: import pandas as pd

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]: df = pd.read_csv('MTA_Subway_Customer_Journey-Focused_Metrics__Beginning_2020_20240331.csv')

In [3]: df.head()

Out[3]: additional platform additional train customer journey time


month division line period num_passengers total_apt total_att over_five_mins over_five_mins_perc
time time performance

2024- A
0 1 offpeak 3867551.1 1.085337 0.683453 4197597.991 2.643288e+06 484512.9 0.125276 0.874724
02 DIVISION

2024- A
1 1 peak 4363445.4 1.443297 0.595086 6297748.947 2.596627e+06 463635.2 0.106254 0.893746
02 DIVISION

2024- A
2 2 offpeak 2336734.5 1.522652 0.613494 3558034.457 1.433574e+06 416612.5 0.178288 0.821712
02 DIVISION

2024- A
3 2 peak 2815864.7 1.202949 0.295036 3387341.608 8.307818e+05 361184.5 0.128268 0.871732
02 DIVISION

2024- A
4 3 offpeak 1726217.0 0.923262 0.264843 1593750.537 4.571768e+05 208555.9 0.120817 0.879183
02 DIVISION

In [4]: df.describe()

Out[4]: num_passengers additional platform time additional train time total_apt total_att over_five_mins over_five_mins_perc customer journey time performance

count 2.426000e+03 2426.000000 2426.000000 2.426000e+03 2.426000e+03 2.426000e+03 2426.000000 2426.000000

mean 4.060347e+06 1.247788 0.251696 5.377748e+06 1.647531e+06 6.324144e+05 0.145706 0.854294

std 1.491784e+07 0.452390 0.475173 1.986666e+07 7.350485e+06 2.333993e+06 0.059420 0.059420

min 3.017300e+03 -0.431496 -2.213688 -7.396448e+03 -7.426452e+06 4.630000e+01 0.000112 0.682048

25% 1.030356e+06 0.962895 0.001369 1.218312e+06 8.842812e+02 1.361670e+05 0.107526 0.810005

50% 1.955926e+06 1.222895 0.310895 2.612567e+06 5.378233e+05 2.935772e+05 0.147931 0.852069

75% 3.096545e+06 1.534176 0.562851 4.181991e+06 1.499993e+06 5.025409e+05 0.189995 0.892474

max 2.093928e+08 3.506892 1.584780 2.655073e+08 1.359087e+08 3.129205e+07 0.317952 0.999888

In [5]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2426 entries, 0 to 2425
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 month 2426 non-null object
1 division 2426 non-null object
2 line 2426 non-null object
3 period 2426 non-null object
4 num_passengers 2426 non-null float64
5 additional platform time 2426 non-null float64
6 additional train time 2426 non-null float64
7 total_apt 2426 non-null float64
8 total_att 2426 non-null float64
9 over_five_mins 2426 non-null float64
10 over_five_mins_perc 2426 non-null float64
11 customer journey time performance 2426 non-null float64
dtypes: float64(8), object(4)
memory usage: 227.6+ KB

In [6]: df.rename(
    # Shorten the three long metric names in one pass; df is modified in place
    columns={
        'customer journey time performance': 'jtp',
        'additional platform time': 'apt',
        'additional train time': 'att',
    },
    inplace=True,
)

In [7]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2426 entries, 0 to 2425
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 month 2426 non-null object
1 division 2426 non-null object
2 line 2426 non-null object
3 period 2426 non-null object
4 num_passengers 2426 non-null float64
5 apt 2426 non-null float64
6 att 2426 non-null float64
7 total_apt 2426 non-null float64
8 total_att 2426 non-null float64
9 over_five_mins 2426 non-null float64
10 over_five_mins_perc 2426 non-null float64
11 jtp 2426 non-null float64
dtypes: float64(8), object(4)
memory usage: 227.6+ KB

In [8]: # Split 'month' column into 'year' and 'month' columns


# Running this cell a second time would otherwise fail: once 'month' holds
# integers the .str accessor raises AttributeError. Only split while the
# column is still the original 'YYYY-MM' text.
if not pd.api.types.is_integer_dtype(df['month']):
    # n=1 splits on the first hyphen only, giving exactly two columns
    year_and_month = df['month'].str.split('-', n=1, expand=True).astype(int)

    # 'year' is appended as a new column; 'month' is overwritten with the
    # numeric month so its position in the frame does not change
    df['year'] = year_and_month[0]
    df['month'] = year_and_month[1]

# Print the DataFrame to see the changes


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2426 entries, 0 to 2425
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 month 2426 non-null int64
1 division 2426 non-null object
2 line 2426 non-null object
3 period 2426 non-null object
4 num_passengers 2426 non-null float64
5 apt 2426 non-null float64
6 att 2426 non-null float64
7 total_apt 2426 non-null float64
8 total_att 2426 non-null float64
9 over_five_mins 2426 non-null float64
10 over_five_mins_perc 2426 non-null float64
11 jtp 2426 non-null float64
12 year 2426 non-null int64
dtypes: float64(8), int64(2), object(3)
memory usage: 246.5+ KB

In [9]: new_index_order = [
    'month', 'year', 'division', 'line', 'period',
    'num_passengers', 'apt', 'att', 'total_apt', 'total_att',
    'over_five_mins', 'over_five_mins_perc', 'jtp',
]

# Rebuild the frame with its columns in the order listed above
df = df.reindex(new_index_order, axis='columns')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2426 entries, 0 to 2425
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 month 2426 non-null int64
1 year 2426 non-null int64
2 division 2426 non-null object
3 line 2426 non-null object
4 period 2426 non-null object
5 num_passengers 2426 non-null float64
6 apt 2426 non-null float64
7 att 2426 non-null float64
8 total_apt 2426 non-null float64
9 total_att 2426 non-null float64
10 over_five_mins 2426 non-null float64
11 over_five_mins_perc 2426 non-null float64
12 jtp 2426 non-null float64
dtypes: float64(8), int64(2), object(3)
memory usage: 246.5+ KB

In [10]: # Remove rows of 'Systemwide' data in the division column


# Keep every row whose division is anything other than 'Systemwide'
df = df.loc[df['division'].ne('Systemwide')]

In [11]: #plot a scatterplot to see if an increase of apt (additional platform time) also increases the amount of passengers

# In the df.head() preview total_apt equals num_passengers * apt, so plotting
# total_apt against num_passengers mostly shows that built-in proportionality.
# To see whether platform delay and ridership move together, compare ridership
# with the per-passenger 'apt' column instead.
ax = sns.scatterplot(data=df, x='apt', y='num_passengers')
ax.set(
    title='Passengers vs. additional platform time',
    xlabel='Additional platform time (apt)',
    ylabel='Number of passengers',
)

plt.show()

In [12]: sns.barplot(data=df, x='division', y='num_passengers')


plt.show()

In [13]: # Filter DataFrame for rows where 'line' is 'Q'


# Rows recorded for the Q line
df_q = df.loc[df['line'].eq('Q')]

# Additional platform time (x) against journey time performance (y) for Q
q_ax = sns.scatterplot(x='apt', y='jtp', data=df_q)
q_ax.set_title('Additional Platform Time vs. Q JTP')
q_ax.set_xlabel('Additional Platform Time')
q_ax.set_ylabel('Q JTP')
plt.show()

In [14]: # Scatterplot: Additional Platform Time vs. Q - Journey Time Performance


# Draw the Q-line points first and keep the Axes so the fitted line shares it
q_ax = sns.scatterplot(x='apt', y='jtp', data=df_q)

# scatter=False: regplot adds only the regression fit, not a second set of points
sns.regplot(x='apt', y='jtp', data=df_q, scatter=False, ax=q_ax)

q_ax.set_title('Additional Platform Time vs. Q JTP')
q_ax.set_xlabel('Additional Platform Time')
q_ax.set_ylabel('Q JTP')
plt.show()

In [15]: # Filter DataFrame for the year 2023


# Rows for calendar year 2023 only
df_2023 = df.loc[df['year'].eq(2023)]

# Journey time performance over the months of 2023, on an explicit 10x6 figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='month', y='jtp', marker='o', data=df_2023, ax=ax)
ax.set_title('Journey Performance by Month in 2023')
ax.set_xlabel('Month')
ax.set_ylabel('Journey Performance')
plt.xticks(rotation=45)
ax.grid(True)
plt.show()

/opt/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert
inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
/opt/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert
inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):

In [16]: # Filter data for the Q line during offpeak and peak periods in 2023
# Q-line rows for 2023, split by service period
is_q_2023 = (df['line'] == 'Q') & (df['year'] == 2023)
q_line_offpeak_2023 = df[is_q_2023 & (df['period'] == 'offpeak')]
q_line_peak_2023 = df[is_q_2023 & (df['period'] == 'peak')]

fig, ax = plt.subplots(figsize=(10, 6))

# One curve per period: offpeak first, then peak
for period_rows, period_label in (
    (q_line_offpeak_2023, 'Offpeak'),
    (q_line_peak_2023, 'Peak'),
):
    ax.plot(period_rows['month'], period_rows['jtp'], label=period_label)

ax.set_title('Q Line Journey Travel Performance in 2023')
ax.set_xlabel('Month')
ax.set_ylabel('Journey Travel Performance')
ax.legend()
ax.grid(True)

plt.show()

In [17]: # Filter data for the Q line during offpeak and peak periods in 2023
# Q-line rows for 2023, one frame per service period
q_line_offpeak_2023 = df[(df['line'] == 'Q') & (df['period'] == 'offpeak') & (df['year'] == 2023)]
q_line_peak_2023 = df[(df['line'] == 'Q') & (df['period'] == 'peak') & (df['year'] == 2023)]

# Pair every peak row with the offpeak value of the SAME month. Subtracting the
# raw .values arrays pairs rows purely by position, which silently mixes months
# if the two frames are ordered differently and fails if their lengths differ.
offpeak_jtp_by_month = q_line_offpeak_2023.set_index('month')['jtp']
difference = (
    q_line_peak_2023['jtp'].to_numpy()
    - offpeak_jtp_by_month.reindex(q_line_peak_2023['month'].to_numpy()).to_numpy()
)  # still an ndarray in peak-row order; NaN where a month has no offpeak row

# Plot only the difference curve
plt.figure(figsize=(10, 6))
plt.plot(q_line_peak_2023['month'], difference, label='Difference (Peak - Offpeak)', linestyle='--', color='red')

plt.title('Q Line Journey Travel Performance Difference (Peak - Offpeak) in 2023')
plt.xlabel('Month')
plt.ylabel('Difference in Journey Travel Performance')
plt.legend()
plt.grid(True)

plt.show()

In [18]: # Filter data for the Q line during offpeak and peak periods in 2023
# Q-line rows for 2023, one frame per service period
q_line_offpeak_2023 = df[(df['line'] == 'Q') & (df['period'] == 'offpeak') & (df['year'] == 2023)]
q_line_peak_2023 = df[(df['line'] == 'Q') & (df['period'] == 'peak') & (df['year'] == 2023)]

# Pair every peak row with the offpeak value of the SAME month. Subtracting the
# raw .values arrays pairs rows purely by position, which silently mixes months
# if the two frames are ordered differently and fails if their lengths differ.
offpeak_jtp_by_month = q_line_offpeak_2023.set_index('month')['jtp']
difference = (
    q_line_peak_2023['jtp'].to_numpy()
    - offpeak_jtp_by_month.reindex(q_line_peak_2023['month'].to_numpy()).to_numpy()
)  # still an ndarray in peak-row order; NaN where a month has no offpeak row

# Plot only the difference curve
plt.figure(figsize=(10, 6))
plt.plot(q_line_peak_2023['month'], difference, label='Difference (Peak - Offpeak)', linestyle='--', color='red')

plt.title('Q Line Journey Travel Performance Difference (Peak - Offpeak) in 2023')
plt.xlabel('Month')
plt.ylabel('Difference in Journey Travel Performance')
plt.legend()
plt.grid(True)

plt.show()

# Tabulate the same differences; the frame keeps the peak rows' index labels
difference_df = pd.DataFrame({'Month': q_line_peak_2023['month'], 'Difference (Peak - Offpeak)': difference})
# Largest peak-over-offpeak advantage first
difference_df = difference_df.sort_values(by='Difference (Peak - Offpeak)', ascending=False)
print(difference_df)

Month Difference (Peak - Offpeak)


235 10 0.014538
382 7 0.010489
529 4 0.007917
676 1 0.004792
431 6 0.004583
480 5 0.002935
284 9 -0.003889
333 8 -0.006041
627 2 -0.009870
137 12 -0.011253
578 3 -0.012512
186 11 -0.018329

In [19]: # Filter data for the year 2023


# Restrict to 2023
df_2023 = df.loc[df['year'].eq(2023)]

# Average JTP over each line's 2023 rows (a plain mean of the rows)
mean_jtp_by_line = df_2023.groupby('line')['jtp'].mean()

# The five lines with the highest average, best first
top_5_lines = mean_jtp_by_line.nlargest(5)

print("Top 5 performing lines for 2023:")
for line, mean_jtp in top_5_lines.items():
    print("Line:", line)
    # end="\n\n" leaves one blank line after each entry
    print("Mean JTP:", mean_jtp, end="\n\n")

Top 5 performing lines for 2023:


Line: S 42nd
Mean JTP: 0.9878337433333333

Line: S Fkln
Mean JTP: 0.9572767267916666

Line: L
Mean JTP: 0.9206239399166667

Line: S Rock
Mean JTP: 0.9041509411666667

Line: 1
Mean JTP: 0.88777615075

In [20]: ## What is the probability that I will have to wait extra for the Q train on the platform?

# Filter data for the Q train


# Rows recorded for the Q line
q_train_data = df.loc[df['line'].eq('Q')]

# NOTE(review): each row looks like a month/period aggregate for a line, so the
# ratio below is the share of Q rows whose apt is positive rather than a
# per-trip probability -- confirm this is the intended measure.
q_apt = q_train_data['apt']

# Number of Q rows with a positive apt (count() skips missing values)
extra_wait_count = q_apt[q_apt > 0].count()

# Number of Q rows that have an apt value at all
total_instances = q_apt.count()

probability_extra_wait = extra_wait_count / total_instances

print("Probability of having to wait extra for the Q train on the platform:", probability_extra_wait)

Probability of having to wait extra for the Q train on the platform: 1.0

In [21]: # Filter data for the year 2023


# 2023 rows only
data_2023 = df.loc[df['year'].eq(2023)]

# Mean additional platform time per line, then ordered largest first
average_additional_platform_time_per_line = data_2023.groupby('line')['apt'].mean()
sorted_lines = average_additional_platform_time_per_line.sort_values(ascending=False)

print("Average additional platform time for each line in 2023 (in minutes), descending order:")
for line, wait_time in sorted_lines.items():
    print(f"Line: {line}, Average Wait Time: {wait_time:.2f} minutes")

# Total 'over_five_mins' per line, largest first. This part reads df, i.e.
# every year that is loaded, not just data_2023.
passengers_affected_by_line = (
    df.groupby('line')['over_five_mins'].sum().sort_values(ascending=False)
)

# After the descending sort the first entry is the line with the largest total
most_affected_line = passengers_affected_by_line.index[0]
total_passengers_affected = passengers_affected_by_line.iloc[0]

print(f"The train line that affects the most passengers is {most_affected_line} with a total of {total_passengers_affected} passengers affected.")

Average additional platform time for each line in 2023 (in minutes), descending order:
Line: B, Average Wait Time: 1.73 minutes
Line: M, Average Wait Time: 1.72 minutes
Line: C, Average Wait Time: 1.66 minutes
Line: D, Average Wait Time: 1.62 minutes
Line: R, Average Wait Time: 1.60 minutes
Line: N, Average Wait Time: 1.54 minutes
Line: JZ, Average Wait Time: 1.51 minutes
Line: F, Average Wait Time: 1.48 minutes
Line: Q, Average Wait Time: 1.44 minutes
Line: 2, Average Wait Time: 1.36 minutes
Line: A, Average Wait Time: 1.27 minutes
Line: 7, Average Wait Time: 1.23 minutes
Line: G, Average Wait Time: 1.18 minutes
Line: E, Average Wait Time: 1.16 minutes
Line: 6, Average Wait Time: 1.14 minutes
Line: 1, Average Wait Time: 1.11 minutes
Line: W, Average Wait Time: 1.10 minutes
Line: 3, Average Wait Time: 1.09 minutes
Line: 5, Average Wait Time: 1.09 minutes
Line: 4, Average Wait Time: 0.98 minutes
Line: S Rock, Average Wait Time: 0.90 minutes
Line: L, Average Wait Time: 0.89 minutes
Line: S Fkln, Average Wait Time: 0.59 minutes
Line: S 42nd, Average Wait Time: 0.47 minutes
The train line that affects the most passengers is F with a total of 58122287.0 passengers affected.

In [22]: # Filter data for the F line


# Rows recorded for the F line
f_line_data = df.loc[df['line'].eq('F')]

# One total per calendar month: grouping on (year, month) keeps the same month
# of different years apart
monthly_groups = f_line_data.groupby(['year', 'month'])
total_passengers_affected_f_line_monthly = monthly_groups['over_five_mins'].sum()

# Mean of those monthly totals
average_monthly_affected_passengers_f_line = total_passengers_affected_f_line_monthly.mean()

print("Average monthly affected passengers for the F line:", average_monthly_affected_passengers_f_line)

Average monthly affected passengers for the F line: 1162445.74

In [30]: # Filter data for the year 2023


# 2023 rows only
data_2023 = df.loc[df['year'].eq(2023)]

# 'over_five_mins' summed for every (month, line) pair
total_passengers_affected_monthly = (
    data_2023.groupby(['month', 'line'])['over_five_mins'].sum()
)

# Within each month, idxmax returns the (month, line) label of the largest sum
most_affected_line_per_month = total_passengers_affected_monthly.groupby(level='month').idxmax()

print("Most passengers affected by line per month in 2023:")
for month, (_month, top_line) in most_affected_line_per_month.items():
    # the label is a (month, line) tuple; only its line part is printed
    print(f"Month: {month}, Most affected line: {top_line}")

Most passengers affected by line per month in 2023:


Month: 1, Most affected line: A
Month: 2, Most affected line: E
Month: 3, Most affected line: F
Month: 4, Most affected line: F
Month: 5, Most affected line: F
Month: 6, Most affected line: F
Month: 7, Most affected line: R
Month: 8, Most affected line: F
Month: 9, Most affected line: F
Month: 10, Most affected line: F
Month: 11, Most affected line: F
Month: 12, Most affected line: F

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy