Skip to content

Instantly share code, notes, and snippets.

View tanpengshi's full-sized avatar

Tan Pengshi Alvin tanpengshi

View GitHub Profile
@tanpengshi
tanpengshi / mta_df.csv
Last active September 7, 2020 14:45
MTA Dataframe Head
C/A UNIT SCP STATION LINENAME DIVISION DATE TIME DESC ENTRIES EXITS
0 A002 R051 02-00-00 59 ST NQR456W BMT 2020-08-22 00:00:00 REGULAR 7447810 2532191
1 A002 R051 02-00-00 59 ST NQR456W BMT 2020-08-22 04:00:00 REGULAR 7447812 2532197
2 A002 R051 02-00-00 59 ST NQR456W BMT 2020-08-22 08:00:00 REGULAR 7447824 2532208
3 A002 R051 02-00-00 59 ST NQR456W BMT 2020-08-22 12:00:00 REGULAR 7447852 2532248
4 A002 R051 02-00-00 59 ST NQR456W BMT 2020-08-22 16:00:00 REGULAR 7447937 2532276
@tanpengshi
tanpengshi / MTA_Data_Blog.ipynb
Created September 7, 2020 15:01
Unique Turnstile Analysis
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@tanpengshi
tanpengshi / MTA_Data3_Blog2.ipynb
Created September 7, 2020 15:21
Turnstile Counter Reset
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@tanpengshi
tanpengshi / delta.py
Created September 7, 2020 15:36
Replacing NaN with Mean of Values
# Deltas above 10000 or below 0 are treated as anomalies caused by a reset of
# the counters and are blanked out to NaN so they can be re-estimated below.
# A single .loc assignment writes to df itself; the chained form
# df[col][mask] = value may write to a temporary copy and leave df unchanged.
anomalous = (df['ENTRY_DELTA'] > 10000) | (df['ENTRY_DELTA'] < 0)
df.loc[anomalous, 'ENTRY_DELTA'] = np.nan

# Replace every NaN with the NaN-ignoring mean of its immediate neighbours.
# Values are patched in place, so a value filled earlier in the pass can serve
# as the left neighbour of the next NaN.
delta_list = list(df['ENTRY_DELTA'])
last_index = len(delta_list) - 1
for ind, value in enumerate(delta_list):
    if not np.isnan(value):
        continue
    # Only use neighbours that really exist: index -1 would wrap around to the
    # end of the list, and last_index + 1 would raise IndexError.
    neighbours = []
    if ind > 0:
        neighbours.append(delta_list[ind - 1])
    if ind < last_index:
        neighbours.append(delta_list[ind + 1])
    # Leave the NaN in place when there is no usable neighbour; np.nanmean
    # would only return NaN again (with a RuntimeWarning).
    if not np.all(np.isnan(neighbours)):
        delta_list[ind] = np.nanmean(neighbours)
@tanpengshi
tanpengshi / histplot.py
Last active September 7, 2020 15:52
Creating Histogram
# Total ENTRY_EXIT per station, ordered from largest to smallest.
station_totals = df.groupby('STATION')['ENTRY_EXIT'].sum()
group_station = station_totals.sort_values(ascending=False)

# Histogram of the per-station totals: 50 bins, counts only (no KDE curve).
fig1 = plt.figure(figsize=[8, 6])
ax1 = sns.distplot(group_station, bins=50, kde=False)
plt.xlim([0, 410000])

axis_label_style = {'fontsize': 15, 'weight': 'bold'}
plt.xlabel('Total Traffic', **axis_label_style)
plt.ylabel('No. of Stations', **axis_label_style)

# Captioned bracket marker, positioned in axes-fraction coordinates.
bracket_style = {'arrowstyle': '-[, widthB=9.0, lengthB=1', 'lw': 1.0}
ax1.annotate(
    'Top 10 Stations',
    xy=(0.73, 0.08),
    xytext=(0.73, 0.12),
    xycoords='axes fraction',
    fontsize=12,
    ha='center',
    va='bottom',
    arrowprops=bracket_style,
    color='blue',
)
@tanpengshi
tanpengshi / barplot.py
Created September 7, 2020 16:00
Creating Barplot
# Horizontal bar chart of the first ten entries of group_station.
plt.figure(figsize=[8, 5])
top_station_totals = group_station.head(10).reset_index()
ax = sns.barplot(
    data=top_station_totals,
    x='ENTRY_EXIT',
    y='STATION',
    palette='rainbow',
)

plt.title('Busiest MTA Stations from 22/8 to 28/8', weight='bold', fontsize='15')
plt.xlabel('Total Traffic', weight='bold', fontsize=15)
plt.ylabel('Top 10 Stations', weight='bold', fontsize=15)

# Ticks every 50,000 up to 400,000, labelled in thousands ("50k", "100k", ...).
tick_positions = range(0, 400001, 50000)
tick_labels = [str(int(position / 1000)) + 'k' for position in tick_positions]
plt.xticks(tick_positions, tick_labels)

# Write each bar's value (in thousands, truncated) at the end of the bar.
for bar in ax.patches:
    bar_length = bar.get_width()
    label_position = (bar_length, bar.get_y() + 0.5)
    ax.annotate(str(int(bar_length / 1000)) + 'k', label_position)

sns.despine()
@tanpengshi
tanpengshi / heatmap1.py
Created September 7, 2020 16:06
Creating Heatmap 1
# Keep only the rows for the ten stations at the head of group_station.
top_stations = list(group_station.head(10).index)
# .copy() gives df_10 its own data, so adding the WEEKDAY column below is an
# unambiguous write rather than an assignment on a slice of df (which raises
# SettingWithCopyWarning).
df_10 = df[df['STATION'].isin(top_stations)].copy()
df_10['WEEKDAY'] = df_10['DATE'].dt.day_name()

# Station x weekday table of summed ENTRY_EXIT.
group_station_day = df_10.groupby(['STATION', 'WEEKDAY'])['ENTRY_EXIT'].sum()
matrix_station_day = group_station_day.unstack()
# reset_index() is deliberately not called: it returns a new frame instead of
# modifying in place, and the STATION index is needed for the reindex below.
# Columns go in calendar order; rows follow the order of top_stations.
matrix_station_day = matrix_station_day.reindex(
    index=top_stations,
    columns=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"],
)

# String version of every cell, in thousands with one decimal (e.g. "12.3k").
array = np.array(matrix_station_day.applymap(lambda x: str(round(x / 1000, 1)) + 'k'))
fig2 = plt.figure(figsize=[10, 10])
@tanpengshi
tanpengshi / timeperiod.py
Created September 7, 2020 16:11
Creating Time Period
def timeperiod(time):
if time >= datetime.time(0,0,0) and time < datetime.time(4,0,0):
return "12am-4am"
elif time >= datetime.time(4,0,0) and time < datetime.time(8,0,0):
return "4am-8am"
elif time >= datetime.time(8,0,0) and time < datetime.time(12,0,0):
return "8am-12pm"
elif time >= datetime.time(12,0,0) and time < datetime.time(16,0,0):
return "12pm-4pm"
elif time >= datetime.time(16,0,0) and time < datetime.time(20,0,0):
@tanpengshi
tanpengshi / heatmap2.py
Created September 7, 2020 16:12
Creating Heatmap Matrix
# Build one weekday x time-period matrix of summed ENTRY_EXIT per station, in
# the order the stations appear in group_station.head(10).
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
period_order = ["12am-4am", "4am-8am", "8am-12pm", "12pm-4pm", "4pm-8pm", "8pm-12am"]

matrix_list = []
for station in group_station.head(10).index:
    df_station = df_10[df_10['STATION'] == station]
    group_day_time = df_station.groupby(['WEEKDAY', 'TIME_PERIOD'])['ENTRY_EXIT'].sum()
    # unstack() moves TIME_PERIOD into the columns. reset_index() is not
    # called: it returns a new frame, and the WEEKDAY index is needed by
    # reindex, which fixes the row/column order and puts NaN wherever a
    # weekday or period has no data.
    matrix_day_time = group_day_time.unstack().reindex(
        index=weekday_order,
        columns=period_order,
    )
    matrix_list.append(matrix_day_time)
@tanpengshi
tanpengshi / webscrape_fifa.py
Last active May 21, 2022 00:05
Webscraping with Beautiful Soup
import numpy as np
from bs4 import BeautifulSoup
import requests
# Module-level state set up before the `while True:` loop that follows.
# player_list starts empty; presumably the loop appends one entry per scraped
# player (TODO confirm against the loop body).
player_list = []
# URL template for the primary listing pages. `{}` is a str.format-style
# placeholder, presumably filled with pageno (TODO confirm).
url = 'https://www.fifaindex.com/players/{}/' # URL template for the primary pages
# Page counter, starting at 1.
pageno = 1
while True: