Tan Pengshi Alvin tanpengshi

## mta_df.csv

          
            C/A
            UNIT
            SCP
            STATION
            LINENAME
            DIVISION
            DATE
            TIME
            DESC
            ENTRIES
            EXITS

            
              0
              A002
              R051
              02-00-00
              59 ST
              NQR456W
              BMT
              2020-08-22
              00:00:00
              REGULAR
              7447810
              2532191

            
              1
              A002
              R051
              02-00-00
              59 ST
              NQR456W
              BMT
              2020-08-22
              04:00:00
              REGULAR
              7447812
              2532197

            
              2
              A002
              R051
              02-00-00
              59 ST
              NQR456W
              BMT
              2020-08-22
              08:00:00
              REGULAR
              7447824
              2532208

            
              3
              A002
              R051
              02-00-00
              59 ST
              NQR456W
              BMT
              2020-08-22
              12:00:00
              REGULAR
              7447852
              2532248

            
              4
              A002
              R051
              02-00-00
              59 ST
              NQR456W
              BMT
              2020-08-22
              16:00:00
              REGULAR
              7447937
              2532276

## MTA_Data_Blog.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                tanpengshi
                / MTA_Data_Blog.ipynb
            
            
              Created
              September 7, 2020 15:01
            
              
                Unique Turnstile Analysis
              
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## MTA_Data3_Blog2.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                tanpengshi
                / MTA_Data3_Blog2.ipynb
            
            
              Created
              September 7, 2020 15:21
            
              
                Turnstile Counter Reset
              
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## delta.py
# Turnstile counter resets show up as negative or implausibly large jumps in
# ENTRY_DELTA. Blank those readings out as NaN so they can be filled from
# their neighbours below. Series.mask is used instead of chained indexing
# (df[col][cond] = ...), which pandas does not guarantee will write back to df.
anomalous = (df['ENTRY_DELTA'] > 10000) | (df['ENTRY_DELTA'] < 0)
df['ENTRY_DELTA'] = df['ENTRY_DELTA'].mask(anomalous)

# Replace each NaN with the mean of the readings on either side of it.
# The fill happens in place, so a value filled earlier in the pass can feed
# into the fill of a NaN that follows it. A NaN whose existing neighbours are
# all NaN stays NaN (np.nanmean of nothing but NaNs is NaN).
delta_list = list(df['ENTRY_DELTA'])
last = len(delta_list) - 1
for ind, i in enumerate(delta_list):
    if np.isnan(i):
        # Only use neighbours that actually exist: the first and last rows
        # have one neighbour each. Indexing ind-1 at row 0 would wrap round
        # to the end of the list, and ind+1 on the final row would raise
        # IndexError.
        neighbours = []
        if ind > 0:
            neighbours.append(delta_list[ind - 1])
        if ind < last:
            neighbours.append(delta_list[ind + 1])
        delta_list[ind] = np.nanmean(neighbours)

## histplot.py
# Total ENTRY_EXIT traffic per station, busiest station first
station_totals = df.groupby('STATION')['ENTRY_EXIT'].sum()
group_station = station_totals.sort_values(ascending=False)

# Histogram (50 bins, no KDE curve) of the per-station totals
fig1 = plt.figure(figsize=[8, 6])
ax1 = sns.distplot(group_station, bins=50, kde=False)
plt.xlim([0, 410000])
plt.xlabel('Total Traffic', fontsize=15, weight='bold')
plt.ylabel('No. of Stations', fontsize=15, weight='bold')

# Bracket-style marker labelled 'Top 10 Stations', placed in axes-fraction
# coordinates
bracket = {'arrowstyle': '-[, widthB=9.0, lengthB=1', 'lw': 1.0}
ax1.annotate(
    'Top 10 Stations',
    xy=(0.73, 0.08),
    xytext=(0.73, 0.12),
    xycoords='axes fraction',
    fontsize=12,
    ha='center',
    va='bottom',
    arrowprops=bracket,
    color='blue',
)

## barplot.py
# Horizontal bar chart of the first ten entries of group_station
plt.figure(figsize=[8, 5])
top_stations = group_station.head(10).reset_index()
ax = sns.barplot(data=top_stations, x='ENTRY_EXIT', y='STATION', palette='rainbow')

plt.xlabel('Total Traffic', weight='bold', fontsize=15)
plt.ylabel('Top 10 Stations', weight='bold', fontsize=15)

# Tick every 50,000 up to 400,000, labelled in thousands ('0k', '50k', ...)
tick_positions = range(0, 400001, 50000)
tick_labels = [str(int(pos / 1000)) + 'k' for pos in tick_positions]
plt.xticks(tick_positions, tick_labels)

plt.title('Busiest MTA Stations from 22/8 to 28/8', weight='bold', fontsize='15')

# Label each bar with its value in thousands, positioned at the bar's end
for bar in ax.patches:
    bar_width = bar.get_width()
    ax.annotate(str(int(bar_width / 1000)) + 'k', (bar_width, bar.get_y() + 0.5))

sns.despine()

## heatmap1.py
# Restrict the data to the first ten stations of group_station. The explicit
# .copy() makes df_10 an independent frame, so adding the WEEKDAY column
# below does not trigger pandas' SettingWithCopyWarning.
top_10_stations = list(group_station.head(10).index)
df_10 = df[df['STATION'].isin(top_10_stations)].copy()
df_10['WEEKDAY'] = df_10['DATE'].dt.day_name()

# Station x weekday matrix of summed ENTRY_EXIT traffic.
# (A bare `matrix_station_day.reset_index()` call was dropped here: its
# result was discarded, so it had no effect.)
group_station_day = df_10.groupby(['STATION','WEEKDAY'])['ENTRY_EXIT'].sum()
matrix_station_day = group_station_day.unstack()
# Columns in calendar order, rows in the same order as group_station
matrix_station_day = matrix_station_day.reindex(columns=["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"])
matrix_station_day = matrix_station_day.reindex(index=top_10_stations)

# Per-cell text labels in thousands with one decimal, e.g. '12.3k'
# (presumably used as heat-map annotations -- the plotting call is not shown)
array = np.array(matrix_station_day.applymap(lambda x:str(round(x/1000,1))+'k'))
fig2 = plt.figure(figsize=[10,10])

## timeperiod.py
def timeperiod(time):
    if time >= datetime.time(0,0,0) and time < datetime.time(4,0,0):
        return "12am-4am"
    elif time >= datetime.time(4,0,0) and time < datetime.time(8,0,0):
        return "4am-8am"
    elif time >= datetime.time(8,0,0) and time < datetime.time(12,0,0):
        return "8am-12pm"
    elif time >= datetime.time(12,0,0) and time < datetime.time(16,0,0):
        return "12pm-4pm"
    elif time >= datetime.time(16,0,0) and time < datetime.time(20,0,0):

## heatmap2.py
# Build one weekday x time-period traffic matrix per station, for the first
# ten stations of group_station, collected in that order.
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
period_order = ["12am-4am","4am-8am","8am-12pm","12pm-4pm","4pm-8pm","8pm-12am"]

matrix_list = []
for station in group_station.head(10).index:
    df_station = df_10[df_10['STATION']==station]
    group_day_time = df_station.groupby(['WEEKDAY','TIME_PERIOD'])['ENTRY_EXIT'].sum()
    matrix_day_time = group_day_time.unstack()
    # (A bare `matrix_day_time.reset_index()` call was dropped here: its
    # result was discarded, so it had no effect.)
    # Rows in calendar order, columns in chronological order; any weekday or
    # period missing from the data becomes an all-NaN row/column.
    matrix_day_time = matrix_day_time.reindex(index=weekday_order, columns=period_order)
    matrix_list.append(matrix_day_time)

## webscrape_fifa.py
import numpy as np
from bs4 import BeautifulSoup
import requests

# Starts empty; presumably collects the scraped player records -- the code
# that fills it is not visible in this excerpt.
player_list = []
# URL template for the primary listing pages; presumably '{}' is filled with
# the page number -- TODO confirm against the loop body.
url = 'https://www.fifaindex.com/players/{}/'

pageno = 1  # page counter, starting at 1

while True:
	C/A	UNIT	SCP	STATION	LINENAME	DIVISION	DATE	TIME	DESC	ENTRIES	EXITS
0	A002	R051	02-00-00	59 ST	NQR456W	BMT	2020-08-22	00:00:00	REGULAR	7447810	2532191
1	A002	R051	02-00-00	59 ST	NQR456W	BMT	2020-08-22	04:00:00	REGULAR	7447812	2532197
2	A002	R051	02-00-00	59 ST	NQR456W	BMT	2020-08-22	08:00:00	REGULAR	7447824	2532208
3	A002	R051	02-00-00	59 ST	NQR456W	BMT	2020-08-22	12:00:00	REGULAR	7447852	2532248
4	A002	R051	02-00-00	59 ST	NQR456W	BMT	2020-08-22	16:00:00	REGULAR	7447937	2532276
	# Turnstile counter resets show up as negative or implausibly large jumps in
	# ENTRY_DELTA. Blank those readings out as NaN so they can be filled from
	# their neighbours below. Series.mask is used instead of chained indexing
	# (df[col][cond] = ...), which pandas does not guarantee will write back to df.
	anomalous = (df['ENTRY_DELTA'] > 10000) | (df['ENTRY_DELTA'] < 0)
	df['ENTRY_DELTA'] = df['ENTRY_DELTA'].mask(anomalous)

	# Replace each NaN with the mean of the readings on either side of it.
	# The fill happens in place, so a value filled earlier in the pass can feed
	# into the fill of a NaN that follows it. A NaN whose existing neighbours
	# are all NaN stays NaN (np.nanmean of nothing but NaNs is NaN).
	delta_list = list(df['ENTRY_DELTA'])
	last = len(delta_list) - 1
	for ind, i in enumerate(delta_list):
	    if np.isnan(i):
	        # Only use neighbours that actually exist: the first and last rows
	        # have one neighbour each. Indexing ind-1 at row 0 would wrap round
	        # to the end of the list, and ind+1 on the final row would raise
	        # IndexError.
	        neighbours = []
	        if ind > 0:
	            neighbours.append(delta_list[ind - 1])
	        if ind < last:
	            neighbours.append(delta_list[ind + 1])
	        delta_list[ind] = np.nanmean(neighbours)
	# Total ENTRY_EXIT traffic per station, busiest station first
	station_totals = df.groupby('STATION')['ENTRY_EXIT'].sum()
	group_station = station_totals.sort_values(ascending=False)

	# Histogram (50 bins, no KDE curve) of the per-station totals
	fig1 = plt.figure(figsize=[8, 6])
	ax1 = sns.distplot(group_station, bins=50, kde=False)
	plt.xlim([0, 410000])
	plt.xlabel('Total Traffic', fontsize=15, weight='bold')
	plt.ylabel('No. of Stations', fontsize=15, weight='bold')

	# Bracket-style marker labelled 'Top 10 Stations', placed in axes-fraction
	# coordinates
	bracket = {'arrowstyle': '-[, widthB=9.0, lengthB=1', 'lw': 1.0}
	ax1.annotate(
	    'Top 10 Stations',
	    xy=(0.73, 0.08),
	    xytext=(0.73, 0.12),
	    xycoords='axes fraction',
	    fontsize=12,
	    ha='center',
	    va='bottom',
	    arrowprops=bracket,
	    color='blue',
	)
	# Horizontal bar chart of the first ten entries of group_station
	plt.figure(figsize=[8, 5])
	top_stations = group_station.head(10).reset_index()
	ax = sns.barplot(data=top_stations, x='ENTRY_EXIT', y='STATION', palette='rainbow')

	plt.xlabel('Total Traffic', weight='bold', fontsize=15)
	plt.ylabel('Top 10 Stations', weight='bold', fontsize=15)

	# Tick every 50,000 up to 400,000, labelled in thousands ('0k', '50k', ...)
	tick_positions = range(0, 400001, 50000)
	tick_labels = [str(int(pos / 1000)) + 'k' for pos in tick_positions]
	plt.xticks(tick_positions, tick_labels)

	plt.title('Busiest MTA Stations from 22/8 to 28/8', weight='bold', fontsize='15')

	# Label each bar with its value in thousands, positioned at the bar's end.
	# The annotate call is indented under the loop so the for statement has a body.
	for bar in ax.patches:
	    bar_width = bar.get_width()
	    ax.annotate(str(int(bar_width / 1000)) + 'k', (bar_width, bar.get_y() + 0.5))

	sns.despine()
	# Restrict the data to the first ten stations of group_station. The explicit
	# .copy() makes df_10 an independent frame, so adding the WEEKDAY column
	# below does not trigger pandas' SettingWithCopyWarning.
	top_10_stations = list(group_station.head(10).index)
	df_10 = df[df['STATION'].isin(top_10_stations)].copy()
	df_10['WEEKDAY'] = df_10['DATE'].dt.day_name()

	# Station x weekday matrix of summed ENTRY_EXIT traffic.
	# (A bare `matrix_station_day.reset_index()` call was dropped here: its
	# result was discarded, so it had no effect.)
	group_station_day = df_10.groupby(['STATION','WEEKDAY'])['ENTRY_EXIT'].sum()
	matrix_station_day = group_station_day.unstack()
	# Columns in calendar order, rows in the same order as group_station
	matrix_station_day = matrix_station_day.reindex(columns=["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"])
	matrix_station_day = matrix_station_day.reindex(index=top_10_stations)

	# Per-cell text labels in thousands with one decimal, e.g. '12.3k'
	# (presumably used as heat-map annotations -- the plotting call is not shown)
	array = np.array(matrix_station_day.applymap(lambda x:str(round(x/1000,1))+'k'))
	fig2 = plt.figure(figsize=[10,10])
	def timeperiod(time):
	if time >= datetime.time(0,0,0) and time < datetime.time(4,0,0):
	return "12am-4am"
	elif time >= datetime.time(4,0,0) and time < datetime.time(8,0,0):
	return "4am-8am"
	elif time >= datetime.time(8,0,0) and time < datetime.time(12,0,0):
	return "8am-12pm"
	elif time >= datetime.time(12,0,0) and time < datetime.time(16,0,0):
	return "12pm-4pm"
	elif time >= datetime.time(16,0,0) and time < datetime.time(20,0,0):
	# Build one weekday x time-period traffic matrix per station, for the first
	# ten stations of group_station, collected in that order.
	weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
	period_order = ["12am-4am","4am-8am","8am-12pm","12pm-4pm","4pm-8pm","8pm-12am"]

	matrix_list = []
	for station in group_station.head(10).index:
	    # Loop body indented under the for statement so it parses as a block
	    df_station = df_10[df_10['STATION']==station]
	    group_day_time = df_station.groupby(['WEEKDAY','TIME_PERIOD'])['ENTRY_EXIT'].sum()
	    matrix_day_time = group_day_time.unstack()
	    # (A bare `matrix_day_time.reset_index()` call was dropped here: its
	    # result was discarded, so it had no effect.)
	    # Rows in calendar order, columns in chronological order; any weekday or
	    # period missing from the data becomes an all-NaN row/column.
	    matrix_day_time = matrix_day_time.reindex(index=weekday_order, columns=period_order)
	    matrix_list.append(matrix_day_time)
	import numpy as np
	from bs4 import BeautifulSoup
	import requests

	# Starts empty; presumably collects the scraped player records -- the code
	# that fills it is not visible in this excerpt.
	player_list = []
	# URL template for the primary listing pages; presumably '{}' is filled with
	# the page number -- TODO confirm against the loop body.
	url = 'https://www.fifaindex.com/players/{}/'

	pageno = 1  # page counter, starting at 1

	while True: