Skip to content

Instantly share code, notes, and snippets.

@makispl
Created September 2, 2021 19:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save makispl/bc9a3aae039bcf1b83992913b92dc52d to your computer and use it in GitHub Desktop.
Save makispl/bc9a3aae039bcf1b83992913b92dc52d to your computer and use it in GitHub Desktop.
# import the libraries
from nba_api.stats.endpoints import boxscoreadvancedv2
import sqlite3 as sql
import pandas as pd
import numpy as np
from multiprocessing import Pool
import requests
from functools import partial
from io import BytesIO
import os
from datetime import datetime
import config
# get proxies
def get_proxies(nba_api=True):
res = requests.get(
'https://api.proxyscrape.com/v2/?request=getproxies&protocol=http&timeout=3000&country=all&ssl=yes&anonymity=all&simplified=true')
content = res.content
res.close()
proxy_list = [item[0]
for item in pd.read_csv(BytesIO(content)).values.tolist()]
proxy_list = proxy_list + \
requests.get(
'http://pubproxy.com/api/proxy?limit=25&format=txt&http=true&country=US&type=http').content.decode("utf-8").split('\n')
proxy_list = proxy_list + ['165.225.77.42:8800', '165.225.77.44:80', '3.10.5.102:8080', '165.225.77.44:9400', '165.225.77.42:9443', '165.225.77.42:443',
'165.225.77.42:80', '165.225.77.47:443', '165.225.77.47:8800', '165.225.77.47:9443', '165.225.77.47:80', '165.225.77.47:9400',
'165.225.77.47:9401']
proxies = proxy_list
if not nba_api:
proxy_list = ["http://" + proxy for proxy in proxies]
proxies = {"http": proxy_list, "https": proxy_list}
return proxies
# get nba api endoints
def get_nba_api_endpoint(input_val, input_val_label, endpoint, process_response_dfs, proxies, game_ids):
# define helpful variables
if np.where(game_ids == input_val)[0].size > 0:
print("Making request #{} for game id: {} | {}".format(
np.where(game_ids == input_val)[0], input_val, datetime.now()))
no_res = True
proxy_collection_counter = 0
proxy_index = 0
arg = {input_val_label: input_val}
# while no response
while no_res:
# try getting a response without a proxy
try:
dfs = endpoint(**arg, proxy="http://" + proxies,
timeout=3).get_data_frames()
no_res = False
break
except:
# if that fails
while no_res:
# try getting with a certain proxy
try:
dfs = endpoint(
**arg, proxy="http://" + proxies[proxy_index], timeout=3).get_data_frames()
no_res = False
break
except:
# if that fails, move on to next proxy unless out of proxies
if (proxy_index + 1) >= len(proxies):
# unless tried proxies 5 times
if proxy_collection_counter < 5:
# if out of proxies: get more proxies, fix counters, and try without a proxy again
proxy_index = 0
proxy_collection_counter = proxy_collection_counter + 1
break
else:
return None
else:
proxy_index = proxy_index + 1
res = process_response_dfs(dfs)
return res
def process_response_dfs_box_advan(res):
"""
"""
return {'players': res[0], 'teams': res[1]}
# get players & teams dataframes
def get_df(game_ids):
"""
Takes in a dataframe of the 'GAME_ID',
returns 2 dataframes of player & team data
Parameters
---------
game_ids : a dataframe object
Contains the ID of each game
Returns
-------
player_df : a dataframe object
Contains the player attributes per game
team_df : a dataframe object
Contains the team attributes per game
"""
proxies = get_proxies()
dfs = map(partial(get_nba_api_endpoint, input_val_label='game_id', endpoint=boxscoreadvancedv2.BoxScoreAdvancedV2,
process_response_dfs=process_response_dfs_box_advan, proxies=proxies, game_ids=game_ids), game_ids)
print("successfully extracted all game ids ... concatenating and returning | {}".format(
datetime.now()))
player_df = pd.DataFrame()
teams_df = pd.DataFrame()
for df in dfs:
player_df = pd.concat([player_df, df['players']], axis=0)
teams_df = pd.concat([teams_df, df['teams']], axis=0)
return player_df, teams_df
# get seasons' game winners
def get_game_winners(db, seasons):
"""
Takes in the db and the seasons of interest,
returns a dataframe game winner teams
Parameters
---------
db : a str object
Contains the db's path
season: a list object
Contains the seasons years (2019-->22019, 2020-->22020, etc)
Returns
-------
game_winners_df : a dataframe object
Contains the per game winners 'TEAM_ID'
"""
q = str()
for i in range(len(seasons)):
if i < (len(seasons) - 1):
season = "'" + str(seasons[i]) + "'"
q = q + f"(SEASON_ID={season}) OR "
else:
season = "'" + str(seasons[-1]) + "'"
q = q + f"(SEASON_ID={season})"
# fetch the game data
with sql.connect(db) as conn:
game_info = pd.read_sql(
f"SELECT * FROM Game WHERE {q}",
conn
)
game_info = game_info.loc[:, [
'GAME_ID', 'GAME_DATE', 'TEAM_ID_HOME', 'TEAM_ID_AWAY', 'WL_HOME', 'SEASON_ID']]
# find the game winner teams
game_info['WINNER_ID'] = game_info['TEAM_ID_HOME']
game_info.loc[(game_info.loc[:, 'WL_HOME'] == 'L'),
'WINNER_ID'] = game_info.loc[:, 'TEAM_ID_AWAY']
game_winners_df = game_info.loc[:, [
'GAME_ID', 'WINNER_ID', 'GAME_DATE']].copy()
# drop duplicate values
game_winners_df.drop_duplicates(inplace=True)
return game_winners_df
# get players dataframes
def get_plays(player_df, game_winners_df, seasons):
"""
Takes in the player_df and game_winners_df dataframes,
returns the merged dataframe
Parameters
---------
player_df : a dataframe object
Contains the player attributes per game
game_winners_df: a dataframe object
contains the per game winners 'TEAM_ID'
season: a list object
Contains the seasons years (2019-->22019, 2020-->22020, etc)
Returns
-------
plays_df : a dataframe object
Contains the in-use dataframe of per game player attributes
"""
plays_df = pd.merge(
left=player_df,
right=game_winners_df,
how='left',
on='GAME_ID'
)
cols = [
'GAME_ID', 'GAME_DATE', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY',
'PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'START_POSITION', 'COMMENT', 'MIN',
'E_OFF_RATING', 'OFF_RATING', 'E_DEF_RATING', 'DEF_RATING',
'E_NET_RATING', 'NET_RATING', 'AST_PCT', 'AST_TOV', 'AST_RATIO',
'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'TM_TOV_PCT', 'EFG_PCT', 'TS_PCT',
'USG_PCT', 'E_USG_PCT', 'E_PACE', 'PACE', 'PACE_PER40', 'POSS', 'PIE'
]
plays_df = plays_df[cols].copy()
plays_df.to_csv(data_dir + 'plays_total.csv', index=False)
return plays_df
if __name__ == '__main__':
print(
"Please enter the desired season years (2019-->22019, 2020-->22020, etc) separated by comma, like:\n22019,22020"
)
# set seasons
seasons = input().split(',')
# set the db
db = config.EXTERNAL_DATA_FILE
# set the data directory
data_dir = config.RAW_DATA_OUTPUT
# extract the game winner teams
game_winners_df = get_game_winners(db, seasons)
# create the player and team datasets
player_df, teams_df = get_df(game_winners_df.GAME_ID)
# create and export the final dataset
get_plays(player_df, game_winners_df, seasons)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment