{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pybaseball\n",
"import requests\n",
"import requests_cache\n",
"import pandas as pd\n",
"import numpy as np\n",
"import math\n",
"from sklearn.externals import joblib\n",
"from scipy import stats\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from datetime import datetime\n",
"import json\n",
"import psycopg2\n",
"import sqlalchemy\n",
"from bs4 import BeautifulSoup\n",
"import urllib.request\n",
"import csv\n",
"import heapq\n",
"np.set_printoptions(suppress=True)\n",
"np.set_printoptions(precision=2)\n",
"\n",
"\n",
"\n",
"team_array = ['LAA','ARI','ATL','BAL','BOS','CWS','CHC','CIN','CLE','COL','DET','MIA','HOU','KC','LAD','MIL','MIN','NYY','NYM','OAK','PHI','PIT','SD','SEA','SF','STL','TB','TEX','TOR','WSH']\n",
"altitude_array = [151,1086,930,33,21,595,595,535,653,5190,600,10,45,865,515,597,815,55,10,3,20,780,23,10,0,460,15,545,270,35]\n",
"\n",
"\n",
"\n",
"def train_data(statcast_data):\n",
" \n",
" pos_id_range = []\n",
" pos_id_dp = []\n",
" pos_id_dp_pivot = []\n",
" pos_id_arm = []\n",
" \n",
" catch_result = []\n",
" train_parameter_range = [] #exit speed, exit angle, theta, altitude, handness, shift, date, 1b,2b,3b ,out\n",
"\n",
" dp_result = []\n",
" train_parameter_dp = [] #exit speed, exit angle, hc_x, hc_y, altitude, handness, shift, date, on_2b, on_3b\n",
"\n",
" arm_result = []\n",
" train_parameter_arm = [] #exit speed, exit angle, hc_x, hc_y, park, handness, shift, date,condition\n",
" extra_base_case_count = []\n",
" kill_id = []\n",
"\n",
" for count,item in enumerate(statcast_data['des']):\n",
" if(count % 10000 == 0):\n",
" print(count)\n",
"\n",
" if item is None:\n",
" continue\n",
"\n",
" # error = item.find('error')\n",
" # if(error != -1):\n",
" # continue\n",
" # bunt = item.find('bunt')\n",
" # if(bunt != -1):\n",
" # bunt_indicator = 1\n",
" # if(len(pos_catch_array) == 0 ):\n",
" # continue\n",
" # if(statcast_2015_data['bb_type'][count] == 'fly_ball' or statcast_2015_data['bb_type'][count] == 'popup' ):\n",
" # continue\n",
" # if(min(pos_catch_array) == pos_p or min(pos_catch_array) == pos_c):\n",
" # continue \n",
" launch_speed_data = statcast_data['launch_speed'][count] \n",
" launch_angle_data = statcast_data['launch_angle'][count] \n",
"\n",
" on_1b = 1\n",
" on_2b = 1\n",
" on_3b = 1\n",
"\n",
" if(pd.isna(statcast_data['on_1b'][count])):\n",
" on_1b = 0\n",
" if(pd.isna(statcast_data['on_2b'][count])):\n",
" on_2b = 0\n",
" if(pd.isna(statcast_data['on_3b'][count])):\n",
" on_3b = 0 \n",
"\n",
" out_at_2nd = item.find('out at 2nd') \n",
" out_at_3rd = item.find('out at 3rd') \n",
" out_at_home = item.find('out at home') \n",
"\n",
" to_2nd = item.find('to 2nd') \n",
" to_3rd = item.find('to 3rd') \n",
" scores = item.find('scores') \n",
"\n",
" game_day = statcast_data['game_date'][count].dayofyear\n",
"\n",
" hit_location = statcast_data['hit_location'][count]\n",
" events = statcast_data['events'][count]\n",
"\n",
" altitude = 0\n",
" park = 0\n",
" for i, team in enumerate(team_array):\n",
"\n",
" if(statcast_data['home_team'][count] == team):\n",
" altitude = altitude_array[i]\n",
" park = i \n",
" # infield_height = baseball_tracjectory_sim(launch_speed_data,launch_angle_data,altitude)\n",
" theta = math.degrees(math.atan((128-statcast_data['hc_x'][count])/(204-statcast_data['hc_y'][count])))\n",
"\n",
"\n",
" hc_x = 128-statcast_data['hc_x'][count]\n",
" hc_y = 204-statcast_data['hc_y'][count]\n",
" # if(theta > 55):\n",
" # continue\n",
" if(statcast_data['stand'][count] == 'L'):\n",
" handness = 1\n",
" else:\n",
" handness = 0\n",
"\n",
" if(statcast_data['if_fielding_alignment'][count] == 'Standard'):\n",
" shift = 0\n",
" elif(statcast_data['if_fielding_alignment'][count] == 'Strategic'):\n",
" shift = 1 \n",
" else:\n",
" shift = 2\n",
"\n",
" out_when_up = statcast_data['outs_when_up'][count]\n",
"\n",
"\n",
"\n",
" train_parameter_range.append([launch_speed_data, launch_angle_data, theta, altitude, handness, shift, game_day,on_1b, on_2b, on_3b, out_when_up])\n",
"\n",
" lineup_id = []\n",
" for i in range(9):\n",
"\n",
" lineup_id.append(statcast_data['pos' + str(i+1) + '_person_id'][count])\n",
"\n",
" pos_id_range.append(lineup_id)\n",
"\n",
" if(pd.isna(hit_location)):\n",
" catch_result.append(0)\n",
" else:\n",
" catch_result.append(hit_location)\n",
"\n",
" dp_indicator = 0 \n",
" if(hit_location >= 0 and to_2nd != -1): \n",
" dp_indicator = 1\n",
"\n",
"\n",
" try:\n",
" pos_ss = item.find('shortstop')\n",
" except AttributeError:\n",
" continue\n",
" pos_1b = item.find('first baseman')\n",
" pos_2b = item.find('second baseman')\n",
" pos_3b = item.find('third baseman')\n",
" pos_p = item.find('pitcher')\n",
" pos_lf = item.find('left fielder')\n",
" pos_cf = item.find('center fielder')\n",
" pos_rf = item.find('right fielder')\n",
" pos_c = item.find('catcher')\n",
"\n",
"\n",
" pos_array = np.array([pos_p,pos_c,pos_1b,pos_2b,pos_3b,pos_ss,pos_lf,pos_cf,pos_rf])\n",
" pos_catch_array = pos_array[np.where(pos_array>0)]\n",
"\n",
"\n",
" if(len(pos_catch_array) == 0 ):\n",
" continue\n",
"\n",
" # dp_indicator = 0 \n",
" # if(hit_location >= 0 and to_2nd != -1): \n",
" # dp_indicator = 1 \n",
" first_receiver = 0\n",
" for i in range(9):\n",
" if(min(pos_catch_array) == pos_array[i]):\n",
" first_receiver = i\n",
"\n",
" if(on_1b == 1 and out_when_up < 2 ):\n",
"\n",
" for i in range(9):\n",
" if(min(pos_catch_array) == pos_array[i]):\n",
"\n",
" pos_id_dp.append(statcast_data['pos'+ str(i+1) + '_person_id'][count])\n",
"\n",
" second_min = heapq.nsmallest(2, pos_catch_array)[-1] \n",
" if(len(pos_catch_array) >= 2 and (second_min == pos_1b or second_min == pos_2b or second_min == pos_ss or second_min == pos_3b)):\n",
"\n",
" for i in range(9):\n",
" if(second_min == pos_array[i]):\n",
" pos_id_dp_pivot.append(statcast_data['pos'+ str(i+1) + '_person_id'][count])\n",
"\n",
"\n",
" else:\n",
" pos_id_dp_pivot.append(0)\n",
"\n",
"\n",
" train_parameter_dp.append([launch_speed_data, launch_angle_data, hc_x, hc_y, altitude, handness, shift, first_receiver])\n",
"\n",
"\n",
"\n",
" # double_play = item.find('double play')\n",
" if(statcast_data['events'][count] == 'grounded_into_double_play' ):\n",
" dp_result.append(1)\n",
" else:\n",
" dp_result.append(0)\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" flies_out = item.find('flies out')\n",
" lines_out = item.find('lines out')\n",
"\n",
"\n",
"\n",
" if(min(pos_catch_array) == pos_lf or min(pos_catch_array) == pos_cf or min(pos_catch_array) == pos_rf):\n",
"\n",
" if((events == 'single' and (on_1b == 1 or on_2b == 1) )or (events == 'double' and on_1b == 1) or ((flies_out != -1 or lines_out != -1 )and (on_2b == 1 or on_3b == 1) and out_when_up < 2 )):\n",
" \n",
" if(min(pos_catch_array) == pos_lf):\n",
" first_receiver_id = (statcast_data['pos7_person_id'][count])\n",
" elif(min(pos_catch_array) == pos_cf):\n",
" first_receiver_id =(statcast_data['pos8_person_id'][count])\n",
" else:\n",
" first_receiver_id =(statcast_data['pos9_person_id'][count])\n",
" \n",
" pos_id_arm.append(first_receiver_id)\n",
" \n",
" \n",
" if(events == 'single' and on_2b == 1): \n",
"\n",
" extra_base_case = 0\n",
" extra_base_case_count.append(extra_base_case)\n",
" \n",
" if(item.find('out at home') != -1):\n",
" kill_id.append(first_receiver_id)\n",
" \n",
" if(scores != -1 or out_at_home != -1):\n",
"\n",
" arm_result.append(1)\n",
" else: \n",
" arm_result.append(0) \n",
"\n",
" elif(events == 'single' and on_1b == 1): \n",
"\n",
" extra_base_case = 1\n",
" extra_base_case_count.append(extra_base_case)\n",
" \n",
" if(item.find('out at third') != -1):\n",
" kill_id.append(first_receiver_id) \n",
"\n",
" if(to_2nd != -1 or out_at_2nd != -1):\n",
"\n",
" arm_result.append(0)\n",
" else: \n",
" arm_result.append(1)\n",
"\n",
" elif(events == 'double' and on_1b == 1):\n",
"\n",
" extra_base_case = 2\n",
" extra_base_case_count.append(extra_base_case)\n",
" \n",
" if(item.find('out at third') != -1):\n",
" kill_id.append(first_receiver_id) \n",
" \n",
" if(scores != -1 or out_at_home != -1):\n",
"\n",
" arm_result.append(1)\n",
" else: \n",
" arm_result.append(0) \n",
"\n",
" elif((flies_out != -1 or lines_out != -1 )and (on_3b == 1) and statcast_data['outs_when_up'][count] < 2):\n",
"\n",
" extra_base_case = 3\n",
" extra_base_case_count.append(extra_base_case)\n",
" \n",
" if(item.find('out at home') != -1):\n",
" kill_id.append(first_receiver_id) \n",
" \n",
" if(scores != -1 or out_at_home != -1):\n",
"\n",
" arm_result.append(1)\n",
" else: \n",
" arm_result.append(0) \n",
" else:\n",
" extra_base_case = 4\n",
" extra_base_case_count.append(extra_base_case)\n",
" \n",
" if(item.find('out at third') != -1):\n",
" kill_id.append(first_receiver_id) \n",
" \n",
" if(to_3rd != -1 or out_at_3rd != -1):\n",
"\n",
" arm_result.append(1)\n",
" else: \n",
" arm_result.append(0) \n",
"\n",
" train_parameter_arm.append([launch_speed_data, launch_angle_data, hc_x, hc_y, park, handness, shift, game_day, extra_base_case, first_receiver,out_when_up])\n",
"\n",
"\n",
"\n",
" print(len(catch_result)) \n",
" \n",
" return pos_id_range,pos_id_dp,pos_id_dp_pivot,pos_id_arm,catch_result,train_parameter_range ,dp_result,train_parameter_dp,arm_result,train_parameter_arm,extra_base_case_count,kill_id\n",
"\n",
"\n"
]
},
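{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Illustrative sketch (not part of the original pipeline): theta above is a\n",
"#spray angle derived from the Statcast hit coordinates, using (128, 204) as\n",
"#the origin in hc_x/hc_y units, as in train_data. The sample coordinates\n",
"#below are made up purely to show the calculation.\n",
"import math\n",
"\n",
"sample_hc_x, sample_hc_y = 100.0, 150.0 #hypothetical hit coordinates\n",
"sample_theta = math.degrees(math.atan((128 - sample_hc_x)/(204 - sample_hc_y)))\n",
"print(round(sample_theta, 1))\n"
]
},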
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Get statcast data\n",
"\n",
"conn = psycopg2.connect(\"dbname=dbname user=user host=host password=password\")\n",
"cur = conn.cursor()\n",
"engine = sqlalchemy.create_engine('dialect+driver://username:password@host:port/database', echo=False)\n",
"\n",
"\n",
"\n",
"data_April_18 = pybaseball.statcast(start_dt='2018-04-03', end_dt='2018-05-02')\n",
"data_May_18 = pybaseball.statcast(start_dt='2018-05-03', end_dt='2018-06-02')\n",
"data_June_18 = pybaseball.statcast(start_dt='2018-06-03', end_dt='2018-07-02')\n",
"\n",
"\n",
"\n",
"data_April_18.to_sql('baseball_2018', con=engine, if_exists='replace')\n",
"data_May_18.to_sql('baseball_2018', con=engine, if_exists='append')\n",
"data_June_18.to_sql('baseball_2018', con=engine, if_exists='append')\n",
"\n",
"data_April_17 = pybaseball.statcast(start_dt='2017-04-03', end_dt='2017-05-02')\n",
"data_May_17 = pybaseball.statcast(start_dt='2017-05-03', end_dt='2017-06-02')\n",
"data_June_17 = pybaseball.statcast(start_dt='2017-06-03', end_dt='2017-07-02')\n",
"data_July_17 = pybaseball.statcast(start_dt='2017-07-03', end_dt='2017-08-02')\n",
"data_August_17 = pybaseball.statcast(start_dt='2017-08-03', end_dt='2017-09-02')\n",
"data_September_17 = pybaseball.statcast(start_dt='2017-09-03', end_dt='2017-10-02')\n",
"\n",
"\n",
"data_April.to_sql('baseball_2018', con=engine, if_exists='replace')\n",
"data_May.to_sql('baseball_2018', con=engine, if_exists='append')\n",
"data_June.to_sql('baseball_2018', con=engine, if_exists='append')\n",
"\n",
"data_April.to_sql('baseball_2017', con=engine, if_exists='replace')\n",
"data_May.to_sql('baseball_2017', con=engine, if_exists='append')\n",
"data_June.to_sql('baseball_2017', con=engine, if_exists='append')\n",
"data_July.to_sql('baseball_2017', con=engine, if_exists='append')\n",
"data_August.to_sql('baseball_2017', con=engine, if_exists='append')\n",
"data_September.to_sql('baseball_2017', con=engine, if_exists='append')"
]
},
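{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Optional sketch: requests_cache is imported at the top of the notebook but\n",
"#never activated. Installing a cache before the pybaseball.statcast calls\n",
"#above would avoid re-downloading the same date ranges on re-runs. The cache\n",
"#name 'statcast_cache' is arbitrary.\n",
"import requests_cache\n",
"\n",
"requests_cache.install_cache('statcast_cache')\n",
"#e.g. data_April_18 = pybaseball.statcast(start_dt='2018-04-03', end_dt='2018-05-02')\n"
]
},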
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Train with 2017 Statcast data\n",
"\n",
"query = '''\n",
"SELECT des,game_date,bb_type,home_team,events,description,launch_speed, launch_angle, on_3b, on_2b, on_1b,stand,\n",
"zone,outs_when_up,hc_x,hc_y,hit_location,\n",
"pos1_person_id, pos2_person_id, pos3_person_id, \n",
"pos4_person_id, pos5_person_id, pos6_person_id,\n",
"pos7_person_id, pos8_person_id, pos9_person_id, \n",
"if_fielding_alignment, of_fielding_alignment\n",
"\n",
"FROM baseball_2017\n",
"WHERE launch_speed >= 0 AND description != 'foul' AND pos1_person_id >= 0\n",
"AND events != 'walk' AND events != 'hit_by_pitch' AND hc_x >1 AND hc_y > 1 \n",
"ORDER BY game_date\n",
"'''\n",
"\n",
"\n",
"statcast_2017_data = pd.read_sql(query,engine) \n",
" \n",
"pos_id_range,pos_id_dp,pos_id_dp_pivot,pos_id_arm,catch_result,train_parameter_range ,dp_result,train_parameter_dp,arm_result,train_parameter_arm,extra_base_case_count,kill_id = train_data(statcast_2017_data) \n",
" \n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Plus-minus run\n",
" \n",
"label = ['launch speed', 'exit angle', 'theta' ,'altitude','handness','shift','date', 'on_1b', 'on_2b', 'on_3b','out'] \n",
"\n",
"\n",
"df = pd.DataFrame.from_records(np.array(train_parameter_range).tolist(),columns = label) \n",
"X_train, X_test, y_train, y_test = train_test_split(df[label], catch_result, test_size=0.5, stratify=catch_result, random_state=123456) \n",
"rf_catch = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=123456)\n",
"rf_catch.fit(X_train, y_train)\n",
"\n",
"joblib.dump(rf_catch, 'rf_catch_17.pkl')\n"
]
},
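{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Optional check (not in the original gist): with oob_score=True the fitted\n",
"#forest exposes an out-of-bag accuracy estimate and per-feature importances.\n",
"#Assumes rf_catch and label from the previous cell are still in scope.\n",
"print('OOB score:', round(rf_catch.oob_score_, 3))\n",
"for name, importance in sorted(zip(label, rf_catch.feature_importances_), key=lambda t: -t[1]):\n",
"    print(name, round(importance, 3))\n"
]
},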
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Double play run\n",
"\n",
"label = ['launch speed', 'exit angle', 'hc_x' , 'hc_y', 'altitude','handness','shift','first_receiver'] \n",
"\n",
" \n",
"df2 = pd.DataFrame.from_records(np.array(train_parameter_dp).tolist(),columns = label) \n",
"X_train, X_test, y_train, y_test = train_test_split(df2[label], dp_result, test_size=0.3, stratify=dp_result, random_state=123456)\n",
"rf_dp = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=123456)\n",
"rf_dp.fit(X_train, y_train)\n",
"joblib.dump(rf_dp, 'rf_dp_17.pkl')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Arm run\n",
"\n",
"label = ['launch speed', 'exit angle', 'hc_x' , 'hc_y', 'altitude','handness','shift','date', 'condition','first_receiver'] \n",
"\n",
"\n",
"df3 = pd.DataFrame.from_records(np.array(train_parameter_arm).tolist(),columns = label) \n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(df3[label], arm_result, test_size=0.3, stratify=arm_result, random_state=123456)\n",
"\n",
"rf_arm = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=123456)\n",
"rf_arm.fit(X_train, y_train)\n",
"joblib.dump(rf_arm, 'rf_arm_17.pkl')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Framing run\n",
"\n",
"query = '''\n",
"SELECT release_speed, release_pos_x, release_pos_z,\n",
"description, stand, p_throws, pfx_x, pfx_z, plate_x, plate_z,sz_top,sz_bot,vx0,vz0,\n",
"effective_speed,release_spin_rate,release_extension,zone,\n",
"pitcher,pos2_person_id,\n",
"ax,ay,az,pitch_name,home_team,balls,strikes,inning_topbot\n",
"\n",
"\n",
"\n",
"\n",
"FROM baseball_2017\n",
"WHERE description = 'ball' OR description = 'called_strike' \n",
"AND release_speed >= 0 AND plate_x <100 AND plate_z <100 \n",
"'''\n",
"statcast_2017_pitch_data = pd.read_sql(query,engine)\n",
"#statcast_2017_data\n",
"\n",
"statcast_pitch_data = statcast_2017_pitch_data.dropna(axis=0, how='any')\n",
"\n",
"train_pitch_parameter = pd.get_dummies(statcast_pitch_data[['plate_x','plate_z', 'stand' , 'p_throws', 'release_extension','release_speed','release_spin_rate' ,'release_pos_x','release_pos_z','vx0','vz0']]) \n",
"\n",
"pitch_result = []\n",
"\n",
"for description in statcast_pitch_data['description']:\n",
" if(description == 'ball'):\n",
" pitch_result.append(0)\n",
" else:\n",
" pitch_result.append(1)\n",
" \n",
"X_train, X_test, y_train, y_test = train_test_split(train_pitch_parameter, pitch_result, test_size=0.3, stratify=pitch_result, random_state=123456)\n",
"rf_pitch = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=123456)\n",
"rf_pitch.fit(X_train, y_train)\n",
"joblib.dump(rf_pitch, 'rf_pitch_17.pkl')"
]
},
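{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Caveat sketch (assumption, not in the original gist): pd.get_dummies builds\n",
"#columns from the categories present in each dataset, so the 2018 scoring\n",
"#frame built later should be aligned to this training layout if the columns\n",
"#ever differ. train_pitch_columns is just the saved training column order.\n",
"train_pitch_columns = train_pitch_parameter.columns\n",
"#later: scoring_frame = scoring_frame.reindex(columns=train_pitch_columns, fill_value=0)\n"
]
},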
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Calculate DRS using 18 data\n",
"\n",
"query = '''\n",
"SELECT des,game_date,bb_type,home_team,events,description,launch_speed, launch_angle, on_3b, on_2b, on_1b,stand,\n",
"zone,outs_when_up,hc_x,hc_y,hit_location,\n",
"pos1_person_id, pos2_person_id, pos3_person_id, \n",
"pos4_person_id, pos5_person_id, pos6_person_id,\n",
"pos7_person_id, pos8_person_id, pos9_person_id, \n",
"if_fielding_alignment, of_fielding_alignment\n",
"\n",
"FROM baseball_2018\n",
"WHERE launch_speed >= 0 AND description != 'foul' AND pos1_person_id >= 0\n",
"AND events != 'walk' AND events != 'hit_by_pitch' AND hc_x >1 AND hc_y > 1 \n",
"ORDER BY game_date \n",
"'''\n",
"statcast_2018_data = pd.read_sql(query,engine)\n",
"\n",
"pos_id_range,pos_id_dp,pos_id_dp_pivot,pos_id_arm,catch_result,train_parameter_range ,dp_result,train_parameter_dp,arm_result,train_parameter_arm,extra_base_case_count,kill_id = train_data(statcast_2018_data) \n",
"\n",
"\n",
"rf_catch = joblib.load('rf_catch_17.pkl')\n",
"rf_dp = joblib.load('rf_dp_17.pkl')\n",
"rf_arm = joblib.load('rf_arm_17.pkl')\n",
"\n",
"\n",
"predict_proba_catch = rf_catch.predict_proba(train_parameter_range)\n",
"predict_proba_dp = rf_dp.predict_proba(train_parameter_dp)\n",
"predict_proba_arm = rf_arm.predict_proba(train_parameter_arm)"
]
},
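{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Optional sanity check (not in the original gist): predict_proba columns\n",
"#follow each model's classes_ attribute, so the indexing in the next cells\n",
"#(column 0 = no recorded hit_location, column k+1 = fielding position k+1)\n",
"#only holds if the classes come out in that order.\n",
"print(rf_catch.classes_)\n",
"print(rf_dp.classes_) #expected [0, 1]\n",
"print(rf_arm.classes_) #expected [0, 1]\n"
]
},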
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Range Run/plus-minus run\n",
"\n",
"label = ['P', 'C', '1B' ,'2B','3B','SS','LF', 'CF', 'RF'] \n",
"\n",
"playerid_df = pd.DataFrame.from_records(pos_id_range,columns = label) \n",
"\n",
"\n",
"unique_id_range = []\n",
"\n",
"for i, position in enumerate(label):\n",
"\n",
" unique_id_range.append(playerid_df[position].unique())\n",
"\n",
"\n",
"range_run = []\n",
"range_count = []\n",
"for i in range(9):\n",
"\n",
" range_run.append(np.zeros(len(unique_id_range[i])))\n",
" range_count.append(np.zeros(len(unique_id_range[i])))\n",
"range_run = np.array(range_run)\n",
"range_run_count = np.array(range_count)\n",
"\n",
"\n",
"for i, result in enumerate(catch_result):\n",
" \n",
" if(result > 0):\n",
" for j,player_id in enumerate(unique_id_range[int(result-1)]):\n",
" if(player_id == pos_id_range[i][int(result-1)]):\n",
" range_run[int(result-1)][j] += 0.83*predict_proba_catch[i][0]\n",
" range_count[int(result-1)][j] +=1\n",
" else:\n",
" for k in range(9):\n",
" for j,player_id in enumerate(unique_id_range[k]): \n",
" if(player_id == pos_id_range[i][k]):\n",
" range_run[k][j] -= 0.83*predict_proba_catch[i][k+1]\n",
" range_count[k][j] +=1\n",
"\n",
"\n",
"for i in range(9):\n",
" range_run_sum = np.sum(range_run[i])\n",
" range_run[i] -= 0.5*np.sum(range_run_sum)*range_count[i]/np.sum(range_count[i])\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Double play run\n",
"\n",
"unique_dp = np.unique(pos_id_dp)\n",
"unique_dp_pivot = np.unique(pos_id_dp_pivot)\n",
"dp_run = np.zeros(len(unique_dp))\n",
"dp_pivot = np.zeros(len(unique_dp_pivot))\n",
"dp_count = np.zeros(len(unique_dp))\n",
"dp_pivot_count = np.zeros(len(unique_dp_pivot))\n",
"\n",
"for i, result in enumerate(dp_result):\n",
"\n",
" for j,player_id in enumerate(unique_dp):\n",
"\n",
" if(player_id == pos_id_dp[i]):\n",
" if(pos_id_dp_pivot[i] == 0):\n",
" dp_count[j] += 1\n",
" if(result == 1):\n",
"\n",
" dp_run[j] += 0.58*predict_proba_dp[i][0]\n",
" else:\n",
" dp_run[j] -= 0.58*predict_proba_dp[i][1]\n",
" \n",
" else: \n",
" for k,player_id_pivot in enumerate(unique_dp_pivot):\n",
" if(player_id_pivot== pos_id_dp_pivot[i]):\n",
" dp_count[j] += 1\n",
" dp_pivot_count[k] += 1\n",
" if(result == 1):\n",
"\n",
" dp_run[j] += 0.5*0.58*predict_proba_dp[i][0]\n",
" dp_pivot[k] += 0.5*0.58*predict_proba_dp[i][0]\n",
" else:\n",
" dp_run[j] -= 0.5*0.58*predict_proba_dp[i][1]\n",
" dp_pivot[k] -= 0.5*0.58*predict_proba_dp[i][1]\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Arm run\n",
"\n",
"unique_arm = np.unique(pos_id_arm)\n",
"\n",
"arm_run = np.zeros(len(unique_arm))\n",
"\n",
"arm_count = np.zeros(len(unique_arm))\n",
"\n",
"\n",
"for j,player_id in enumerate(unique_arm): \n",
" for i, result in enumerate(arm_result):\n",
" \n",
" if(player_id == pos_id_arm[i]):\n",
" arm_count[j] += 1\n",
" if(result == 1):\n",
" arm_run[j] -= 0.25* predict_proba_arm[i][0]\n",
" \n",
" else:\n",
" arm_run[j] += 0.25*predict_proba_arm[i][1]\n",
" for kill_player_id in kill_id:\n",
" if(player_id == kill_player_id):\n",
" arm_run[j] += 0.75 \n",
"\n",
"arm_run_sum = np.sum(arm_run)\n",
"arm_run -= 0.5*np.sum(arm_run_sum)*arm_count/np.sum(arm_count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Catch Framing run\n",
"\n",
"query = '''\n",
"SELECT release_speed, release_pos_x, release_pos_z,\n",
"description, stand, p_throws, pfx_x, pfx_z, plate_x, plate_z,sz_top,sz_bot,vx0,vz0,\n",
"effective_speed,release_spin_rate,release_extension,\n",
"pitcher,pos2_person_id,\n",
"ax,ay,az,pitch_name,home_team,balls,strikes,inning_topbot\n",
"\n",
"\n",
"\n",
"\n",
"FROM baseball_2018\n",
"WHERE description = 'ball' OR description = 'called_strike'\n",
"AND release_speed >= 0 AND plate_x <100 AND plate_z <100 \n",
"'''\n",
"statcast_2018_pitch_data = pd.read_sql(query,engine)\n",
"\n",
"statcast_pitch_data = statcast_2018_pitch_data.dropna(axis=0, how='any')\n",
"\n",
"train_pitch_parameter = pd.get_dummies(statcast_pitch_data[['plate_x','plate_z', 'stand' , 'p_throws', 'release_extension','release_speed','release_spin_rate' ,'release_pos_x','release_pos_z','vx0','vz0']]) \n",
"\n",
"\n",
"rf_pitch = joblib.load('rf_pitch_17.pkl')\n",
"\n",
"pitch_proba = rf_pitch.predict_proba(train_pitch_parameter)\n",
"\n",
"pitch_result = []\n",
"\n",
"for description in statcast_pitch_data['description']:\n",
" if(description == 'ball'):\n",
" pitch_result.append(0)\n",
" else:\n",
" pitch_result.append(1)\n",
" \n",
"unique_catcher_frame = pd.unique(statcast_pitch_data['pos2_person_id'])\n",
"catcher_id_all = np.array(statcast_pitch_data['pos2_person_id'])\n",
"\n",
"frame_run = np.zeros(len(unique_catcher_frame))\n",
"frame_count = np.zeros(len(unique_catcher_frame))\n",
"\n",
"for i,proba in enumerate(pitch_proba):\n",
" for j,player_id in enumerate(unique_catcher_frame):\n",
" \n",
" if(player_id == catcher_id_all[i] ):\n",
" frame_count[j] += 1\n",
" if(pitch_result[i] == 1):\n",
" frame_run[j] += 0.14*proba[0]\n",
" else:\n",
" frame_run[j] -= 0.14*proba[1]\n",
"\n",
"frame_run_sum = np.sum(frame_run)\n",
"frame_run -= 0.5*frame_run_sum / np.sum(frame_count)* frame_count "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Catch stolen base run\n",
"\n",
"f = urllib.request.urlopen(\"https://baseballsavant.mlb.com/poptime?year=2017&team=&min2b=1&min3b=0\")\n",
"soup = BeautifulSoup(f,\"lxml\")\n",
"\n",
"catcher_poptime_df = pd.DataFrame.from_dict(json.loads(soup.findAll(\"script\")[9].text.split('\\n')[1][15:-1]))\n",
"\n",
"catch_pop_id = catcher_poptime_df['player_id']\n",
"catch_pop_run = np.zeros(len(catch_pop_id))\n",
"for i in range(len(catch_pop_id)):\n",
"\n",
" catch_pop_run[i] = 1.2*(2.01-float(catcher_poptime_df['pop_2b_sba'][i]))*float(catcher_poptime_df['pop_2b_sba_count'][i])"
]
},
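{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Fragility check (sketch): the pop time table above is pulled out of an\n",
"#inline <script> tag by position, which can break if Baseball Savant changes\n",
"#its page layout. A quick look at the parsed columns before relying on them:\n",
"print(catcher_poptime_df.columns.tolist())\n",
"print(catcher_poptime_df[['player_id', 'pop_2b_sba', 'pop_2b_sba_count']].head())\n"
]
},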
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Get player name from database\n",
"url=\"https://raw.githubusercontent.com/chadwickbureau/register/master/data/people.csv\"\n",
"player_table=pd.read_csv(url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Inning data and positional adjustment\n",
"\n",
"query = '''\n",
"SELECT \n",
"pos1_person_id, pos2_person_id, pos3_person_id, \n",
"pos4_person_id, pos5_person_id, pos6_person_id,\n",
"pos7_person_id, pos8_person_id, pos9_person_id,outs_when_up,inning,game_pk\n",
"\n",
"\n",
"\n",
"FROM baseball_2018\n",
"WHERE pos1_person_id IS NOT NULL\n",
"GROUP BY pos1_person_id, pos2_person_id, pos3_person_id, \n",
"pos4_person_id, pos5_person_id, pos6_person_id,\n",
"pos7_person_id, pos8_person_id, pos9_person_id,outs_when_up,inning,game_pk\n",
"'''\n",
"statcast_2018_data_all_inning = pd.read_sql(query,engine)\n",
"\n",
"statcast_data_all_inning = statcast_2018_data_all_inning\n",
"\n",
"statcast_data_all_inning_id = statcast_data_all_inning.iloc[:,0:9]\n",
"pos_id_all_inning = np.unique(statcast_data_all_inning_id.dropna(axis=0, how='any').values)\n",
"\n",
"positional_inning = np.zeros((len(pos_id_all_inning),9))\n",
"\n",
"for i,column in enumerate(statcast_data_all_inning.iloc[:,0:9]):\n",
"\n",
"\n",
" for k, player_id_match in enumerate(pos_id_all_inning):\n",
" positional_inning[k][i] += np.in1d(statcast_data_all_inning[column],player_id_match).sum()*1/3\n",
" \n",
"positional_inning_sum = np.sum(positional_inning,axis=1) \n",
"player_position = np.argmax(positional_inning, axis=1)\n",
"\n",
"positional_run_temp = np.zeros((len(pos_id_all_inning),9))\n",
"\n",
"position_run_adjustment = [0,12.5,-12.5,2.5,2.5,7.5,-7.5,2.5,-7.5]\n",
"\n",
"for i in range(9):\n",
"\n",
" positional_run_temp[:,i] = positional_inning[:,i]/9/162*position_run_adjustment[i]\n",
" \n",
"positional_run = np.sum(positional_run_temp,axis=1).tolist() "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Adding all the above together\n",
"\n",
"unique_id_range_stack = np.hstack(unique_id_range)\n",
"range_run_stack = np.hstack(range_run)\n",
"\n",
"run_array = [range_run_stack,dp_run,dp_pivot,arm_run,catch_pop_run,frame_run,positional_run]\n",
"id_array = [unique_id_range_stack,unique_dp,unique_dp_pivot,unique_arm,np.array(catch_pop_id),unique_catcher_frame,pos_id_all_inning]\n",
"unique_id_all = np.unique(np.hstack(pos_id_all_inning))\n",
"\n",
"player_run = np.zeros((len(unique_id_all),7))\n",
"\n",
"for i,player_id in enumerate(unique_id_all):\n",
" for j in range(7):\n",
" for k,player_id_match in enumerate(id_array[j]):\n",
" if(player_id == player_id_match):\n",
" player_run[i][j] += run_array[j][k]\n",
"\n",
"player_total_run_df = pd.DataFrame(player_run,columns=['rPM','rDPc','rDPp','rARM','rSB','rFrame','rPos'])\n",
"player_total_run_df['rGDP'] = player_total_run_df[['rDPc','rDPp']].sum(axis=1)\n",
"player_total_run_df['player_id'] = unique_id_all\n",
"\n",
"player_name = []\n",
"\n",
"\n",
"for i in unique_id_all:\n",
"\n",
" player_name.append((player_table[player_table.key_mlbam == i]['name_first'].to_string(index=False) + ' ' + player_table[player_table.key_mlbam == i]['name_last'].to_string(index=False)))\n",
" \n",
"player_total_run_df['Player Name'] = player_name\n",
"\n",
"player_position_dict= {0:'P',1:'C',2:'1B',3:'2B',4:'3B',5:'SS',6:'LF',7:'CF',8:'RF'}\n",
"player_position_output = np.vectorize(player_position_dict.__getitem__)(player_position)\n",
"player_total_run_df['Position']=player_position_output\n",
"\n",
"player_total_run_df['Inning'] = positional_inning_sum \n",
"player_total_run_df['sDRS'] = player_total_run_df[['rPM','rGDP','rARM','rSB','rFrame']].sum(axis=1)\n",
"player_total_run_df['Def'] = player_total_run_df[['rPM','rGDP','rARM','rSB','rFrame','rPos']].sum(axis=1)\n",
"\n",
"player_total_run_df = player_total_run_df.loc[player_total_run_df['player_id'] != 0]\n",
"\n",
"player_total_run_df_out = player_total_run_df[['Player Name','Position','Inning','rPM','rGDP','rARM','rSB','rFrame','sDRS','rPos','Def']]\n",
"player_total_run_df_out = player_total_run_df_out.sort_values(by=['sDRS'],ascending=False)\n",
"player_total_run_df_out = player_total_run_df_out.round(1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Output\n",
"\n",
"player_total_run_df_out.to_csv('StatcastDRS_18.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}