903124/EPA_CFB.ipynb

## cfb_teams_list.csv

          
            full_name
            abbreviation

            
              Abilene Christian
              AblCh

            
              Air Force
              AFA

            
              Akron
              Akron

            
              Alabama
              Alab

            
              Alabama A&M
              AlaAM

            
              Albany
              Alban

            
              Alcorn State
              AlcSt

            
              Appalachian State
              AppSt

            
              Arizona
              Ariz

            
              Arizona State
              ArzSt

            
              Arkansas
              Ark

            
              Arkansas State
              ArkSt

            
              Arkansas-Pine Bluff
              ArkPB

            
              Army
              Army

            
              Auburn
              Aub

            
              Austin Peay
              APeay

            
              BYU
              BYU

            
              Ball State
              BalSt

            
              Baylor
              Bayl

            
              Bethune-Cookman
              BthCk

            
              Boise State
              BoiSt

            
              Boston College
              BC

            
              Bowling Green
              BwGrn

            
              Buffalo
              Buff

            
              California
              Cal

            
              Campbell
              Camp

            
              Central Arkansas
              CArk

            
              Central Connecticut
              CConn

            
              Central Michigan
              CMich

            
              Charleston Southern
              ChaSo

            
              Charlotte
              Charl

            
              Cincinnati
              Cincy

            
              Clemson
              Clem

            
              Coastal Carolina
              CCaro

            
              Colorado
              Colo

            
              Colorado State
              ColSt

            
              Connecticut
              UConn

            
              Delaware State
              DelSt

            
              Drake
              Drake

            
              Duke
              Duke

            
              Duquesne
              Duqsn

            
              East Carolina
              ECaro

            
              Eastern Illinois
              EIlln

            
              Eastern Kentucky
              EKent

            
              Eastern Michigan
              EMich

            
              Eastern Washington
              EWash

            
              Elon
              ElonU

            
              Florida
              Fla

            
              Florida Atlantic
              FlAtl

            
              Florida International
              FlaIn

            
              Florida State
              FlaSt

            
              Fordham
              Fordh

            
              Fresno State
              FrsSt

            
              Gardner-Webb
              GrdWb

            
              Georgia
              Geo

            
              Georgia Southern
              GeoSo

            
              Georgia State
              GeoSt

            
              Georgia Tech
              GTech

            
              Grambling
              Gramb

            
              Hawai'i
              Hawa

            
              Houston
              Houst

            
              Houston Baptist
              HstnB

            
              Howard
              Howrd

            
              Idaho
              Idaho

            
              Idaho State
              IdaSt

            
              Illinois
              Illin

            
              Illinois State
              IllSt

            
              Indiana
              Ind

            
              Iowa
              Iowa

            
              Iowa State
              IowSt

            
              Jackson State
              JckSt

            
              James Madison
              JMads

            
              Kansas
              Kans

            
              Kansas State
              KanSt

            
              Kennesaw State
              Ksaw

            
              Kent State
              KntSt

            
              Kentucky
              Kent

            
              LSU
              LSU

            
              Lafayette
              Lafay

            
              Lehigh
              Lehgh

            
              Liberty
              Liber

            
              Louisiana
              LaLaf

            
              Louisiana Monroe
              LaMon

            
              Louisiana Tech
              LaTch

            
              Louisville
              Lvile

            
              Maine
              Maine

            
              Marshall
              Marsh

            
              Maryland
              Mary

            
              McNeese
              McNSt

            
              Memphis
              Memph

            
              Mercer
              Mercr

            
              Miami
              MiaFl

            
              Miami (OH)
              MiaOh

            
              Michigan
              Mich

            
              Michigan State
              MchSt

            
              Middle Tennessee
              MTnSt

            
              Minnesota
              Minn

            
              Mississippi State
              MisSt

            
              Missouri
              Misso

            
              Missouri State
              MoSt

            
              Monmouth
              MonNJ

            
              Murray State
              MurrS

            
              NC State
              NCSt

            
              Navy
              Navy

            
              Nebraska
              Neb

            
              Nevada
              Nevad

            
              New Hampshire
              NHamp

            
              New Mexico
              NMex

            
              New Mexico State
              NMxSt

            
              Nicholls
              NicSt

            
              Norfolk State
              NflkS

            
              North Carolina
              NCaro

            
              North Carolina A&T
              NCAT

            
              North Carolina Central
              NCCtl

            
              North Texas
              NoTex

            
              Northern Arizona
              NoArz

            
              Northern Illinois
              NoIll

            
              Northern Iowa
              NIowa

            
              Northwestern
              Nwest

            
              Northwestern State
              NWSt

            
              Notre Dame
              NDame

            
              Ohio
              Ohio

            
              Ohio State
              OhSt

            
              Oklahoma
              Okla

            
              Oklahoma State
              OKSt

            
              Old Dominion
              ODU

            
              Ole Miss
              Miss

            
              Oregon
              Oregn

            
              Oregon State
              OrgSt

            
              Penn State
              PnSt

            
              Pittsburgh
              Pitt

            
              Portland State
              PrtSt

            
              Prairie View
              PraVw

            
              Purdue
              Prdue

            
              Rhode Island
              RIsld

            
              Rice
              Rice

            
              Richmond
              Richm

            
              Rutgers
              Rutgr

            
              SMU
              SMU

            
              San Diego State
              SDSt

            
              San José State
              SJSt

            
              Savannah State
              SavSt

            
              South Alabama
              SAlab

            
              South Carolina
              SCaro

            
              South Carolina State
              SCSt

            
              South Dakota
              SDako

            
              South Florida
              SFla

            
              Southeast Missouri State
              SEMo

            
              Southeastern Louisiana
              SELa

            
              Southern
              Sthrn

            
              Southern Mississippi
              SoMis

            
              Southern Utah
              SoUth

            
              Stanford
              Stanf

            
              Stephen F. Austin
              SFAus

            
              Stony Brook
              StBrk

            
              Syracuse
              Syrac

            
              TCU
              TCU

            
              Temple
              Temp

            
              Tennessee
              Tenn

            
              Tennessee State
              TenSt

            
              Tennessee Tech
              TnTch

            
              Texas
              Texas

            
              Texas A&M
              TexAM

            
              Texas Southern
              TexSo

            
              Texas State
              TexSt

            
              Texas Tech
              TexTc

            
              Toledo
              Toled

            
              Troy
              Troy

            
              Tulane
              Tulan

            
              Tulsa
              Tulsa

            
              UAB
              UAB

            
              UC Davis
              UCDav

            
              UCF
              UCF

            
              UCLA
              UCLA

            
              UMass
              UMass

            
              UNLV
              UNLV

            
              USC
              USC

            
              UT Martin
              TnMar

            
              UT San Antonio
              TexSA

            
              UTEP
              UTEP

            
              Utah
              Utah

            
              Utah State
              UthSt

            
              VMI
              VMI

            
              Vanderbilt
              Vandy

            
              Villanova
              Villa

            
              Virginia
              Virg

            
              Virginia Tech
              VTech

            
              Wake Forest
              WFrst

            
              Washington
              Wash

            
              Washington State
              WshSt

            
              Weber State
              WebSt

            
              West Virginia
              WVirg

            
              Western Kentucky
              WKent

            
              Western Michigan
              WMich

            
              Wisconsin
              Wisc

            
              Wofford
              Woffd

            
              Wyoming
              Wyom

## EPA_CFB.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import re\n",
    "import requests\n",
    "import time\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "import joblib\n",
    "pd.options.display.max_columns = 999"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook walks through the calculation of expected points added (EPA) for the 2018 college football season. \n",
    "The data is collected from https://collegefootballdata.com/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "drive_data = pd.DataFrame(requests.get('https://api.collegefootballdata.com/drives?seasonType=regular&year=2018').json())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "game_data = pd.DataFrame(requests.get('https://api.collegefootballdata.com/games?year=2018&seasonType=regular').json())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "game_data['game_id'] = game_data['id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.merge(drive_data,game_data,on='game_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['drive_id'] = data['id_x']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the play-by-play data one week at a time (weeks 1-15) and combine it once at the end.\n",
    "# DataFrame.append was removed in pandas 2.0 and re-copies the whole frame on every call;\n",
    "# collecting the weekly frames and calling pd.concat once yields the same rows and index.\n",
    "weekly_plays = []\n",
    "for week in range(1, 16):\n",
    "    request_df = requests.get('https://api.collegefootballdata.com/plays?seasonType=regular&year=2018&week=' + str(week)).json()\n",
    "    time.sleep(2)  # pause between consecutive requests\n",
    "    weekly_plays.append(pd.DataFrame(request_df))\n",
    "play_data = pd.concat(weekly_plays)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "pbp_data = pd.merge(play_data,data[['home_team','drive_id']],how='left',on='drive_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "pbp_data['coef'] = (pbp_data['home_team'] == pbp_data['defense']).astype(int)\n",
    "pbp_data['adjusted_yardline'] = 100*(1-pbp_data['coef']) +  (2*pbp_data['coef']-1)*pbp_data['yard_line'] #yard_line is defined by home team in API"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We first calculate the expected points of each play using a multi-class classifier (gradient boosting, which takes the place of a logistic regression here). \n",
    "\n",
    "The target variable is the points scored on the drive for scoring drives (e.g. Touchdown, Field Goal, Safety, Defensive TD), and the points scored on the opponent's next drive, counted against the offense, for non-scoring drives (e.g. Punt, Missed FG)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Points credited to the offense for each drive result; any result not listed scores 0.\n",
    "# NOTE(review): 'PUNT TD' was listed under both +7 and -7 in the original if/else chain. The +7 test\n",
    "# ran first, so the -7 entry was unreachable; +7 is kept here -- confirm whether -7 was intended.\n",
    "offense_td = ['TD', 'PUNT TD', 'RUSHING TD', 'PASSING TD']\n",
    "field_goal = ['FG', 'FG GOOD']\n",
    "defense_td = ['PUNT RETURN TD', 'MISSED FG TD', 'INT TD', 'FUMBLE RETURN TD', 'DOWNS TD', 'INT RETURN TOUCH', 'FG MISSED TD', 'TURNOVER ON DOWNS TD']\n",
    "drive_result_points = {**dict.fromkeys(offense_td, 7), **dict.fromkeys(field_goal, 3), 'SF': -2, **dict.fromkeys(defense_td, -7)}\n",
    "data['drive_point'] = data.drive_result.apply(lambda x: drive_result_points.get(x, 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Points of the following drive, floored at -2 and then negated so they count against this drive's offense.\n",
    "# Series.clip_lower was removed in pandas 1.0; clip(lower=...) is the equivalent call.\n",
    "data['next_drive_point'] = -data['drive_point'].shift(-1).clip(lower=-2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[data.drive_point == 0, 'drive_point'] = data['next_drive_point']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "pbp_data = pbp_data.merge(data[['drive_id','drive_point','drive_result']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "exclude_playtype = ['Kickoff',  'End Period',\n",
    "        'Kickoff Return (Offense)',\n",
    "       'Kickoff Return Touchdown', 'End of Half', 'Defensive 2pt Conversion','Uncategorized', 'End of Game']\n",
    "\n",
    "game_end_drive = ['END OF HALF', 'END OF GAME', 'Uncategorized','END OF 4TH QUARTER', 'DOWNS TD','POSSESSION (FOR OT DRIVES)']\n",
    "\n",
    "regression_df = pbp_data[~(pbp_data.play_type.isin(exclude_playtype)) & (pbp_data.adjusted_yardline > 0)& (pbp_data.adjusted_yardline < 100) & ~(pbp_data.drive_result.isin(game_end_drive))].dropna()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A gradient boosting classifier from scikit-learn is used here to calculate the expected points."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
       "                           learning_rate=0.1, loss='deviance', max_depth=3,\n",
       "                           max_features=None, max_leaf_nodes=None,\n",
       "                           min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                           min_samples_leaf=1, min_samples_split=2,\n",
       "                           min_weight_fraction_leaf=0.0, n_estimators=200,\n",
       "                           n_iter_no_change=None, presort='auto',\n",
       "                           random_state=None, subsample=1.0, tol=0.0001,\n",
       "                           validation_fraction=0.1, verbose=0,\n",
       "                           warm_start=False)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf = GradientBoostingClassifier(n_estimators = 200)\n",
    "clf.fit(regression_df[['down','distance','adjusted_yardline']], regression_df.drive_point)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The EPA calculation below covers plays from scrimmage only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "special_team_play_type = ['Kickoff','Punt','Kickoff Return (Offense)', 'Kickoff Return Touchdown','Field Goal Good', 'Field Goal Missed', 'Blocked Field Goal', 'Blocked Punt','Punt Return Touchdown','Blocked Punt Touchdown','Missed Field Goal Return','Uncategorized', 'Missed Field Goal Return Touchdown','Defensive 2pt Conversion']\n",
    "timing_play_type = ['End Period','End of Game','Timeout','End of Half']\n",
    "turnover_play_type = ['Fumble Recovery (Opponent)','Pass Interception Return','Interception Return Touchdown','Fumble Return Touchdown','Safety','Interception','Pass Interception']\n",
    "regular_play_type = [ 'Rush', 'Sack', 'Pass Reception', 'Passing Touchdown','Pass Incompletion', 'Fumble Recovery (Own)','Rushing Touchdown','Pass Interception','Pass Completion']\n",
    "off_TD = ['Passing Touchdown','Rushing Touchdown']\n",
    "def_TD = ['Interception Return Touchdown','Fumble Return Touchdown']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play = pbp_data[pbp_data.play_type.isin(regular_play_type) | pbp_data.play_type.isin(turnover_play_type) ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Team abbreviations in play_text are obtained by regex matching on the data. Here we simply read the cleaned-up CSV and match the abbreviations to the offense and defense."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "CFB_teams_list = pd.read_csv('cfb_teams_list.csv',encoding='utf-8') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['offense'],right_on=['full_name'])\n",
    "regular_play.rename(columns={'abbreviation':'off_abbr', 'full_name': 'off_full_name'}, inplace=True)\n",
    "regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['defense'],right_on=['full_name'])\n",
    "regular_play.rename(columns={'abbreviation':'def_abbr', 'full_name': 'def_full_name'}, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Expected point at the start of the play:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "EP_predict = clf.predict_proba(regular_play[['down','distance','adjusted_yardline']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expected points = sum over outcome classes of P(class) * class point value.\n",
    "# predict_proba columns are ordered like clf.classes_, so weighting by clf.classes_ stays correct\n",
    "# even if an outcome class is absent from the training data (hard-coded column indices would not).\n",
    "EP = EP_predict.dot(clf.classes_)\n",
    "regular_play['EP_start'] = EP"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cleaning the data for expected point at the end of the play"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play['new_yardline']= 0\n",
    "regular_play['new_down']= 0\n",
    "regular_play['new_distance']= 0\n",
    "regular_play['turnover'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Drop missing data and erroneous play type\n",
    "regular_play = regular_play[~pd.isna(regular_play.play_text) & (regular_play.play_type != 'Interception')] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[regular_play.play_type.isin(turnover_play_type),'new_down'] = 1\n",
    "regular_play.loc[regular_play.play_type.isin(turnover_play_type),'new_distance'] = 10\n",
    "\n",
    "regular_play.loc[regular_play.play_text.str.contains('1ST'), 'new_down'] = 1\n",
    "regular_play.loc[regular_play.play_text.str.contains('1ST'), 'new_distance'] = 10\n",
    "\n",
    "regular_play.loc[~regular_play.play_type.isin(turnover_play_type) & ~regular_play.play_text.str.contains('1ST'), 'new_down'] = regular_play.down + 1\n",
    "regular_play.loc[~regular_play.play_type.isin(turnover_play_type) & ~regular_play.play_text.str.contains('1ST'), 'new_distance'] = regular_play.distance - regular_play.yards_gained\n",
    "\n",
    "regular_play.loc[regular_play.play_text.str.contains('50 yard line'), 'new_yardline'] = 50\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_yardline'] = 100- (regular_play.yard_line + regular_play.yards_gained) \n",
    "regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_down'] = 1\n",
    "regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_distance'] = 10\n",
    "\n",
    "regular_play.loc[regular_play.play_type == 'Sack', 'new_yardline'] = 100- (regular_play.yard_line - regular_play.yards_gained)\n",
    "regular_play.loc[regular_play.play_type == 'Sack', 'new_down'] = regular_play.down + 1\n",
    "regular_play.loc[regular_play.play_type == 'Sack', 'new_distance'] = regular_play.distance - regular_play.yards_gained"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\kayiu\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  after removing the cwd from sys.path.\n",
      "C:\\Users\\kayiu\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "#Collect end of play yardline information (e.g. Alab 38 = Alabama own 38) from play_text and match the team abbreviation\n",
    "\n",
    "temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.off_abbr.values.astype(str)) >= 0] \n",
    "temp_df['split_string'] =  [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.off_abbr.values.astype(str)))]\n",
    "regular_play.loc[temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].index, 'new_yardline'] = 100-np.array(temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].split_string.str.extract(r'(\\d+)').astype(float)).ravel()\n",
    "\n",
    "temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.def_abbr.values.astype(str)) >= 0]\n",
    "temp_df['split_string'] =  [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.def_abbr.values.astype(str)))]\n",
    "regular_play.loc[temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].index, 'new_yardline'] = np.array(temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].split_string.str.extract(r'(\\d+)').astype(float)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[pd.isna(regular_play.new_yardline),'new_distance'] = regular_play.distance - regular_play.yards_gained \n",
    "regular_play.loc[pd.isna(regular_play.new_yardline),'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained\n",
    "\n",
    "regular_play.loc[regular_play.play_type == 'Pass Incompletion', 'new_yardline'] = regular_play.adjusted_yardline\n",
    "\n",
    "regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_yardline'] = 80\n",
    "regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_down'] = 1\n",
    "\n",
    "#Fake data for model prediction, EP will be changed after processing the data\n",
    "\n",
    "regular_play.loc[regular_play.play_type == 'Safety', 'new_yardline'] = 99 #Fake yardline for Safety\n",
    "\n",
    "regular_play.loc[regular_play.play_type.isin(off_TD),'new_down'] = 1 #Fake new down for Offensive tocuhdown play\n",
    "regular_play.loc[regular_play.play_type.isin(off_TD),'new_distance']  = 10 #Fake new yards to go for Offensive tocuhdown play\n",
    "\n",
    "regular_play.loc[(regular_play.play_type.isin(off_TD) | regular_play.play_type.isin(def_TD)),'new_yardline'] = 99  #Fake yardline for Offensive tocuhdown play\n",
    "\n",
    "# Turnover on downs: evaluate the mask once. Re-testing new_down > 4 after new_down has been reset to 1\n",
    "# selects no rows, so new_distance and new_yardline were never updated for these plays.\n",
    "turnover_on_downs = (regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD))\n",
    "regular_play.loc[turnover_on_downs,'turnover'] = 1 #Turnover on down\n",
    "regular_play.loc[turnover_on_downs,'new_down'] = 1 \n",
    "regular_play.loc[turnover_on_downs,'new_distance'] = 10\n",
    "regular_play.loc[turnover_on_downs,'new_yardline'] = 100-regular_play.new_yardline\n",
    "\n",
    "\n",
    "regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0))  & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_down' ] = 1 #Strip sack\n",
    "regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_distance' ] = 10 \n",
    "regular_play.loc[((regular_play.new_yardline <= 0) |(regular_play.new_distance <= 0)) & (regular_play.play_text.str.contains('return')), 'new_yardline' ] = 100-(regular_play.adjusted_yardline - regular_play.yards_gained)\n",
    "regular_play.loc[ regular_play.play_text.str.contains('return'), 'turnover' ] = 1 \n",
    "\n",
    "regular_play.loc[regular_play.new_distance <= 0, 'new_down'] = 1 #First down not in API\n",
    "regular_play.loc[regular_play.new_distance <= 0, 'new_distance'] = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[regular_play.new_yardline <= 0 ,'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[regular_play.play_text.str.contains('TOUCHDOWN'),'new_yardline'] = 99"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Throw away some plays with error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play = regular_play[(regular_play.new_yardline > 0) & (regular_play.new_yardline < 100) & (regular_play.adjusted_yardline > 0) & (regular_play.adjusted_yardline < 100)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate the expected points at the end of the play. Since the classifier is given the same feature column names at prediction time as at training time, we first extract and rename the feature columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "out_df = pd.DataFrame({'down':regular_play['new_down'],'distance':regular_play['new_distance'],'adjusted_yardline':regular_play['new_yardline']})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "EP_predict = clf.predict_proba(out_df[['down','distance','adjusted_yardline']])\n",
    "# Weight each class probability by its point value; predict_proba columns follow clf.classes_,\n",
    "# so this does not depend on hard-coded column positions.\n",
    "EP = EP_predict.dot(clf.classes_)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play['EP_end'] = EP"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, set the expected points at the end of touchdown and safety plays to 7 and -2 respectively, and reverse the sign for turnover plays."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[(regular_play.play_type.isin(off_TD) | regular_play.play_type.isin(def_TD) | regular_play.play_text.str.contains('TOUCHDOWN')),'EP_end'] = 7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flip the sign of EP_end for turnovers, since possession has changed hands.\n",
    "# The comparison is parenthesised because `|` binds tighter than `==`: the unparenthesised form\n",
    "# evaluated (isin(...) | turnover) == 1 rather than isin(...) | (turnover == 1).\n",
    "is_turnover = regular_play.play_type.isin(turnover_play_type) | (regular_play.turnover == 1)\n",
    "regular_play.loc[is_turnover,'EP_end'] *= -1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.loc[regular_play.play_type == 'Safety','EP_end'] = -2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play['EPA'] = regular_play['EP_end'] - regular_play['EP_start']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "pass_play_type = ['Sack','Pass Incompletion','Pass Interception Return','Pass Reception','Interception Return Touchdown','Passing Touchdown','Pass Completion','Pass Interception']\n",
    "rush_play_type = ['Fumble Recovery (Opponent)','Fumble Recovery (Own)','Fumble Return Touchdown','Rush','Rushing Touchdown']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check EPA by play type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.03542604861003451"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regular_play[regular_play.play_type.isin(pass_play_type)]['EPA'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.02507176193772802"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regular_play[regular_play.play_type.isin(rush_play_type)]['EPA'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "play_type\n",
       "Fumble Recovery (Opponent)      -4.600877\n",
       "Fumble Recovery (Own)           -0.935820\n",
       "Fumble Return Touchdown         -7.319282\n",
       "Interception Return Touchdown   -7.766629\n",
       "Pass Incompletion               -1.104547\n",
       "Pass Interception Return        -3.544499\n",
       "Pass Reception                   0.912590\n",
       "Passing Touchdown                3.536264\n",
       "Rush                            -0.071102\n",
       "Rushing Touchdown                2.217327\n",
       "Sack                            -1.874424\n",
       "Safety                          -0.429088\n",
       "Name: EPA, dtype: float64"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regular_play.groupby('play_type')['EPA'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "regular_play.to_csv('CFB_regular_play_18.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	full_name	abbreviation
	Abilene Christian	AblCh
	Air Force	AFA
	Akron	Akron
	Alabama	Alab
	Alabama A&M	AlaAM
	Albany	Alban
	Alcorn State	AlcSt
	Appalachian State	AppSt
	Arizona	Ariz
	Arizona State	ArzSt
	Arkansas	Ark
	Arkansas State	ArkSt
	Arkansas-Pine Bluff	ArkPB
	Army	Army
	Auburn	Aub
	Austin Peay	APeay
	BYU	BYU
	Ball State	BalSt
	Baylor	Bayl
	Bethune-Cookman	BthCk
	Boise State	BoiSt
	Boston College	BC
	Bowling Green	BwGrn
	Buffalo	Buff
	California	Cal
	Campbell	Camp
	Central Arkansas	CArk
	Central Connecticut	CConn
	Central Michigan	CMich
	Charleston Southern	ChaSo
	Charlotte	Charl
	Cincinnati	Cincy
	Clemson	Clem
	Coastal Carolina	CCaro
	Colorado	Colo
	Colorado State	ColSt
	Connecticut	UConn
	Delaware State	DelSt
	Drake	Drake
	Duke	Duke
	Duquesne	Duqsn
	East Carolina	ECaro
	Eastern Illinois	EIlln
	Eastern Kentucky	EKent
	Eastern Michigan	EMich
	Eastern Washington	EWash
	Elon	ElonU
	Florida	Fla
	Florida Atlantic	FlAtl
	Florida International	FlaIn
	Florida State	FlaSt
	Fordham	Fordh
	Fresno State	FrsSt
	Gardner-Webb	GrdWb
	Georgia	Geo
	Georgia Southern	GeoSo
	Georgia State	GeoSt
	Georgia Tech	GTech
	Grambling	Gramb
	Hawai'i	Hawa
	Houston	Houst
	Houston Baptist	HstnB
	Howard	Howrd
	Idaho	Idaho
	Idaho State	IdaSt
	Illinois	Illin
	Illinois State	IllSt
	Indiana	Ind
	Iowa	Iowa
	Iowa State	IowSt
	Jackson State	JckSt
	James Madison	JMads
	Kansas	Kans
	Kansas State	KanSt
	Kennesaw State	Ksaw
	Kent State	KntSt
	Kentucky	Kent
	LSU	LSU
	Lafayette	Lafay
	Lehigh	Lehgh
	Liberty	Liber
	Louisiana	LaLaf
	Louisiana Monroe	LaMon
	Louisiana Tech	LaTch
	Louisville	Lvile
	Maine	Maine
	Marshall	Marsh
	Maryland	Mary
	McNeese	McNSt
	Memphis	Memph
	Mercer	Mercr
	Miami	MiaFl
	Miami (OH)	MiaOh
	Michigan	Mich
	Michigan State	MchSt
	Middle Tennessee	MTnSt
	Minnesota	Minn
	Mississippi State	MisSt
	Missouri	Misso
	Missouri State	MoSt
	Monmouth	MonNJ
	Murray State	MurrS
	NC State	NCSt
	Navy	Navy
	Nebraska	Neb
	Nevada	Nevad
	New Hampshire	NHamp
	New Mexico	NMex
	New Mexico State	NMxSt
	Nicholls	NicSt
	Norfolk State	NflkS
	North Carolina	NCaro
	North Carolina A&T	NCAT
	North Carolina Central	NCCtl
	North Texas	NoTex
	Northern Arizona	NoArz
	Northern Illinois	NoIll
	Northern Iowa	NIowa
	Northwestern	Nwest
	Northwestern State	NWSt
	Notre Dame	NDame
	Ohio	Ohio
	Ohio State	OhSt
	Oklahoma	Okla
	Oklahoma State	OKSt
	Old Dominion	ODU
	Ole Miss	Miss
	Oregon	Oregn
	Oregon State	OrgSt
	Penn State	PnSt
	Pittsburgh	Pitt
	Portland State	PrtSt
	Prairie View	PraVw
	Purdue	Prdue
	Rhode Island	RIsld
	Rice	Rice
	Richmond	Richm
	Rutgers	Rutgr
	SMU	SMU
	San Diego State	SDSt
	San José State	SJSt
	Savannah State	SavSt
	South Alabama	SAlab
	South Carolina	SCaro
	South Carolina State	SCSt
	South Dakota	SDako
	South Florida	SFla
	Southeast Missouri State	SEMo
	Southeastern Louisiana	SELa
	Southern	Sthrn
	Southern Mississippi	SoMis
	Southern Utah	SoUth
	Stanford	Stanf
	Stephen F. Austin	SFAus
	Stony Brook	StBrk
	Syracuse	Syrac
	TCU	TCU
	Temple	Temp
	Tennessee	Tenn
	Tennessee State	TenSt
	Tennessee Tech	TnTch
	Texas	Texas
	Texas A&M	TexAM
	Texas Southern	TexSo
	Texas State	TexSt
	Texas Tech	TexTc
	Toledo	Toled
	Troy	Troy
	Tulane	Tulan
	Tulsa	Tulsa
	UAB	UAB
	UC Davis	UCDav
	UCF	UCF
	UCLA	UCLA
	UMass	UMass
	UNLV	UNLV
	USC	USC
	UT Martin	TnMar
	UT San Antonio	TexSA
	UTEP	UTEP
	Utah	Utah
	Utah State	UthSt
	VMI	VMI
	Vanderbilt	Vandy
	Villanova	Villa
	Virginia	Virg
	Virginia Tech	VTech
	Wake Forest	WFrst
	Washington	Wash
	Washington State	WshSt
	Weber State	WebSt
	West Virginia	WVirg
	Western Kentucky	WKent
	Western Michigan	WMich
	Wisconsin	Wisc
	Wofford	Woffd
	Wyoming	Wyom
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"import matplotlib.pyplot as plt\n",
	"import re\n",
	"import requests\n",
	"import time\n",
	"from sklearn.ensemble import GradientBoostingClassifier\n",
	"import joblib\n",
	"pd.options.display.max_columns = 999"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Here is the process of calculation expected points added (EPA) of College football in 2018 season. \n",
	"Data is collected from https://collegefootballdata.com/"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"drive_data = pd.DataFrame(requests.get('https://api.collegefootballdata.com/drives?seasonType=regular&year=2018').json())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"game_data = pd.DataFrame(requests.get('https://api.collegefootballdata.com/games?year=2018&seasonType=regular').json())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"game_data['game_id'] = game_data['id']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"data = pd.merge(drive_data,game_data,on='game_id')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"data['drive_id'] = data['id_x']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"play_data = pd.DataFrame()\n",
	"for i in range(15):\n",
	" request_df = requests.get('https://api.collegefootballdata.com/plays?seasonType=regular&year=2018&week=' + str(i+1)).json()\n",
	" time.sleep(2)\n",
	" play_data = play_data.append(pd.DataFrame(request_df))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"pbp_data = pd.merge(play_data,data[['home_team','drive_id']],how='left',on='drive_id')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"pbp_data['coef'] = (pbp_data['home_team'] == pbp_data['defense']).astype(int)\n",
	"pbp_data['adjusted_yardline'] = 100(1-pbp_data['coef']) + (2pbp_data['coef']-1)*pbp_data['yard_line'] #yard_line is defined by home team in API"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We would first calculate expected point of play using logistic regression. \n",
	"\n",
	"The target variable here is point scored of scoring-drive (e.g. Touchdown, Field Goal, Safety, Defensive TD) and the point scored by opponent's next drive for non-scoring drive(e.g. Punt, Missed FG) ."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [],
	"source": [
	"data['drive_point'] = data.drive_result.apply(lambda x: 7 if (x == 'TD' or x == 'PUNT TD' or x == 'RUSHING TD' or x == 'PASSING TD') else (3 if (x == 'FG' or x == 'FG GOOD') else (-2 if x == 'SF' else -7 if ( x == 'PUNT RETURN TD' or x == 'MISSED FG TD' or x == 'INT TD' or x == 'FUMBLE RETURN TD' or x == 'DOWNS TD' or x == 'INT RETURN TOUCH' or x == 'FG MISSED TD' or x =='PUNT TD' or x == 'TURNOVER ON DOWNS TD') else 0 )))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"data['next_drive_point'] = -data['drive_point'].shift(-1).clip_lower(-2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"data.loc[data.drive_point == 0, 'drive_point'] = data['next_drive_point']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"pbp_data = pbp_data.merge(data[['drive_id','drive_point','drive_result']])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [],
	"source": [
	"exclude_playtype = ['Kickoff', 'End Period',\n",
	" 'Kickoff Return (Offense)',\n",
	" 'Kickoff Return Touchdown', 'End of Half', 'Defensive 2pt Conversion','Uncategorized', 'End of Game']\n",
	"\n",
	"game_end_drive = ['END OF HALF', 'END OF GAME', 'Uncategorized','END OF 4TH QUARTER', 'DOWNS TD','POSSESSION (FOR OT DRIVES)']\n",
	"\n",
	"regression_df = pbp_data[~(pbp_data.play_type.isin(exclude_playtype)) & (pbp_data.adjusted_yardline > 0)& (pbp_data.adjusted_yardline < 100) & ~(pbp_data.drive_result.isin(game_end_drive))].dropna()\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Gradient boosting classifier from sklearn is used here for expected point calculation"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
	" learning_rate=0.1, loss='deviance', max_depth=3,\n",
	" max_features=None, max_leaf_nodes=None,\n",
	" min_impurity_decrease=0.0, min_impurity_split=None,\n",
	" min_samples_leaf=1, min_samples_split=2,\n",
	" min_weight_fraction_leaf=0.0, n_estimators=200,\n",
	" n_iter_no_change=None, presort='auto',\n",
	" random_state=None, subsample=1.0, tol=0.0001,\n",
	" validation_fraction=0.1, verbose=0,\n",
	" warm_start=False)"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"clf = GradientBoostingClassifier(n_estimators = 200)\n",
	"clf.fit(regression_df[['down','distance','adjusted_yardline']], regression_df.drive_point)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Calculation of EPA below is for play from scrimmage only."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [],
	"source": [
	"special_team_play_type = ['Kickoff','Punt','Kickoff Return (Offense)', 'Kickoff Return Touchdown','Field Goal Good', 'Field Goal Missed', 'Blocked Field Goal', 'Blocked Punt','Punt Return Touchdown','Blocked Punt Touchdown','Missed Field Goal Return','Uncategorized', 'Missed Field Goal Return Touchdown','Defensive 2pt Conversion']\n",
	"timing_play_type = ['End Period','End of Game','Timeout','End of Half']\n",
	"turnover_play_type = ['Fumble Recovery (Opponent)','Pass Interception Return','Interception Return Touchdown','Fumble Return Touchdown','Safety','Interception','Pass Interception']\n",
	"regular_play_type = [ 'Rush', 'Sack', 'Pass Reception', 'Passing Touchdown','Pass Incompletion', 'Fumble Recovery (Own)','Rushing Touchdown','Pass Interception','Pass Completion']\n",
	"off_TD = ['Passing Touchdown','Rushing Touchdown']\n",
	"def_TD = ['Interception Return Touchdown','Fumble Return Touchdown']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play = pbp_data[pbp_data.play_type.isin(regular_play_type) \| pbp_data.play_type.isin(turnover_play_type) ]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Team abbreviation in play_text is obtained using regex match on the data. Here we just read csv after cleaning up, and match the abbrevation to offense and defense"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [],
	"source": [
	"CFB_teams_list = pd.read_csv('cfb_teams_list.csv',encoding='utf-8') "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['offense'],right_on=['full_name'])\n",
	"regular_play.rename(columns={'abbreviation':'off_abbr', 'full_name': 'off_full_name'}, inplace=True)\n",
	"regular_play = pd.merge(regular_play,CFB_teams_list,left_on=['defense'],right_on=['full_name'])\n",
	"regular_play.rename(columns={'abbreviation':'def_abbr', 'full_name': 'def_full_name'}, inplace=True)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Expected point at the start of the play:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [],
	"source": [
	"EP_predict = clf.predict_proba(regular_play[['down','distance','adjusted_yardline']])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [],
	"source": [
	"EP = EP_predict[:,0]* -7 + EP_predict[:,1] * -3 + EP_predict[:,2] * -2 + EP_predict[:,4] * 2 + EP_predict[:,5] * 3 + EP_predict[:,6] * 7\n",
	"regular_play['EP_start'] = EP"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Cleaning the data for expected point at the end of the play"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play['new_yardline']= 0\n",
	"regular_play['new_down']= 0\n",
	"regular_play['new_distance']= 0\n",
	"regular_play['turnover'] = 0"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [],
	"source": [
	"#Drop missing data and erroneous play type\n",
	"regular_play = regular_play[~pd.isna(regular_play.play_text) & (regular_play.play_type != 'Interception')] "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play.loc[regular_play.play_type.isin(turnover_play_type),'new_down'] = 1\n",
	"regular_play.loc[regular_play.play_type.isin(turnover_play_type),'new_distance'] = 10\n",
	"\n",
	"regular_play.loc[regular_play.play_text.str.contains('1ST'), 'new_down'] = 1\n",
	"regular_play.loc[regular_play.play_text.str.contains('1ST'), 'new_distance'] = 10\n",
	"\n",
	"regular_play.loc[~regular_play.play_type.isin(turnover_play_type) & ~regular_play.play_text.str.contains('1ST'), 'new_down'] = regular_play.down + 1\n",
	"regular_play.loc[~regular_play.play_type.isin(turnover_play_type) & ~regular_play.play_text.str.contains('1ST'), 'new_distance'] = regular_play.distance - regular_play.yards_gained\n",
	"\n",
	"regular_play.loc[regular_play.play_text.str.contains('50 yard line'), 'new_yardline'] = 50\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_yardline'] = 100- (regular_play.yard_line + regular_play.yards_gained) \n",
	"regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_down'] = 1\n",
	"regular_play.loc[regular_play.play_type == 'Fumble Recovery (Opponent)', 'new_distance'] = 10\n",
	"\n",
	"regular_play.loc[regular_play.play_type == 'Sack', 'new_yardline'] = 100- (regular_play.yard_line - regular_play.yards_gained)\n",
	"regular_play.loc[regular_play.play_type == 'Sack', 'new_down'] = regular_play.down + 1\n",
	"regular_play.loc[regular_play.play_type == 'Sack', 'new_distance'] = regular_play.distance - regular_play.yards_gained"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"C:\\Users\\kayiu\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
	"A value is trying to be set on a copy of a slice from a DataFrame.\n",
	"Try using .loc[row_indexer,col_indexer] = value instead\n",
	"\n",
	"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
	" after removing the cwd from sys.path.\n",
	"C:\\Users\\kayiu\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: SettingWithCopyWarning: \n",
	"A value is trying to be set on a copy of a slice from a DataFrame.\n",
	"Try using .loc[row_indexer,col_indexer] = value instead\n",
	"\n",
	"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
	" \n"
	]
	}
	],
	"source": [
	"#Collect end of play yardline information (e.g. Alab 38 = Alabama own 38) from play_text and match the team abbreviation\n",
	"\n",
	"temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.off_abbr.values.astype(str)) >= 0] \n",
	"temp_df['split_string'] = [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.off_abbr.values.astype(str)))]\n",
	"regular_play.loc[temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].index, 'new_yardline'] = 100-np.array(temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].split_string.str.extract(r'(\\d+)').astype(float)).ravel()\n",
	"\n",
	"temp_df = regular_play.iloc[np.char.find(regular_play.play_text.values.astype(str), regular_play.def_abbr.values.astype(str)) >= 0]\n",
	"temp_df['split_string'] = [x[1] for x in list(np.char.split(temp_df.play_text.values.astype(str),sep =temp_df.def_abbr.values.astype(str)))]\n",
	"regular_play.loc[temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].index, 'new_yardline'] = np.array(temp_df[temp_df.play_text.str.contains('\\d+', regex=True)].split_string.str.extract(r'(\\d+)').astype(float)).ravel()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play.loc[pd.isna(regular_play.new_yardline),'new_distance'] = regular_play.distance - regular_play.yards_gained \n",
	"regular_play.loc[pd.isna(regular_play.new_yardline),'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained\n",
	"\n",
	"regular_play.loc[regular_play.play_type == 'Pass Incompletion', 'new_yardline'] = regular_play.adjusted_yardline\n",
	"\n",
	"regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_yardline'] = 80\n",
	"regular_play.loc[regular_play.play_text.str.contains('touchback'), 'new_down'] = 1\n",
	"\n",
	"#Fake data for model prediction, EP will be changed after processing the data\n",
	"\n",
	"regular_play.loc[regular_play.play_type == 'Safety', 'new_yardline'] = 99 #Fake yardline for Safety\n",
	"\n",
	"regular_play.loc[regular_play.play_type.isin(off_TD),'new_down'] = 1 #Fake new down for Offensive tocuhdown play\n",
	"regular_play.loc[regular_play.play_type.isin(off_TD),'new_distance'] = 10 #Fake new yards to go for Offensive tocuhdown play\n",
	"\n",
	"regular_play.loc[(regular_play.play_type.isin(off_TD) \| regular_play.play_type.isin(def_TD)),'new_yardline'] = 99 #Fake yardline for Offensive tocuhdown play\n",
	"\n",
	"regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'turnover'] = 1 #Turnover on down\n",
	"regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_down'] = 1 \n",
	"regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_distance'] = 10\n",
	"regular_play.loc[(regular_play.new_down > 4) & ~(regular_play.play_type.isin(off_TD)),'new_yardline'] = 100-regular_play.new_yardline\n",
	"\n",
	"\n",
	"regular_play.loc[((regular_play.new_yardline <= 0) \|(regular_play.new_distance <= 0)) & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_down' ] = 1 #Strip sack\n",
	"regular_play.loc[((regular_play.new_yardline <= 0) \|(regular_play.new_distance <= 0)) & (regular_play.play_type == 'Sack') & (regular_play.play_text.str.contains('return')), 'new_distance' ] = 10 \n",
	"regular_play.loc[((regular_play.new_yardline <= 0) \|(regular_play.new_distance <= 0)) & (regular_play.play_text.str.contains('return')), 'new_yardline' ] = 100-(regular_play.adjusted_yardline - regular_play.yards_gained)\n",
	"regular_play.loc[ regular_play.play_text.str.contains('return'), 'turnover' ] = 1 \n",
	"\n",
	"regular_play.loc[regular_play.new_distance <= 0, 'new_down'] = 1 #First down not in API\n",
	"regular_play.loc[regular_play.new_distance <= 0, 'new_distance'] = 10"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play.loc[regular_play.new_yardline <= 0 ,'new_yardline'] = regular_play.adjusted_yardline - regular_play.yards_gained"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play.loc[regular_play.play_text.str.contains('TOUCHDOWN'),'new_yardline'] = 99"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Throw away some plays with error"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play = regular_play[(regular_play.new_yardline > 0) & (regular_play.new_yardline < 100) & (regular_play.adjusted_yardline > 0) & (regular_play.adjusted_yardline < 100)]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Calculate expected point at the end of play. Since statsmodels take column name input in prediction we first extract and rename feature columns"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {},
	"outputs": [],
	"source": [
	"out_df = pd.DataFrame({'down':regular_play['new_down'],'distance':regular_play['new_distance'],'adjusted_yardline':regular_play['new_yardline']})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {},
	"outputs": [],
	"source": [
	"EP_predict = clf.predict_proba(out_df[['down','distance','adjusted_yardline']])\n",
	"EP = EP_predict[:,0]* -7 + EP_predict[:,1] * -3 + EP_predict[:,2] * -2 + EP_predict[:,4] * 2 + EP_predict[:,5] * 3 + EP_predict[:,6] * 7\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play['EP_end'] = EP"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Finally setting the expected point at end of touchdown and safety play to 7 and -2, and reverse the number for turnover plays"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play.loc[(regular_play.play_type.isin(off_TD) \| regular_play.play_type.isin(def_TD) \| regular_play.play_text.str.contains('TOUCHDOWN')),'EP_end'] = 7"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play.loc[(regular_play.play_type.isin(turnover_play_type)\| regular_play.turnover == 1),'EP_end'] *= -1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play.loc[regular_play.play_type == 'Safety','EP_end'] = -2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play['EPA'] = regular_play['EP_end'] - regular_play['EP_start']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 38,
	"metadata": {},
	"outputs": [],
	"source": [
	"pass_play_type = ['Sack','Pass Incompletion','Pass Interception Return','Pass Reception','Interception Return Touchdown','Passing Touchdown','Pass Completion','Pass Interception']\n",
	"rush_play_type = ['Fumble Recovery (Opponent)','Fumble Recovery (Own)','Fumble Return Touchdown','Rush','Rushing Touchdown']"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Check EPA by play type"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.03542604861003451"
	]
	},
	"execution_count": 39,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"regular_play[regular_play.play_type.isin(pass_play_type)]['EPA'].mean()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"-0.02507176193772802"
	]
	},
	"execution_count": 40,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"regular_play[regular_play.play_type.isin(rush_play_type)]['EPA'].mean()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"play_type\n",
	"Fumble Recovery (Opponent) -4.600877\n",
	"Fumble Recovery (Own) -0.935820\n",
	"Fumble Return Touchdown -7.319282\n",
	"Interception Return Touchdown -7.766629\n",
	"Pass Incompletion -1.104547\n",
	"Pass Interception Return -3.544499\n",
	"Pass Reception 0.912590\n",
	"Passing Touchdown 3.536264\n",
	"Rush -0.071102\n",
	"Rushing Touchdown 2.217327\n",
	"Sack -1.874424\n",
	"Safety -0.429088\n",
	"Name: EPA, dtype: float64"
	]
	},
	"execution_count": 41,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"regular_play.groupby('play_type')['EPA'].mean()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 42,
	"metadata": {},
	"outputs": [],
	"source": [
	"regular_play.to_csv('CFB_regular_play_18.csv')"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}