# coding: utf-8
# ## Automated NCAA Tournament Bracket Filling with Python and Pandas
# In[1]:
from urllib.request import urlretrieve
import numpy as np
import pandas as pd
from openpyxl import Workbook, load_workbook
import csv
import datetime
import warnings
warnings.filterwarnings('ignore')
# In[2]:
# Get the composite csv from masseyratings.com
urlretrieve('http://www.masseyratings.com/cb/compare.csv', 'masseyratings.csv')
# In[3]:
# need to parse the csv
# first row of interest starts at "team" in column 1
# data starts two rows after
data = []
with open('masseyratings.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        data.append(row)
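# A quick peek at the first few raw rows (the exact contents depend on what
# masseyratings.com is serving that day) shows the header block that the next
# cell has to skip past before the 'Team' row appears.
data[:5]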
# In[4]:
# The csv file wasn't formatted in a way pandas could read.
# There is a header section followed by the data, but the header
# section is variable in length. We can simply iterate through
# until we see the 'Team' string indicating the start of the data
i = 0  # counter of lines until 'Team' is found
for row in data:
    if row and row[0] != 'Team':
        i = i + 1
    if row and row[0] == 'Team':
        break
# The header data contains the abbreviations and URLs for the
# various ranking systems in the dataset; they might be useful later
header_data = data[:i+2]
team_data = data[i+3:]
column_names = team_data[0] # Dataset column names
# strip whitespace from the column names for normalization
for i in range(len(column_names)):
    column_names[i] = column_names[i].lstrip()
# In[5]:
# load the data set into a pandas DataFrame
team_data_df = pd.DataFrame(team_data[2:], columns=column_names)
# In[6]:
# Drop columns that are unnecessary
cols_to_drop = ["WL", "Rank", "Mean", "Trimmed", "Median", "StDev", "AP", "USA"]  # columns we won't use
# Split off human ratings and computer ratings
comp_ratings = team_data_df.drop(cols_to_drop, axis=1)  # computer ratings
human_ratings = team_data_df[['Team', 'Conf', 'AP', 'USA']].copy()  # human ratings; .copy() avoids chained-assignment warnings
# In[7]:
human_ratings.head(5)
# In[8]:
summary_df = team_data_df[["Team", "Conf"]].copy()  # this will hold the final results
# In[9]:
human_ratings['AP'] = human_ratings['AP'].str.strip()
human_ratings['USA'] = human_ratings['USA'].str.strip()
human_ratings = human_ratings.apply(lambda x: pd.to_numeric(x, errors='ignore'))
comp_ratings = comp_ratings.apply(lambda x: pd.to_numeric(x, errors='ignore'))
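# Worth a quick check: to_numeric with errors='ignore' leaves a column
# entirely as strings if any single value in it fails to parse, so it's worth
# confirming the rating columns actually came out numeric before averaging.
human_ratings.dtypes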
# In[10]:
summary_df["comp_mean"] = comp_ratings.mean(axis=1,numeric_only=True)
summary_df["human_mean"] = human_ratings.mean(axis=1,numeric_only=True,skipna=True)
# In[11]:
human_ratings.mean(axis=1,skipna=True)
# In[12]:
summary_df
# In[13]:
def rank_calc(x, y):
    if np.isnan(y):
        return x
    else:
        return (2 * x + y) / 3.
# In[14]:
summary_df["final_rank"] = np.vectorize(rank_calc)(
summary_df["comp_mean"], summary_df["human_mean"])
# In[15]:
summary_df.sort_values(by=['final_rank'],inplace=True)
# In[16]:
summary_df.head(5)
# In[17]:
# now that the final rank is calculated, we need to find the
# auto-bid and at-large teams
auto_bid_confs = pd.unique(summary_df['Conf'])
# In[18]:
# example: the top-ranked team from the 'BE' conference
summary_df[summary_df['Conf'].isin(['BE'])].iloc[0]['Team']
# In[19]:
summary_df.head(5)
# In[20]:
# Using groupby to grab the auto bids
auto_bid_teams = summary_df.groupby(['Conf']).head(1)['Team'].values
# and we can use ~isin now to get at larges
at_large_teams = summary_df[~summary_df['Team'].isin(auto_bid_teams)].head(36)['Team'].values
# all 68 teams in one array
all_68 = np.append(auto_bid_teams,at_large_teams)
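# Sanity check (this assumes the composite file lists the usual 32 D-I
# conferences): 32 auto bids plus 36 at-large picks should give exactly 68
# unique teams, which the seed assignment below depends on.
assert len(all_68) == len(set(all_68)) == 68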
# In[21]:
all_68
# In[22]:
final_68 = summary_df[summary_df['Team'].isin(all_68)].copy()
# In[23]:
final_68
# In[24]:
# add seeds
seeds = np.array([
1,1,1,1,
2,2,2,2,
3,3,3,3,
4,4,4,4,
5,5,5,5,
6,6,6,6,
7,7,7,7,
8,8,8,8,
9,9,9,9,
10,10,10,10,
11,11,11,11,11,11,
12,12,12,12,
13,13,13,13,
14,14,14,14,
15,15,15,15,
16,16,16,16,16,16
])
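# Sanity check: the seed list is assigned positionally in the next cell, so
# it has to line up one-to-one with the 68 selected teams.
assert len(seeds) == len(final_68)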
# In[25]:
final_68["seed"] = seeds
# In[26]:
final_68.head(16)
# In[27]:
# Need to tell Excel which cells get which team. Because we're
# using the 'snake' seeding method, there are specific cells corresponding
# to each seed and rank. The easiest way to do this is to simply
# hard-code the table in, for now
excel_placements = [
'C7', 'R7', 'R39', 'C39', 'C67', 'R67', 'R35', 'C35',
'C27', 'R27', 'R59', 'C59', 'C51', 'R51', 'R19', 'C19',
'C15', 'R15', 'R47', 'C47', 'C55', 'R55', 'R23', 'C23',
'C31', 'R31', 'R63', 'C63', 'C43', 'R43', 'R11', 'C11',
'C13', 'R13', 'R45', 'C45', 'C65', 'R65', 'R33', 'C33',
'C25', 'R25', 'Q71', 'Q73', 'D71', 'D73', # 11 seeds
'C49', 'R49', 'R17', 'C17', 'C21', 'R21', 'R53', 'C53',
'C61', 'R61', 'R29', 'C29', 'C37', 'R37', 'R69', 'C69',
'C41', 'R41', 'O71', 'O73', 'F71', 'F73'
]
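# Sanity check: every team needs its own cell, so the placement list should
# contain 68 distinct entries in the same seed order as final_68.
assert len(excel_placements) == 68
assert len(set(excel_placements)) == 68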
# In[28]:
# add the cell placements as a column on our final_68 DataFrame
final_68['excel'] = excel_placements
# In[29]:
# input the teams into the workbook. There is a template set up called 'bracket.xlsx'
final_68.head(5)
# In[30]:
for index, row in final_68.iterrows():
    print(row['Team'], row['excel'])
# In[31]:
wb = Workbook()
ws = wb.active
for index, row in final_68.iterrows():
    ws[row['excel']] = row['Team']
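# The cell above writes into a brand-new blank workbook. To fill in the
# pre-made 'bracket.xlsx' template mentioned earlier (presumably why
# load_workbook is imported), a sketch like this should work instead,
# assuming the file sits in the working directory:
#
#     wb = load_workbook('bracket.xlsx')
#     ws = wb.active
#     for index, row in final_68.iterrows():
#         ws[row['excel']] = row['Team']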
# In[32]:
# get today's date, save bracket
today = str(datetime.date.today())
save_file = 'bracket' + today + '.xlsx'
wb.save(save_file)