Created
January 26, 2017 23:08
-
-
Save ryangooch/db17932967f5b624424d557ac3727773 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# ## Automated NCAA Tournament Bracket Filling with Python and Pandas | |
# In[1]: | |
from urllib.request import urlretrieve | |
import numpy as np | |
import pandas as pd | |
from openpyxl import Workbook, load_workbook | |
import csv | |
import datetime | |
import warnings | |
warnings.filterwarnings('ignore') | |
# In[2]:
# Fetch the composite ratings CSV from masseyratings.com and cache it locally.
urlretrieve('http://www.masseyratings.com/cb/compare.csv', 'masseyratings.csv')

# In[3]:
# Slurp the entire CSV into a list of rows. The file has a variable-length
# header section, so it is parsed by hand further down rather than handed
# straight to pandas.
with open('masseyratings.csv', newline='') as csvfile:
    data = list(csv.reader(csvfile))
# In[4]:
# The csv file isn't formatted in a way pandas can read directly: a
# variable-length header section precedes the data. Scan forward until the
# row whose first field is 'Team', which marks the start of the data.
i = 0  # number of non-empty rows seen before the 'Team' row
for row in data:
    if not row:
        continue  # blank rows are neither counted nor end the scan
    if row[0] == 'Team':
        break
    i += 1

# The header rows hold the abbreviations and URLs of the individual
# ranking systems in the dataset -- might be useful later.
header_data = data[:i + 2]
team_data = data[i + 3:]

# The first row of the data section carries the column names; strip the
# leading whitespace the file pads them with.
column_names = [name.lstrip() for name in team_data[0]]
# In[5]:
# Load the data section into a DataFrame (row 0 holds the column names and
# row 1 is a separator, so the real records start at row 2).
team_data_df = pd.DataFrame(team_data[2:], columns=column_names)

# In[6]:
# Aggregate / human-poll columns that must not feed the computer average.
cols_to_drop = ["WL", "Rank", "Mean", "Trimmed", "Median", "StDev", "AP", "USA"]
# Split into computer ratings and human (poll) ratings.
comp_ratings = team_data_df.drop(cols_to_drop, axis=1)  # drop() already returns a new frame
# .copy() so the column writes below go to a real frame, not a view of
# team_data_df (SettingWithCopyWarning, currently masked by filterwarnings).
human_ratings = team_data_df[['Team', 'Conf', 'AP', 'USA']].copy()

# In[7]:
human_ratings.head(5)

# In[8]:
# summary_df accumulates the derived columns below, so take a real copy
# rather than a view of team_data_df.
summary_df = team_data_df[["Team", "Conf"]].copy()

# In[9]:
# The poll columns carry padding whitespace; strip it so to_numeric works.
human_ratings['AP'] = human_ratings['AP'].str.strip()
human_ratings['USA'] = human_ratings['USA'].str.strip()
# NOTE(review): errors='ignore' leaves a column completely untouched if any
# single value in it fails to parse, and is deprecated in recent pandas.
# Kept as-is to preserve behaviour; errors='coerce' on the rating columns
# only may be what was intended -- confirm against the source file.
human_ratings = human_ratings.apply(lambda x: pd.to_numeric(x, errors='ignore'))
comp_ratings = comp_ratings.apply(lambda x: pd.to_numeric(x, errors='ignore'))

# In[10]:
# Row-wise averages: every computer system for comp_mean; the two human
# polls (skipping blanks) for human_mean.
summary_df["comp_mean"] = comp_ratings.mean(axis=1, numeric_only=True)
summary_df["human_mean"] = human_ratings.mean(axis=1, numeric_only=True, skipna=True)

# In[11]:
human_ratings.mean(axis=1, skipna=True)

# In[12]:
summary_df
# In[13]: | |
def rank_calc(x, y):
    """Blend a computer mean x with a human mean y, weighting the
    computers 2:1; when no human rating exists (y is NaN), use x alone."""
    return x if np.isnan(y) else (2.0 * x + y) / 3.0
# In[14]:
# Blend the two averages into a single final rank for every team.
summary_df["final_rank"] = np.vectorize(rank_calc)(
    summary_df["comp_mean"], summary_df["human_mean"])

# In[15]:
summary_df.sort_values(by=['final_rank'], inplace=True)

# In[16]:
summary_df.head(5)

# In[17]:
# With final rank computed, determine the automatic bids (best team in
# each conference) and then the at-large selections.
auto_bid_confs = pd.unique(summary_df['Conf'])

# In[ ]:

# In[18]:
# Peek at the best-ranked Big East team. (.ix was removed from pandas;
# .iloc[0] on the rank-sorted frame gives the top row positionally.)
summary_df[summary_df['Conf'].isin(['BE'])].iloc[0]['Team']

# In[19]:
summary_df.head(5)

# In[20]:
# groupby().head(1) on the rank-sorted frame -> each conference champion
auto_bid_teams = summary_df.groupby(['Conf']).head(1)['Team'].values
# ~isin excludes the champions; the best 36 remaining are the at-larges
at_large_teams = summary_df[~summary_df['Team'].isin(auto_bid_teams)].head(36)['Team'].values
# all 68 tournament teams in one array
all_68 = np.append(auto_bid_teams, at_large_teams)

# In[21]:
all_68

# In[22]:
# .copy(): seed/excel columns are assigned onto this frame later, and
# writing into a bare slice of summary_df would hit SettingWithCopyWarning.
final_68 = summary_df[summary_df['Team'].isin(all_68)].copy()

# In[23]:
final_68
# In[24]:
# Seed line for each of the 68 teams in rank order. The First Four means
# seeds 11 and 16 carry six teams each; every other seed line carries four.
seeds = np.repeat(
    np.arange(1, 17),
    [6 if s in (11, 16) else 4 for s in range(1, 17)],
)
# In[25]:
# Attach the seed line to each team. final_68 is carved out of summary_df
# above, so use assign() -- which returns a new frame -- instead of writing
# into what may be a view and tripping SettingWithCopyWarning.
final_68 = final_68.assign(seed=seeds)

# In[26]:
final_68.head(16)
# In[27]:
# Excel cell for each team, in final_68 (seed-then-rank) order. The bracket
# workbook uses the 'snake' placement, so the mapping from (seed, rank) to
# cell is irregular -- easiest to hard-code the table, for now.
excel_placements = [
    'C7', 'R7', 'R39', 'C39',    # 1 seeds
    'C67', 'R67', 'R35', 'C35',  # 2 seeds
    'C27', 'R27', 'R59', 'C59',  # 3 seeds
    'C51', 'R51', 'R19', 'C19',  # 4 seeds
    'C15', 'R15', 'R47', 'C47',  # 5 seeds
    'C55', 'R55', 'R23', 'C23',  # 6 seeds
    'C31', 'R31', 'R63', 'C63',  # 7 seeds
    'C43', 'R43', 'R11', 'C11',  # 8 seeds
    'C13', 'R13', 'R45', 'C45',  # 9 seeds
    'C65', 'R65', 'R33', 'C33',  # 10 seeds
    'C25', 'R25', 'Q71', 'Q73', 'D71', 'D73',  # 11 seeds (First Four)
    'C49', 'R49', 'R17', 'C17',  # 12 seeds
    'C21', 'R21', 'R53', 'C53',  # 13 seeds
    'C61', 'R61', 'R29', 'C29',  # 14 seeds
    'C37', 'R37', 'R69', 'C69',  # 15 seeds
    'C41', 'R41', 'O71', 'O73', 'F71', 'F73',  # 16 seeds (First Four)
]
# In[28]:
# Pair every team with its bracket cell. assign() returns a new frame,
# sidestepping the SettingWithCopyWarning that direct column assignment
# into a slice of summary_df would raise.
final_68 = final_68.assign(excel=excel_placements)

# In[29]:
# Sanity check before writing the workbook (template is 'bracket.xlsx').
final_68.head(5)

# In[30]:
for index, row in final_68.iterrows():
    print(row['Team'], row['excel'])
# In[31]:
# Write each team name into its bracket cell of a fresh workbook.
wb = Workbook()
ws = wb.active
for _, entry in final_68.iterrows():
    ws[entry['excel']] = entry['Team']

# In[32]:
# Stamp today's date into the file name and save the bracket.
today = str(datetime.date.today())
save_file = f'bracket{today}.xlsx'
wb.save(save_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment