# coding: utf-8
# ## Automated NCAA Tournament Bracket Filling with Python and Pandas
# In[1]:
from urllib.request import urlretrieve
import numpy as np
import pandas as pd
from openpyxl import Workbook, load_workbook
import csv
import datetime
import warnings
warnings.filterwarnings('ignore')
# In[2]:
# Get the composite csv from masseyratings.com
urlretrieve('http://www.masseyratings.com/cb/compare.csv', 'masseyratings.csv')
# In[3]:
# need to parse the csv
# first row of interest starts at "team" in column 1
# data starts two rows after
data = []
with open('masseyratings.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        data.append(row)
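# A quick peek at the first few raw rows (the exact contents depend on what
# masseyratings.com is serving that day) shows the header block that the next
# cell has to skip past before the 'Team' row appears.
data[:5]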
# In[4]:
# The csv file wasn't formatted in a way pandas could read.
# There is a header section followed by the data, but the header
# section is variable in length. We can simply iterate through
# until we see the 'Team' string indicating the start of the data
i = 0  # counter of lines until 'Team' is found
for row in data:
    if row and row[0] != 'Team':
        i = i + 1
    if row and row[0] == 'Team':
        break
# The header data contains the abbreviations and URLs for the
# various ranking systems in the dataset; they might be useful later
header_data = data[:i+2]
team_data = data[i+3:]
column_names = team_data[0] # Dataset column names
# strip whitespace from the column names for normalization
for i in range(len(column_names)):
    column_names[i] = column_names[i].lstrip()
# In[5]:
# load the data set into a pandas DataFrame
team_data_df = pd.DataFrame(team_data[2:], columns=column_names)
# In[6]:
# Drop columns that are unnecessary
cols_to_drop = ["WL", "Rank", "Mean", "Trimmed", "Median", "StDev", "AP", "USA"]  # columns we won't use
# Split off human ratings and computer ratings
comp_ratings = team_data_df.drop(cols_to_drop, axis=1)  # computer ratings
human_ratings = team_data_df[['Team', 'Conf', 'AP', 'USA']].copy()  # human ratings; .copy() avoids chained-assignment warnings
# In[7]:
human_ratings.head(5)
# In[8]:
summary_df = team_data_df[["Team", "Conf"]].copy()  # this will hold the final results
# In[9]:
human_ratings['AP'] = human_ratings['AP'].str.strip()
human_ratings['USA'] = human_ratings['USA'].str.strip()
human_ratings = human_ratings.apply(lambda x: pd.to_numeric(x, errors='ignore'))
comp_ratings = comp_ratings.apply(lambda x: pd.to_numeric(x, errors='ignore'))
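# Worth a quick check: to_numeric with errors='ignore' leaves a column
# entirely as strings if any single value in it fails to parse, so it's worth
# confirming the rating columns actually came out numeric before averaging.
human_ratings.dtypes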
# In[10]:
summary_df["comp_mean"] = comp_ratings.mean(axis=1,numeric_only=True)
summary_df["human_mean"] = human_ratings.mean(axis=1,numeric_only=True,skipna=True)
# In[11]:
human_ratings.mean(axis=1,skipna=True)
# In[12]:
summary_df
# In[13]:
def rank_calc(x, y):
    if np.isnan(y):
        return x
    else:
        return (2 * x + y) / 3.
# In[14]:
summary_df["final_rank"] = np.vectorize(rank_calc)(
summary_df["comp_mean"], summary_df["human_mean"])
# In[15]:
summary_df.sort_values(by=['final_rank'],inplace=True)
# In[16]:
summary_df.head(5)
# In[17]:
# now that the final rank is calculated, we need to find the
# auto-bid and at-large teams
auto_bid_confs = pd.unique(summary_df['Conf'])
# In[18]:
# example: the top-ranked team from the 'BE' conference
summary_df[summary_df['Conf'].isin(['BE'])].iloc[0]['Team']
# In[19]:
summary_df.head(5)
# In[20]:
# Using groupby to grab the auto bids
auto_bid_teams = summary_df.groupby(['Conf']).head(1)['Team'].values
# and we can use ~isin now to get at larges
at_large_teams = summary_df[~summary_df['Team'].isin(auto_bid_teams)].head(36)['Team'].values
# all 68 teams in one array
all_68 = np.append(auto_bid_teams,at_large_teams)
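# Sanity check (this assumes the composite file lists the usual 32 D-I
# conferences): 32 auto bids plus 36 at-large picks should give exactly 68
# unique teams, which the seed assignment below depends on.
assert len(all_68) == len(set(all_68)) == 68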
# In[21]:
all_68
# In[22]:
final_68 = summary_df[summary_df['Team'].isin(all_68)].copy()
# In[23]:
final_68
# In[24]:
# add seeds
seeds = np.array([
1,1,1,1,
2,2,2,2,
3,3,3,3,
4,4,4,4,
5,5,5,5,
6,6,6,6,
7,7,7,7,
8,8,8,8,
9,9,9,9,
10,10,10,10,
11,11,11,11,11,11,
12,12,12,12,
13,13,13,13,
14,14,14,14,
15,15,15,15,
16,16,16,16,16,16
])
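# Sanity check: the seed list is assigned positionally in the next cell, so
# it has to line up one-to-one with the 68 selected teams.
assert len(seeds) == len(final_68)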
# In[25]:
final_68["seed"] = seeds
# In[26]:
final_68.head(16)
# In[27]:
# Need to tell Excel which cells get which team. Because we're
# using the 'snake' seeding method, there are specific cells corresponding
# to each seed and rank. The easiest way to do this is to simply
# hard-code the table in, for now
excel_placements = [
'C7', 'R7', 'R39', 'C39', 'C67', 'R67', 'R35', 'C35',
'C27', 'R27', 'R59', 'C59', 'C51', 'R51', 'R19', 'C19',
'C15', 'R15', 'R47', 'C47', 'C55', 'R55', 'R23', 'C23',
'C31', 'R31', 'R63', 'C63', 'C43', 'R43', 'R11', 'C11',
'C13', 'R13', 'R45', 'C45', 'C65', 'R65', 'R33', 'C33',
'C25', 'R25', 'Q71', 'Q73', 'D71', 'D73', # 11 seeds
'C49', 'R49', 'R17', 'C17', 'C21', 'R21', 'R53', 'C53',
'C61', 'R61', 'R29', 'C29', 'C37', 'R37', 'R69', 'C69',
'C41', 'R41', 'O71', 'O73', 'F71', 'F73'
]
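# Sanity check: every team needs its own cell, so the placement list should
# contain 68 distinct entries in the same seed order as final_68.
assert len(excel_placements) == 68
assert len(set(excel_placements)) == 68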
# In[28]:
# add the cell placements as a column on our final_68 DataFrame
final_68['excel'] = excel_placements
# In[29]:
# input the teams into the workbook. There is a template set up called 'bracket.xlsx'
final_68.head(5)
# In[30]:
for index, row in final_68.iterrows():
    print(row['Team'], row['excel'])
# In[31]:
wb = Workbook()
ws = wb.active
for index, row in final_68.iterrows():
    ws[row['excel']] = row['Team']
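# The cell above writes into a brand-new blank workbook. To fill in the
# pre-made 'bracket.xlsx' template mentioned earlier (presumably why
# load_workbook is imported), a sketch like this should work instead,
# assuming the file sits in the working directory:
#
#     wb = load_workbook('bracket.xlsx')
#     ws = wb.active
#     for index, row in final_68.iterrows():
#         ws[row['excel']] = row['Team']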
# In[32]:
# get today's date, save bracket
today = str(datetime.date.today())
save_file = 'bracket' + today + '.xlsx'
wb.save(save_file)