# dfeng/scraper.py

## scraper.py
import lxml.html
import requests
import csv
import os

# Root of the site being scraped; request paths are appended to it.
base_url = 'http://fantasyfootballcalculator.com/'
# Directory that receives the CSV files; main() creates it when it is missing.
folder = "./Drafts"
# Number of listing pages get_mock_drafts() requests.
num = 1
# When true, get_mock_drafts() prints progress messages.
verbose = True

def get_mock_drafts():
	"""Scrape the listing of completed mock drafts.

	Requests `num` listing pages (the `list` query offset advances by 25 per
	page) and collects one entry for every row of the "#completed-drafts"
	table.

	Returns:
		A list of dicts, each holding:
			'url'        -- href of the first link inside the row's ninth cell
			'totalteams' -- text of the row's fifth cell (kept as a string)
		Pages answered with a non-OK HTTP status and rows that lack the
		expected cells or link are skipped.
	"""
	# Single-argument print() and range() behave the same on Python 2 and 3.
	if verbose:
		print("Scraping the list of mock drafts")
	drafts = []
	for page in range(num):
		url = base_url + "completed_drafts.php?format=standard&teams=all&list=%d" % (page * 25)
		response = requests.get(url)
		# Same status check get_picks() applies: never parse an error page.
		if response.status_code != requests.codes.ok:
			if verbose:
				print("Skipping %s (HTTP %d)" % (url, response.status_code))
			continue
		dom = lxml.html.fromstring(response.text)
		for tr in dom.cssselect("#completed-drafts tbody tr"):
			# A row without nine cells, or without a link in the ninth one,
			# cannot be followed to a draft page.
			if len(tr) < 9:
				continue
			links = list(tr[8])
			if not links:
				continue
			drafts.append({
				'url': links[0].get('href'),
				'totalteams': tr[4].text,
			})
	if verbose:
		print("Scraped %d mock draft urls" % len(drafts))
	return drafts

def get_picks(draft):
	"""Download one mock draft board and return its picks.

	Args:
		draft: dict with 'url' (path appended to base_url) and 'totalteams'
			(number of teams; anything int() accepts).

	Returns:
		False when the HTTP status is not OK. Otherwise a list of dicts, one
		per draft-board cell, with the keys 'name', 'position', 'team',
		'number', 'teamname', 'teamtype' and 'draftposition' (1-based).
	"""
	url = base_url + draft['url']
	totalteams = int(draft['totalteams'])
	r = requests.get(url)
	if r.status_code != requests.codes.ok:
		return False
	dom = lxml.html.fromstring(r.text)

	def markup_after_tag(element):
		# Serialise the element (including its trailing text) and drop the
		# first four characters -- presumably a "<br>" tag; TODO confirm
		# against the page markup.
		raw = lxml.html.tostring(element)
		# tostring() yields bytes on Python 3; decode so the str operations
		# on the result work there too. On Python 2 the value is already str.
		if not isinstance(raw, str):
			raw = raw.decode('utf-8')
		return raw[4:]

	teams = []
	for th in dom.cssselect("#headRow th[class!='roundCol']"):
		# A header cell without a class attribute is recorded as 'human'.
		teams.append({'name': th.text, 'type': th.get('class') or 'human'})

	cells = dom.cssselect("#draftboardBody td[class!='rowLabel']")
	draftno = len(cells)
	# Explicit floor division: these values are row/column indices and must
	# stay integers on Python 3 and under `from __future__ import division`.
	totalrows = draftno // totalteams
	# Number of cells missing from the last row (0 when the board is full).
	gap = (totalteams - draftno) % totalteams
	# The partial last row has an odd 0-based index, i.e. it is one of the
	# rows numbered in reverse column order, so its cells need shifting.
	ugly = totalrows % 2 == 1 and gap != 0

	picks = []
	for i, td in enumerate(cells):
		children = list(td)
		pick = dict()
		pick['name'] = " ".join([td.text, markup_after_tag(children[0])])
		details = markup_after_tag(children[1]).replace("(", "").replace(")", "")
		pick['position'], pick['team'], pick['number'] = details.split(" ")

		row, mod = divmod(i, totalteams)
		if row % 2 == 0:
			pos = i
		else:
			# Odd rows count their positions in reverse column order.
			pos = (row + 1) * totalteams - mod - 1
		# Edge case: the table did not fill up and ends on a reversed row,
		# which would otherwise misalign both the team and the position.
		if row == totalrows and ugly:
			mod = mod + gap
			pos = pos - gap
		pick['teamname'] = teams[mod]['name']
		pick['teamtype'] = teams[mod]['type']
		pick['draftposition'] = pos + 1
		picks.append(pick)
	return picks

def dic_to_csv(picks,path):
	"""Write mock-draft picks to a CSV file with a header row.

	Args:
		picks: list of dicts, one per pick; the keys of the first dict become
			the CSV columns.
		path: file path of the CSV file to create (overwritten if present).

	An empty `picks` offers no keys to build a header from, so nothing is
	written and no file is created.
	"""
	import sys  # local import: only needed to choose the file mode below

	if not picks:
		return
	keys = list(picks[0].keys())
	# The csv module needs a binary file on Python 2 but a text file opened
	# with newline='' on Python 3.
	if sys.version_info[0] >= 3:
		f = open(path, 'w', newline='')
	else:
		f = open(path, 'wb')
	with f:
		dt = csv.DictWriter(f, keys)
		dt.writeheader()
		dt.writerows(picks)

def main():
	"""Scrape every listed mock draft that has no CSV file yet.

	Creates `folder` when it does not exist, then writes one CSV per draft
	returned by get_mock_drafts(). A draft is skipped when its CSV file is
	already present or when get_picks() returns nothing usable.
	"""
	if not os.path.exists(folder):
		os.makedirs(folder)
	for entry in get_mock_drafts():
		# The file name is the draft url minus its first six characters
		# (presumably a path prefix -- verify against the listing markup).
		draft_id = entry['url'][6:]
		target = folder + "/" + draft_id + '.csv'
		if os.path.isfile(target):
			continue
		results = get_picks(entry)
		if not results:
			continue
		dic_to_csv(results, target)

# Entry point: there is no __main__ guard, so the scraper runs whenever this
# module is loaded, whether it is executed as a script or imported.
main()
	import lxml.html
	import requests
	import csv
	import os

	# Root of the site being scraped; request paths are appended to it.
	base_url = 'http://fantasyfootballcalculator.com/'
	# Directory that receives the CSV files; main() creates it when it is missing.
	folder = "./Drafts"
	# Number of listing pages get_mock_drafts() requests.
	num = 1
	# When true, get_mock_drafts() prints progress messages.
	verbose = True

	def get_mock_drafts():
		"""Scrape the listing of completed mock drafts.

		Requests `num` listing pages (the `list` query offset advances by 25
		per page) and collects one entry for every row of the
		"#completed-drafts" table.

		Returns:
			A list of dicts, each holding:
				'url'        -- href of the first link inside the row's ninth cell
				'totalteams' -- text of the row's fifth cell (kept as a string)
			Pages answered with a non-OK HTTP status and rows that lack the
			expected cells or link are skipped.
		"""
		# Single-argument print() and range() behave the same on Python 2 and 3.
		if verbose:
			print("Scraping the list of mock drafts")
		drafts = []
		for page in range(num):
			url = base_url + "completed_drafts.php?format=standard&teams=all&list=%d" % (page * 25)
			response = requests.get(url)
			# Do not parse an error page.
			if response.status_code != requests.codes.ok:
				if verbose:
					print("Skipping %s (HTTP %d)" % (url, response.status_code))
				continue
			dom = lxml.html.fromstring(response.text)
			for tr in dom.cssselect("#completed-drafts tbody tr"):
				# A row without nine cells, or without a link in the ninth
				# one, cannot be followed to a draft page.
				if len(tr) < 9:
					continue
				links = list(tr[8])
				if not links:
					continue
				drafts.append({
					'url': links[0].get('href'),
					'totalteams': tr[4].text,
				})
		if verbose:
			print("Scraped %d mock draft urls" % len(drafts))
		return drafts

	def get_picks(draft):
		"""Download one mock draft board and return its picks.

		Args:
			draft: dict with 'url' (path appended to base_url) and
				'totalteams' (number of teams; anything int() accepts).

		Returns:
			False when the HTTP status is not OK. Otherwise a list of dicts,
			one per draft-board cell, with the keys 'name', 'position',
			'team', 'number', 'teamname', 'teamtype' and 'draftposition'
			(1-based).
		"""
		url = base_url + draft['url']
		totalteams = int(draft['totalteams'])
		r = requests.get(url)
		if r.status_code != requests.codes.ok:
			return False
		dom = lxml.html.fromstring(r.text)

		def markup_after_tag(element):
			# Serialise the element (including its trailing text) and drop
			# the first four characters -- presumably a "<br>" tag; TODO
			# confirm against the page markup.
			raw = lxml.html.tostring(element)
			# tostring() yields bytes on Python 3; decode so the str
			# operations on the result work there too.
			if not isinstance(raw, str):
				raw = raw.decode('utf-8')
			return raw[4:]

		teams = []
		for th in dom.cssselect("#headRow th[class!='roundCol']"):
			# A header cell without a class attribute is recorded as 'human'.
			teams.append({'name': th.text, 'type': th.get('class') or 'human'})

		cells = dom.cssselect("#draftboardBody td[class!='rowLabel']")
		draftno = len(cells)
		# Explicit floor division: these values are row/column indices and
		# must stay integers on Python 3 as well.
		totalrows = draftno // totalteams
		# Number of cells missing from the last row (0 when the board is full).
		gap = (totalteams - draftno) % totalteams
		# The partial last row has an odd 0-based index, i.e. it is one of
		# the rows numbered in reverse column order, so its cells need shifting.
		ugly = totalrows % 2 == 1 and gap != 0

		picks = []
		for i, td in enumerate(cells):
			children = list(td)
			pick = dict()
			pick['name'] = " ".join([td.text, markup_after_tag(children[0])])
			details = markup_after_tag(children[1]).replace("(", "").replace(")", "")
			pick['position'], pick['team'], pick['number'] = details.split(" ")

			row, mod = divmod(i, totalteams)
			if row % 2 == 0:
				pos = i
			else:
				# Odd rows count their positions in reverse column order.
				pos = (row + 1) * totalteams - mod - 1
			# Edge case: the table did not fill up and ends on a reversed
			# row, which would otherwise misalign both team and position.
			if row == totalrows and ugly:
				mod = mod + gap
				pos = pos - gap
			pick['teamname'] = teams[mod]['name']
			pick['teamtype'] = teams[mod]['type']
			pick['draftposition'] = pos + 1
			picks.append(pick)
		return picks

	def dic_to_csv(picks,path):
		"""Write mock-draft picks to a CSV file with a header row.

		Args:
			picks: list of dicts, one per pick; the keys of the first dict
				become the CSV columns.
			path: file path of the CSV file to create (overwritten if present).

		An empty `picks` offers no keys to build a header from, so nothing is
		written and no file is created.
		"""
		import sys  # local import: only needed to choose the file mode below

		if not picks:
			return
		keys = list(picks[0].keys())
		# The csv module needs a binary file on Python 2 but a text file
		# opened with newline='' on Python 3.
		if sys.version_info[0] >= 3:
			f = open(path, 'w', newline='')
		else:
			f = open(path, 'wb')
		with f:
			dt = csv.DictWriter(f, keys)
			dt.writeheader()
			dt.writerows(picks)

	def main():
		"""Scrape every listed mock draft that has no CSV file yet.

		Creates `folder` when it does not exist, then writes one CSV per
		draft returned by get_mock_drafts(). A draft is skipped when its CSV
		file is already present or when get_picks() returns nothing usable.
		"""
		if not os.path.exists(folder):
			os.makedirs(folder)
		for draft in get_mock_drafts():
			# The file name is the draft url minus its first six characters
			# (presumably a path prefix -- verify against the listing markup).
			mockid = draft['url'][6:]
			filepath = "".join([folder, "/", mockid, '.csv'])
			if not os.path.isfile(filepath):
				picks = get_picks(draft)
				if picks:
					dic_to_csv(picks,filepath)

	# Entry point: a bare call with no __main__ guard.
	main()