Created
September 26, 2012 02:25
-
-
Save dfeng/3785641 to your computer and use it in GitHub Desktop.
Horse Scrape
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import lxml.html | |
from lxml import etree | |
import requests | |
import csv | |
import os | |
import datetime | |
# Root of the per-day racing pages; get_races appends a date slug to it
# (for example "21-may-2010") to form the URL it downloads.
base_url = "https://gg.com/racing/"
def get_dates_interval(start, finish, format):
    """Yield every day from start to finish (both inclusive) as a date slug.

    start, finish: date strings, both parsed with the strptime pattern `format`
    format: strptime pattern, e.g. "%Y-%m-%d"

    Each yielded value is the day formatted with "%d-%b-%Y" and lower-cased,
    e.g. "03-aug-2011". Nothing is yielded when start is later than finish.
    Raises ValueError (from strptime) when a date string does not match
    `format`; as this is a generator, that happens on the first iteration.
    """
    # NOTE: `format` shadows the builtin, but the name is part of the signature.
    current = datetime.datetime.strptime(start, format)
    last = datetime.datetime.strptime(finish, format)
    one_day = datetime.timedelta(days=1)
    # Check the bound *before* yielding so a reversed interval produces no
    # dates at all instead of emitting the start day once.
    while current <= last:
        # NOTE(review): %b is locale dependent; the lower-cased English
        # abbreviations are presumably what the site URLs expect.
        yield current.strftime("%d-%b-%Y").lower()
        current += one_day
def get_races(date):
    """Return the href of every race result linked from one day's page.

    date: date slug appended to base_url to build the page address
    """
    response = requests.get(base_url + date)
    page = etree.HTML(response.text)
    # A race that was never run can still be linked from the day page
    # (see: https://gg.com/racing/21-may-2010). Only links sitting in the
    # first td after the parent of a span with class "result" are taken,
    # which leaves those out.
    result_links = page.xpath("//span[@class='result']/../following-sibling::td[1]/a")
    return [link.get("href") for link in result_links]
def _first_fragment(element):
    # Serialise the element (tail text included) as text and return whatever
    # sits between the first and the second ">". Asking lxml for text rather
    # than bytes keeps the str split working on Python 3 as well as Python 2.
    return lxml.html.tostring(element, encoding="unicode").split(">")[1]


def get_results(url):
    """Download one race page and return a list with one dict per horse.

    url: address of the race page; its last two path segments are read as
         "<date>/<course>-<time>"

    Every dict holds the per-horse keys finish, place, no, draw, additional,
    form, horse, horse_id, jockey, jockey_id, trainer, trainer_id, owner,
    explain and odds, plus the race-wide keys date, course, time, condition
    and nonrunners, which are repeated on each row.
    """
    # progress output; the parenthesised form works on Python 2 and 3 alike
    print(url)
    r = requests.get(url)
    dom = lxml.html.fromstring(r.text)
    race = []
    date, coursetime = url.split("/")[-2:]
    course, time = coursetime.rsplit("-", 1)

    # retrieving the racecourse condition (the last match wins)
    condition = ""
    for i in dom.cssselect("h1.winning-post .going"):
        condition = i.text

    # retrieving a list of nonrunners
    nonrunners = ""
    for i in dom.cssselect(".footnote"):
        nonrunners = i.text.split(", ")
    # for now, let's simply take the number of nonrunners
    num_nonrunners = len(nonrunners)

    # looping over the table cells in the results table, which gives you
    # 4 cells per horse
    for i, row in enumerate(dom.cssselect("#race-card tr td")):
        mod = i % 4
        # floor division keeps the position an integer on Python 3 too
        pos = i // 4
        # first cell contains finish, place, ...
        if mod == 0:
            horse = dict()
            # we calculate finish ourselves. Sometimes a horse has no ordinal
            # such as "1st" (for example, when it was injured), so the
            # displayed place alone is not enough
            horse['finish'] = pos + 1
            # we also extract the ordinal number
            horse['place'] = row.text
            horse['no'], horse['draw'] = row[1].text.replace("(", "").replace(")", "").split(" ")
        # second cell
        elif mod == 1:
            horse['additional'] = ""
            horse['form'] = _first_fragment(row[1])
            # additional is the interesting information that sometimes is
            # shown, like if this horse won the previous race
            for child in row:
                if child.tag == "a":
                    horse['additional'] = child.text
        # third cell: horse, jockey, trainer and owner sit at even child indexes
        elif mod == 2:
            horse['horse'] = row[0].text
            horse['horse_id'] = row[0].get("href").split("-")[-1]
            horse['jockey'] = row[2].text
            horse['jockey_id'] = row[2].get("href").split("-")[-1]
            horse['trainer'] = row[4].text
            horse['trainer_id'] = row[4].get("href").split("-")[-1]
            horse['owner'] = row[6].text
        # fourth cell: odds and/or an explanation; it also completes the horse
        elif mod == 3:
            horse['explain'] = ""
            # default so that every row ends up with the same set of keys,
            # whatever the number of children in this cell
            horse['odds'] = ""
            if len(row) == 1:
                # situation where you don't have odds, so it's a withdrawal
                horse['explain'] = _first_fragment(row[0])
            elif len(row) == 2:
                # situation where you definitely have odds
                horse['odds'] = row[0].text
            # when there is no place (always given with a dash), then there
            # is always (?) an explanation; it needs a second child to read
            if horse['place'] == "-" and len(row) >= 2:
                horse['explain'] = _first_fragment(row[1])
            # these are just common information for each race. since we are
            # dealing with a single flat file, the redundancy can't be helped
            horse['date'] = date
            horse['course'] = course
            horse['time'] = time
            horse['condition'] = condition
            horse['nonrunners'] = num_nonrunners
            race.append(horse)
    return race
def dic_to_csv(d, path):
    """Write a list of row dictionaries to a CSV file with a header line.

    d: list of dicts, one per row; the column names come from d[0]
    path: filepath to the csv file to be created (an existing file is replaced)

    An empty d has no column names to take, so it produces an empty file.
    Keys missing from a later row are written as empty cells; a later row
    with a key that d[0] lacks raises ValueError (from csv.DictWriter).
    """
    import sys  # local import: only needed to choose the file mode below

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        f = open(path, 'w', newline='')
    else:
        f = open(path, 'wb')
    with f:
        if not d:
            return
        dt = csv.DictWriter(f, list(d[0].keys()))
        dt.writeheader()
        dt.writerows(d)
def main(dates, path="./race.csv"):
    """Scrape every race of the given days and save all horses to one CSV.

    dates: iterable of date slugs, each passed to get_races
    path: filepath of the CSV to write; defaults to "./race.csv"
    """
    table = []
    for date in dates:
        for race in get_races(date):
            # one row per horse, with all races sharing a single flat table
            table.extend(get_results(race))
    dic_to_csv(table, path)
# Only scrape when run as a script, so importing the module has no side effects.
if __name__ == "__main__":
    # To scrape a range of days instead of a single one:
    # daterange = get_dates_interval("2011-08-03","2011-08-03","%Y-%m-%d")
    # main(daterange)
    main(["03-oct-2011"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment