Skip to content

Instantly share code, notes, and snippets.

@dfeng
Created September 26, 2012 02:25
Show Gist options
  • Save dfeng/3785641 to your computer and use it in GitHub Desktop.
Save dfeng/3785641 to your computer and use it in GitHub Desktop.
Horse Scrape
import lxml.html
from lxml import etree
import requests
import csv
import os
import datetime
# Root URL of the racing section; get_races() appends a date slug
# (e.g. "03-oct-2011") to it to build the page URL for one day.
base_url = 'https://gg.com/racing/'
def get_dates_interval(start, finish, format):
    """Yield every day from ``start`` to ``finish`` (inclusive) as a date slug.

    Args:
        start: First day of the range, as a string matching ``format``.
        finish: Last day of the range (inclusive), as a string matching
            ``format``.
        format: ``strptime`` pattern used to parse ``start`` and ``finish``.
            (The name shadows the builtin but is kept for existing callers.)

    Yields:
        One string per day, formatted with ``%d-%b-%Y`` and lower-cased,
        e.g. ``"03-aug-2011"``. Nothing is yielded when ``start`` is later
        than ``finish``.

    Raises:
        ValueError: When iteration starts, if either date string does not
            match ``format``.
    """
    current = datetime.datetime.strptime(start, format)
    last = datetime.datetime.strptime(finish, format)
    one_day = datetime.timedelta(days=1)
    # Test the bound *before* yielding so an inverted range produces no dates
    # instead of leaking the start date.
    while current <= last:
        # %b is the locale's abbreviated month name; lower-casing matches the
        # slug form used by the hard-coded date in this script.
        yield current.strftime("%d-%b-%Y").lower()
        current += one_day
def get_races(date):
    """Return the race links found on the page for one day.

    ``date`` is appended to ``base_url`` to form the page URL. The return
    value is a list of the ``href`` attribute of every matching anchor, in
    document order.
    """
    response = requests.get(base_url + date)
    tree = etree.HTML(response.text)
    # A race that was never run can still have a link on the page
    # (see: https://gg.com/racing/21-may-2010). Only anchors in the cell
    # right after one holding a span of class "result" are collected, which
    # is how those abandoned races are skipped.
    result_links = tree.xpath(
        "//span[@class='result']/../following-sibling::td[1]/a"
    )
    return [link.get("href") for link in result_links]
def _markup_after_first_tag(element):
    """Return the serialized markup between the element's first and second ``>``."""
    markup = lxml.html.tostring(element)
    if not isinstance(markup, str):
        # On Python 3 tostring() returns bytes, which cannot be split on a
        # text ">"; on Python 2 the result is already ``str`` and is untouched.
        markup = markup.decode("utf-8")
    return markup.split(">")[1]


def get_results(url):
    """Scrape one race result page into a list of per-horse dicts.

    Args:
        url: Address of the race page. Its last two path segments are read
            as ``<date>/<course>-<time>``.

    Returns:
        A list with one dict per horse, in table order. Every dict has the
        keys ``finish``, ``place``, ``no``, ``draw``, ``additional``,
        ``form``, ``horse``, ``horse_id``, ``jockey``, ``jockey_id``,
        ``trainer``, ``trainer_id``, ``owner``, ``explain``, ``odds``,
        ``date``, ``course``, ``time``, ``condition`` and ``nonrunners``.

    The table cells of ``#race-card`` are consumed four at a time, one group
    per horse; a trailing incomplete group is dropped.
    """
    # Progress trace. The single-argument call form prints the same text under
    # Python 2's print statement and Python 3's print function.
    print(url)
    response = requests.get(url)
    dom = lxml.html.fromstring(response.text)
    race = []

    date, coursetime = url.split("/")[-2:]
    course, race_time = coursetime.rsplit("-", 1)

    # Racecourse condition ("going"); stays empty when the page has none.
    condition = ""
    for node in dom.cssselect("h1.winning-post .going"):
        condition = node.text

    # List of nonrunners from the footnote. For now only the count is kept.
    nonrunners = ""
    for node in dom.cssselect(".footnote"):
        # A footnote with no leading text has ``text`` of None; skip it
        # rather than crash on ``None.split``.
        if node.text is not None:
            nonrunners = node.text.split(", ")
    num_nonrunners = len(nonrunners)

    for index, cell in enumerate(dom.cssselect("#race-card tr td")):
        # Floor division keeps ``finish`` an int on Python 3 as well.
        position, column = divmod(index, 4)

        if column == 0:
            # First cell: finish, place, number and draw.
            horse = dict()
            # ``finish`` is computed from the row position because a horse
            # may lack an ordinal such as "1st" (for example after an
            # injury), so ``place`` alone is not enough.
            horse['finish'] = position + 1
            # The ordinal as shown on the page.
            horse['place'] = cell.text
            horse['no'], horse['draw'] = (
                cell[1].text.replace("(", "").replace(")", "").split(" ")
            )
        elif column == 1:
            # Second cell: form, plus an optional extra note.
            horse['additional'] = ""
            horse['form'] = _markup_after_first_tag(cell[1])
            # ``additional`` is the interesting information that is sometimes
            # shown, like if this horse won the previous race.
            for child in cell:
                if child.tag == "a":
                    horse['additional'] = child.text
        elif column == 2:
            # Third cell: horse, jockey, trainer and owner at even child
            # positions; ids are the last "-"-separated part of each href.
            horse['horse'] = cell[0].text
            horse['horse_id'] = cell[0].get("href").split("-")[-1]
            horse['jockey'] = cell[2].text
            horse['jockey_id'] = cell[2].get("href").split("-")[-1]
            horse['trainer'] = cell[4].text
            horse['trainer_id'] = cell[4].get("href").split("-")[-1]
            horse['owner'] = cell[6].text
        elif column == 3:
            # Fourth cell: odds and/or an explanation.
            child_count = len(cell)
            horse['explain'] = ""
            # Default so every row carries the same keys even when the cell
            # has an unexpected number of children.
            horse['odds'] = ""
            if child_count == 1:
                # No odds, so it is a withdrawal.
                horse['explain'] = _markup_after_first_tag(cell[0])
            elif child_count == 2:
                # Odds are definitely present.
                horse['odds'] = cell[0].text
            # When there is no place (always given with a dash), there is
            # always (?) an explanation. Require a second child so an empty
            # cell cannot raise IndexError.
            if horse['place'] == "-" and child_count > 1:
                horse['explain'] = _markup_after_first_tag(cell[1])
            # Race-level information repeated on every row: the output is a
            # single flat file, so the redundancy can't be helped.
            horse['date'] = date
            horse['course'] = course
            horse['time'] = race_time
            horse['condition'] = condition
            horse['nonrunners'] = num_nonrunners
            race.append(horse)
    return race
def dic_to_csv(d, path):
    """Write a list of row dicts to ``path`` as CSV with a header row.

    Args:
        d: A list of dicts, one per output row.
        path: File path of the CSV file to create (overwritten if present).

    The columns are the union of the keys of all rows, in first-seen order,
    so rows with identical keys keep the first row's key order. A key missing
    from a row is written as an empty field. An empty ``d`` produces an empty
    file rather than raising ``IndexError``.
    """
    import sys  # local import: only needed to choose the file mode below

    fieldnames = []
    seen = set()
    for row in d:
        for key in row:
            if key not in seen:
                seen.add(key)
                fieldnames.append(key)

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline="" on Python 3.
    if sys.version_info[0] >= 3:
        handle = open(path, "w", newline="")
    else:
        handle = open(path, "wb")

    with handle as f:
        writer = csv.DictWriter(f, fieldnames)
        if d:
            # Public API for the header row, instead of reaching into the
            # DictWriter's underlying writer object.
            writer.writeheader()
        writer.writerows(d)
def main(dates):
    """Scrape every race for each date in ``dates`` into ``./race.csv``.

    Each date is passed to get_races(); each race URL it returns is passed to
    get_results(); all resulting rows are written out as one flat table.
    """
    rows = [
        result
        for day in dates
        for race_url in get_races(day)
        for result in get_results(race_url)
    ]
    dic_to_csv(rows, "./race.csv")
# Run the scrape only when executed as a script, so importing this module
# does not trigger network requests or write ./race.csv.
if __name__ == "__main__":
    # To scrape a range of days instead of a single one:
    #     daterange = get_dates_interval("2011-08-03", "2011-08-03", "%Y-%m-%d")
    #     main(daterange)
    main(["03-oct-2011"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment