dfeng/horsescrape.py

## horsescrape.py
import lxml.html
from lxml import etree
import requests
import csv
import os
import datetime

# Root URL of the racing pages; get_races() appends a date string to it.
base_url = 'https://gg.com/racing/'

def get_dates_interval(start,finish,format):
   """Yield every date from ``start`` to ``finish`` inclusive, one per day.

   Args:
      start: first date of the interval, as a string laid out per ``format``.
      finish: last date of the interval, as a string laid out per ``format``.
      format: ``strptime`` pattern used to parse both ``start`` and ``finish``.

   Yields:
      Each date rendered with ``"%d-%b-%Y"`` and lower-cased
      (e.g. ``"03-aug-2011"``).  Nothing is yielded when ``start`` is later
      than ``finish``.

   Raises:
      ValueError: on first iteration, if either string does not match
         ``format``.
   """
   current = datetime.datetime.strptime(start, format)
   last = datetime.datetime.strptime(finish, format)
   one_day = datetime.timedelta(days=1)
   # Test the bound before yielding so a reversed interval produces no dates
   # instead of leaking its start date.
   while current <= last:
      yield current.strftime("%d-%b-%Y").lower()
      current += one_day

def get_races(date):
   """Collect the result links listed on the racing page for ``date``.

   ``date`` is appended to ``base_url`` to form the page address.  Returns a
   list holding the ``href`` of each matching link, in the order the XPath
   query returns them.
   """
   response = requests.get(base_url + date)
   tree = etree.HTML(response.text)
   # A race can be listed without having been run (for example
   # https://gg.com/racing/21-may-2010).  Only rows carrying a
   # <span class="result"> marker are followed, which skips those.
   result_links = tree.xpath("//span[@class='result']/../following-sibling::td[1]/a")
   return [link.get("href") for link in result_links]

def _text_after_first_tag(element):
   """Return the text between the first and second ``>`` of ``element``'s markup.

   The element is serialised with ``lxml.html.tostring``.
   NOTE(review): presumably this reads the text trailing a tag such as
   ``<br>`` -- confirm against the page markup.
   """
   markup = lxml.html.tostring(element)
   # tostring() returns an ASCII byte string by default; on Python 3 that is
   # ``bytes`` and has to be decoded before it can be split on a text separator.
   # On Python 2 the value already is ``str`` and is left untouched.
   if not isinstance(markup, str):
      markup = markup.decode("ascii")
   return markup.split(">")[1]


def get_results(url):
   """Scrape one race result page into a list of per-horse dictionaries.

   Args:
      url: address of a race result page.  Its last two path segments are
         read as ``<date>/<course>-<time>``.

   Returns:
      A list with one dict per horse, in the order the table cells are
      found.  Each dict holds the keys ``finish``, ``place``, ``no``,
      ``draw``, ``additional``, ``form``, ``horse``, ``horse_id``,
      ``jockey``, ``jockey_id``, ``trainer``, ``trainer_id``, ``owner``,
      ``odds``, ``explain``, ``date``, ``course``, ``time``, ``condition``
      and ``nonrunners``.
   """
   # Parenthesised so the line is valid on Python 2 and Python 3 alike.
   print(url)
   response = requests.get(url)
   dom = lxml.html.fromstring(response.text)
   race = []
   date, coursetime = url.split("/")[-2:]
   course, time = coursetime.rsplit("-",1)
   # racecourse condition; stays empty when the page has no such element
   condition = ""
   for node in dom.cssselect("h1.winning-post .going"):
      condition = node.text
   # for now only the number of nonrunners is kept, not their names
   num_nonrunners = 0
   for node in dom.cssselect(".footnote"):
      num_nonrunners = len(node.text.split(", "))
   # the results table is walked cell by cell, which gives 4 cells per horse
   for i, cell in enumerate(dom.cssselect("#race-card tr td")):
      # divmod floors the quotient, so the position stays an int on Python 3
      pos, mod = divmod(i, 4)
      # first cell contains finish, place, ...
      if mod == 0:
         horse = dict()
         # finish is computed from the row position: a horse may have no
         # ordinal such as "1st" (for example after an injury), so the
         # ordinal alone is not enough
         horse['finish'] = pos + 1
         # the ordinal shown on the page is kept as well
         horse['place'] = cell.text
         horse['no'], horse['draw'] = cell[1].text.replace("(","").replace(")","").split(" ")
      # second cell
      elif mod == 1:
         horse['additional'] = ""
         horse['form'] = _text_after_first_tag(cell[1])
         # additional is the extra information that is sometimes shown,
         # like whether this horse won its previous race
         for child in cell:
            if child.tag == "a":
               horse['additional'] = child.text
      elif mod == 2:
         horse['horse'] = cell[0].text
         horse['horse_id'] = cell[0].get("href").split("-")[-1]
         horse['jockey'] = cell[2].text
         horse['jockey_id'] = cell[2].get("href").split("-")[-1]
         horse['trainer'] = cell[4].text
         horse['trainer_id'] = cell[4].get("href").split("-")[-1]
         horse['owner'] = cell[6].text
      elif mod == 3:
         # Default both fields so every horse dict carries the same keys,
         # even when the cell has neither one nor two children.
         horse['odds'] = ""
         horse['explain'] = ""
         children = len(cell)
         if children == 1:
            # no odds are shown, so it's a withdrawal
            horse['explain'] = _text_after_first_tag(cell[0])
         elif children == 2:
            # situation where you definitely have odds
            horse['odds'] = cell[0].text
         # when there is no place (always given with a dash), there is
         # always (?) an explanation after the second child; a cell with
         # fewer than two children has none to read
         if horse['place'] == "-" and children > 1:
            horse['explain'] = _text_after_first_tag(cell[1])
         # race-wide information is repeated on every row because the
         # output is a single flat file
         horse['date'] = date
         horse['course'] = course
         horse['time'] = time
         horse['condition'] = condition
         horse['nonrunners'] = num_nonrunners
         race.append(horse)
   return race

def dic_to_csv(d,path):
   """Write a list of row dictionaries to ``path`` as CSV with a header row.

   Args:
      d: list of dictionaries, one per output row.  The columns are the keys
         of the first row followed by any key that first appears in a later
         row; a row lacking a column gets an empty field there.
      path: filepath of the csv file to be created (overwritten if present).

   An empty ``d`` produces an empty file, since no column names are known.
   """
   import sys  # local import: the file's import block does not provide sys

   # Collect column names in first-seen order so rows with differing keys
   # can share one header instead of making DictWriter raise ValueError.
   fieldnames = []
   seen = set()
   for row in d:
      for key in row:
         if key not in seen:
            seen.add(key)
            fieldnames.append(key)

   # The csv module wants a binary file on Python 2 but a text file opened
   # with newline='' on Python 3.
   if sys.version_info[0] >= 3:
      out = open(path, 'w', newline='')
   else:
      out = open(path, 'wb')
   with out as f:
      if d:
         dt = csv.DictWriter(f, fieldnames)
         dt.writeheader()
         dt.writerows(d)

def main(dates):
   """Scrape every race of every day in ``dates`` and save them to ./race.csv.

   ``dates`` is an iterable of date strings accepted by ``get_races``.  The
   rows of all races are gathered into one list and written in a single call
   to ``dic_to_csv``.
   """
   rows = []
   for day in dates:
      race_urls = get_races(day)
      for race_url in race_urls:
         rows += get_results(race_url)
   dic_to_csv(rows, "./race.csv")

# Alternative invocation, kept for reference: scrape every day of a date range.
# daterange = get_dates_interval("2011-08-03","2011-08-03","%Y-%m-%d")
# main(daterange)
# Executed whenever this file is run or imported: scrape the single day 03-oct-2011.
main(["03-oct-2011"])
	import lxml.html
	from lxml import etree
	import requests
	import csv
	import os
	import datetime

	# Root URL of the racing pages; get_races() appends a date string to it.
	base_url = 'https://gg.com/racing/'

	def get_dates_interval(start,finish,format):
		"""Yield every date from ``start`` to ``finish`` inclusive, one per day.

		Args:
			start: first date of the interval, as a string laid out per ``format``.
			finish: last date of the interval, as a string laid out per ``format``.
			format: ``strptime`` pattern used to parse both ``start`` and ``finish``.

		Yields:
			Each date rendered with ``"%d-%b-%Y"`` and lower-cased
			(e.g. ``"03-aug-2011"``).  Nothing is yielded when ``start`` is
			later than ``finish``.

		Raises:
			ValueError: on first iteration, if either string does not match
				``format``.
		"""
		current = datetime.datetime.strptime(start, format)
		last = datetime.datetime.strptime(finish, format)
		one_day = datetime.timedelta(days=1)
		# Test the bound before yielding so a reversed interval produces no
		# dates instead of leaking its start date.
		while current <= last:
			yield current.strftime("%d-%b-%Y").lower()
			current += one_day

	def get_races(date):
		"""Collect the result links listed on the racing page for ``date``.

		``date`` is appended to ``base_url`` to form the page address.  Returns
		a list holding the ``href`` of each matching link, in the order the
		XPath query returns them.
		"""
		response = requests.get(base_url + date)
		tree = etree.HTML(response.text)
		# A race can be listed without having been run (for example
		# https://gg.com/racing/21-may-2010).  Only rows carrying a
		# <span class="result"> marker are followed, which skips those.
		result_links = tree.xpath("//span[@class='result']/../following-sibling::td[1]/a")
		return [link.get("href") for link in result_links]

	def _text_after_first_tag(element):
		"""Return the text between the first and second ``>`` of ``element``'s markup.

		The element is serialised with ``lxml.html.tostring``.
		NOTE(review): presumably this reads the text trailing a tag such as
		``<br>`` -- confirm against the page markup.
		"""
		markup = lxml.html.tostring(element)
		# tostring() returns an ASCII byte string by default; on Python 3 that
		# is ``bytes`` and has to be decoded before it can be split on a text
		# separator.  On Python 2 the value already is ``str``.
		if not isinstance(markup, str):
			markup = markup.decode("ascii")
		return markup.split(">")[1]


	def get_results(url):
		"""Scrape one race result page into a list of per-horse dictionaries.

		Args:
			url: address of a race result page.  Its last two path segments
				are read as ``<date>/<course>-<time>``.

		Returns:
			A list with one dict per horse, in the order the table cells are
			found.  Each dict holds the keys ``finish``, ``place``, ``no``,
			``draw``, ``additional``, ``form``, ``horse``, ``horse_id``,
			``jockey``, ``jockey_id``, ``trainer``, ``trainer_id``, ``owner``,
			``odds``, ``explain``, ``date``, ``course``, ``time``,
			``condition`` and ``nonrunners``.
		"""
		# Parenthesised so the line is valid on Python 2 and Python 3 alike.
		print(url)
		response = requests.get(url)
		dom = lxml.html.fromstring(response.text)
		race = []
		date, coursetime = url.split("/")[-2:]
		course, time = coursetime.rsplit("-",1)
		# racecourse condition; stays empty when the page has no such element
		condition = ""
		for node in dom.cssselect("h1.winning-post .going"):
			condition = node.text
		# for now only the number of nonrunners is kept, not their names
		num_nonrunners = 0
		for node in dom.cssselect(".footnote"):
			num_nonrunners = len(node.text.split(", "))
		# the results table is walked cell by cell, which gives 4 cells per horse
		for i, cell in enumerate(dom.cssselect("#race-card tr td")):
			# divmod floors the quotient, so the position stays an int on Python 3
			pos, mod = divmod(i, 4)
			# first cell contains finish, place, ...
			if mod == 0:
				horse = dict()
				# finish is computed from the row position: a horse may have
				# no ordinal such as "1st" (for example after an injury), so
				# the ordinal alone is not enough
				horse['finish'] = pos + 1
				# the ordinal shown on the page is kept as well
				horse['place'] = cell.text
				horse['no'], horse['draw'] = cell[1].text.replace("(","").replace(")","").split(" ")
			# second cell
			elif mod == 1:
				horse['additional'] = ""
				horse['form'] = _text_after_first_tag(cell[1])
				# additional is the extra information that is sometimes shown,
				# like whether this horse won its previous race
				for child in cell:
					if child.tag == "a":
						horse['additional'] = child.text
			elif mod == 2:
				horse['horse'] = cell[0].text
				horse['horse_id'] = cell[0].get("href").split("-")[-1]
				horse['jockey'] = cell[2].text
				horse['jockey_id'] = cell[2].get("href").split("-")[-1]
				horse['trainer'] = cell[4].text
				horse['trainer_id'] = cell[4].get("href").split("-")[-1]
				horse['owner'] = cell[6].text
			elif mod == 3:
				# Default both fields so every horse dict carries the same
				# keys, even when the cell has neither one nor two children.
				horse['odds'] = ""
				horse['explain'] = ""
				children = len(cell)
				if children == 1:
					# no odds are shown, so it's a withdrawal
					horse['explain'] = _text_after_first_tag(cell[0])
				elif children == 2:
					# situation where you definitely have odds
					horse['odds'] = cell[0].text
				# when there is no place (always given with a dash), there is
				# always (?) an explanation after the second child; a cell
				# with fewer than two children has none to read
				if horse['place'] == "-" and children > 1:
					horse['explain'] = _text_after_first_tag(cell[1])
				# race-wide information is repeated on every row because the
				# output is a single flat file
				horse['date'] = date
				horse['course'] = course
				horse['time'] = time
				horse['condition'] = condition
				horse['nonrunners'] = num_nonrunners
				race.append(horse)
		return race

	def dic_to_csv(d,path):
		"""Write a list of row dictionaries to ``path`` as CSV with a header row.

		Args:
			d: list of dictionaries, one per output row.  The columns are the
				keys of the first row followed by any key that first appears
				in a later row; a row lacking a column gets an empty field.
			path: filepath of the csv file to be created (overwritten if present).

		An empty ``d`` produces an empty file, since no column names are known.
		"""
		import sys  # local import: the file's import block does not provide sys

		# Collect column names in first-seen order so rows with differing keys
		# can share one header instead of making DictWriter raise ValueError.
		fieldnames = []
		seen = set()
		for row in d:
			for key in row:
				if key not in seen:
					seen.add(key)
					fieldnames.append(key)

		# The csv module wants a binary file on Python 2 but a text file
		# opened with newline='' on Python 3.
		if sys.version_info[0] >= 3:
			out = open(path, 'w', newline='')
		else:
			out = open(path, 'wb')
		with out as f:
			if d:
				dt = csv.DictWriter(f, fieldnames)
				dt.writeheader()
				dt.writerows(d)

	def main(dates):
		"""Scrape every race of every day in ``dates`` and save them to ./race.csv.

		``dates`` is an iterable of date strings accepted by ``get_races``.
		The rows of all races are gathered into one list and written in a
		single call to ``dic_to_csv``.
		"""
		rows = []
		for day in dates:
			race_urls = get_races(day)
			for race_url in race_urls:
				rows += get_results(race_url)
		dic_to_csv(rows, "./race.csv")

	# Alternative invocation, kept for reference: scrape every day of a date range.
	# daterange = get_dates_interval("2011-08-03","2011-08-03","%Y-%m-%d")
	# main(daterange)
	# Executed whenever this file is run or imported: scrape the single day 03-oct-2011.
	main(["03-oct-2011"])