Skip to content

Instantly share code, notes, and snippets.

@unicore32
Created December 30, 2016 17:23
Show Gist options
  • Save unicore32/ac9bd9638e07ab34f3f7ddf07584a15b to your computer and use it in GitHub Desktop.
Save unicore32/ac9bd9638e07ab34f3f7ddf07584a15b to your computer and use it in GitHub Desktop.
#!/usr/bin/python
import re
import xlwt
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import pandas as pd
import japandas as jpd
# Entry points on netkeiba.com used by the browser class below.
LOGIN_URL = "https://account.netkeiba.com/?pid=login"
SEARCH_RACE_URL = "http://db.netkeiba.com/?pid=race_search_detail"
# URL templates, each with a single %s placeholder.
# NOTE(review): none of the five templates below is referenced by the code
# visible in this file -- presumably kept for later use; confirm before removing.
RACE_RESULT_URL = "http://race.netkeiba.com/?pid=race&id=c%s&mode=result"
RACE_OIKIRI_URL = "http://race.netkeiba.com/?pid=race&id=c%s&mode=oikiri"
RACE_SHUTUBA_URL = "http://race.netkeiba.com/?pid=shutuba_detail&id=c%s"
HORSE_RESULT_URL = "http://db.netkeiba.com/horse/result/%s/"
JOCKEY_RESULT_URL = "http://db.netkeiba.com/jockey/%s/"
# Output workbook: extractRace adds one sheet per race to `book`, and
# extractRaces saves it to `xls_file` once all pages have been processed.
xls_file = "horse.xls"
book = xlwt.Workbook()
# Credentials typed into the login form by browser.__init__; blank here and
# expected to be filled in by whoever runs the script.
ID = ""
PASSWORD = ""
class browser(object):
    """Small convenience wrapper around a Selenium Chrome driver.

    Constructing an instance loads LOGIN_URL and submits ID / PASSWORD.
    The remaining methods drive the race search form, expose the current
    page as parsed HTML and forward a few navigation primitives.
    """

    # NOTE(review): the driver is created while the class body executes
    # (i.e. when the module is loaded) and is shared by every instance.
    # Kept as-is to preserve the module's existing behaviour.
    __driver = webdriver.Chrome()

    def __init__(self):
        """Open the login page and submit the module-level credentials."""
        self.__driver.get(LOGIN_URL)
        self.__driver.find_element_by_name("login_id").send_keys(ID)
        password_box = self.__driver.find_element_by_name("pswd")
        password_box.send_keys(PASSWORD)
        # Pressing RETURN in the password field submits the login form.
        self.__driver.find_element_by_name("pswd").send_keys(Keys.RETURN)

    def open(self, url):
        """Navigate the driver to *url*."""
        self.__driver.get(url)

    def searchRace(self, name, start_year=2008, start_month=0, end_year=0, end_month=0):
        """Fill in and submit the detailed race search form.

        Args:
            name: text typed into the form's "word" field.
            start_year: first year to select; a falsy value leaves the
                form's default untouched.
            start_month: first month (1-12); 0 or values above 12 are ignored.
            end_year: last year to select; a falsy value leaves the default.
            end_month: last month (1-12); 0 or values above 12 are ignored.
        """
        driver = self.__driver
        driver.get(SEARCH_RACE_URL)
        if start_year:
            Select(driver.find_element_by_name("start_year")).select_by_visible_text(str(start_year))
        if end_year:
            Select(driver.find_element_by_name("end_year")).select_by_visible_text(str(end_year))
        driver.find_element_by_id("db_search_detail_form").find_element_by_name("word").send_keys(name)
        if start_month and start_month < 13:
            Select(driver.find_element_by_name("start_mon")).select_by_visible_text(str(start_month))
        # The original tested the undefined name `end_mon` here, which raised
        # NameError for every non-zero end_month.
        if end_month and end_month < 13:
            Select(driver.find_element_by_name("end_mon")).select_by_visible_text(str(end_month))
        # Click the ten checkboxes check_Jyo_01 .. check_Jyo_10.
        for num in range(1, 11):
            driver.find_element_by_id("check_Jyo_" + "{0:02d}".format(num)).click()
        # RETURN inside a form control submits the search.
        driver.find_element_by_name("end_year").send_keys(Keys.RETURN)
        driver.find_element_by_name("sortimage").click()

    def readValue(self, tag, class_name=""):
        """Return all *tag* elements of the current page.

        Args:
            tag: HTML tag name to look for.
            class_name: when non-empty, only elements with this CSS class
                are returned.

        Returns:
            The BeautifulSoup ``find_all`` result for the current page source.
        """
        soup = BeautifulSoup(self.__driver.page_source, 'html.parser')
        if class_name == "":
            return soup.find_all(tag)
        return soup.find_all(tag, class_=class_name)

    def click(self, text):
        """Click the link whose visible text is exactly *text*."""
        self.__driver.find_element_by_link_text(text).click()

    def back(self):
        """Go one step back in the browser history."""
        self.__driver.back()

    def newWindow(self):
        """Open a new empty window via JavaScript."""
        self.__driver.execute_script("window.open('');")

    def screenshot(self, name):
        """Save a screenshot of the current page to the file *name*."""
        self.__driver.save_screenshot(name)

    def quit(self):
        """Shut the driver down."""
        self.__driver.quit()
def tableParse(no_del=0):
    """Return the rows of the first ``race_table_01`` table, minus the first row.

    The page is read through the module-level ``browser`` object.

    Args:
        no_del: accepted for compatibility; the value is not used.

    Returns:
        A list of the table's ``tr`` elements with the leading row removed.

    Raises:
        IndexError: if the page has no such table or the table has no rows.
    """
    race_table = browser.readValue("table", "race_table_01")[0]
    data_rows = list(race_table.find_all("tr"))
    # Drop the leading row; like the original, this fails on an empty table.
    data_rows.pop(0)
    return data_rows
def extractHorses(sheet):
    """Copy the result table of the currently open race page into *sheet*.

    Row 0 receives the fixed column titles; each following row receives one
    table row obtained from ``tableParse``.  Cells whose text looks like a
    plain non-negative number are written as ``float``; empty cells are
    skipped.

    Args:
        sheet: worksheet object providing ``write(row, col, value)``.
    """
    headers = ["着順", "枠番", "馬番", "馬名", "性別", "年齢", "斤量", "騎手", "タイム", "着差", "タイム指数", "通過順", "上り3F", "単勝オッズ", "人気", "馬体重", "備考", "調教師", "馬主"]
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    for row_no, row in enumerate(tableParse(1), start=1):
        cells = row.find_all("td")
        # Cell 4 carries sex and age fused together (header columns 4 and 5):
        # the digits become a separate age entry, the remainder stays as sex.
        sex_age = str(cells[4])
        cells.insert(5, re.search(r"\d{1,2}", sex_age).group(0))
        cells[4] = re.sub(r"\d{1,2}", "", sex_age)
        # Discard the cells that have no counterpart in the header list.
        del cells[16:18]
        cells.pop()

        for col, cell in enumerate(cells):
            if col == 5:
                # The age was inserted above as a plain string, not markup.
                text = str(cell)
            else:
                # Name the parser explicitly (same one readValue uses) so the
                # result does not depend on which parsers are installed.
                fragment = BeautifulSoup(str(cell), "html.parser")
                link = fragment.find("a")
                if fragment.td is not None and fragment.td.string:
                    text = str(fragment.td.string)
                elif link is not None:
                    # get_text() avoids writing the literal "None" when the
                    # link wraps nested markup and .string is therefore None.
                    text = link.get_text()
                else:
                    text = None
            if not text:
                continue
            value = text.strip()
            # At most one dot, otherwise float() would raise on e.g. "1.2.3".
            if value.count(".") <= 1 and value.replace(".", "").isdigit():
                value = float(value)
            sheet.write(row_no, col, value)
# Label -> numeric code tables used by extractRace.
_WEATHER_CODES = {u"晴": 1, u"曇": 2, u"雨": 3, u"小雨": 4, u"雪": 5, u"小雪": 6}
_GOING_CODES = {u"良": 1, u"稍重": 2, u"重": 3, u"不良": 4}
# Racecourse names; the code is the 1-based position in this tuple.
_VENUE_NAMES = (u"札幌", u"函館", u"福島", u"新潟", u"東京", u"中山", u"中京", u"京都", u"阪神", u"小倉")


def _labelled_code(prefix, codes, text):
    """Return the code of the label that directly follows *prefix* in *text*.

    Returns None when *prefix* followed by one of the labels is not found.
    """
    # A real alternation is required: a character class such as [小雨|雨]
    # only ever matches a single character, so multi-character labels
    # could never be recognised.
    found = re.search(prefix + u"(" + u"|".join(codes) + u")", text)
    return codes[found.group(1)] if found else None


def extractRace(race):
    """Read the conditions of the currently open race page and export its results.

    The page is read through the module-level ``browser`` object.  Weather,
    turf/dirt going, date, meeting number, meeting day and venue are parsed
    and printed; then a sheet named after the first four-digit number found
    in the summary text is added to ``book`` and filled by ``extractHorses``.

    Args:
        race: race path of the form ``/race/<digits>/``.

    Raises:
        ValueError: if *race* does not reduce to an integer id.
        AttributeError: if the date, meeting number or meeting day cannot be
            found in the page.
    """
    # Race id: not used further yet; parsing doubles as a sanity check.
    race_id = int(race.replace("/race/", "").replace("/", ""))

    span_text = str(browser.readValue("dl", "racedata fc")[0].find_all("span"))
    summary_text = str(browser.readValue("p", "smalltxt")[0])

    # Weather and going; each is None when the page does not state it.
    race_weather = _labelled_code(u"天候 : ", _WEATHER_CODES, span_text)
    race_going_turf = _labelled_code(u"芝 : ", _GOING_CODES, span_text)
    race_going_dirt = _labelled_code(u"ダート : ", _GOING_CODES, span_text)

    # Date
    date_text = re.search(r"\d{4}年\d{1,2}月\d{1,2}日", summary_text).group(0)
    date = jpd.to_datetime(date_text, format=u'%Y年%m月%d日')
    # Meeting number
    race_num = int(re.search(r"(\d{1,2})回", summary_text).group(1))
    # Day within the meeting
    race_day = int(re.search(r"(\d{1,2})日目", summary_text).group(1))
    # Venue: first name (in table order) occurring in the summary, else None
    # so that the print below cannot hit an unbound variable.
    race_venue = next(
        (code for code, venue in enumerate(_VENUE_NAMES, start=1) if venue in summary_text),
        None,
    )

    print("天候:", race_weather)
    print("芝状態:", race_going_turf, "ダート状態", race_going_dirt)
    print("日付:", date, "回数:", race_num, "競馬場", race_venue, "開催日", race_day)

    sheet = book.add_sheet(re.search(r"\d{4}", summary_text).group(0))
    extractHorses(sheet)
def extractRaces(browser, name, start_year, end_year=0):
    """Search for races called *name* and export every result page found.

    Every search result page is scanned for links of the form
    ``/race/<12 digits>/``; each one is opened, handed to ``extractRace``
    and left again via the browser's back button.  Paging continues while
    the pager offers a "次" link.  The workbook ``book`` is saved to
    ``xls_file`` at the end.

    Args:
        browser: object providing ``searchRace``, ``open``, ``back``,
            ``readValue`` and ``click``.
        name: race name to search for.
        start_year: first year of the search range.
        end_year: last year of the search range; 0 leaves it open.

    NOTE(review): ``tableParse`` and ``extractRace`` read the module-level
    ``browser`` rather than this parameter, so both must refer to the same
    object.
    """
    # Forward end_year; the original always passed 0 and silently ignored it.
    browser.searchRace(name, start_year, start_month=0, end_year=end_year, end_month=0)
    race_link = re.compile(r"/race/\d{12}/")
    while True:
        # Re-read the table on every pass: parsing it once before the loop
        # made every later page re-process the rows of the first page.
        for row in tableParse():
            for anchor in row.find_all("a"):
                href = anchor.get("href")
                # Anchors without href would make the regex raise TypeError.
                if href and race_link.match(href):
                    # href already starts with "/", so no trailing slash here.
                    browser.open("http://db.netkeiba.com" + href)
                    extractRace(href)
                    browser.back()
        pager_str = str(browser.readValue("div", "pager"))
        if "次</a>" not in pager_str:
            break
        browser.click("次")
    book.save(xls_file)
if __name__ == "__main__":
    # This deliberately rebinds the module-level name `browser` from the class
    # to an instance: tableParse and extractRace read that global.
    browser = browser()
    try:
        extractRaces(browser, "中山金杯", 2010)
    finally:
        # Always shut the driver down, even when scraping fails half-way.
        browser.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment