Created
December 30, 2016 17:23
-
-
Save unicore32/ac9bd9638e07ab34f3f7ddf07584a15b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import re | |
import xlwt | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.support.select import Select | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import japandas as jpd | |
# netkeiba.com endpoints; the %s placeholders take a race / horse / jockey id.
LOGIN_URL = "https://account.netkeiba.com/?pid=login"
SEARCH_RACE_URL = "http://db.netkeiba.com/?pid=race_search_detail"
RACE_RESULT_URL = "http://race.netkeiba.com/?pid=race&id=c%s&mode=result"
RACE_OIKIRI_URL = "http://race.netkeiba.com/?pid=race&id=c%s&mode=oikiri"
RACE_SHUTUBA_URL = "http://race.netkeiba.com/?pid=shutuba_detail&id=c%s"
HORSE_RESULT_URL = "http://db.netkeiba.com/horse/result/%s/"
JOCKEY_RESULT_URL = "http://db.netkeiba.com/jockey/%s/"
# Output workbook: one sheet is added per scraped race; saved in extractRaces().
xls_file = "horse.xls"
book = xlwt.Workbook()
# netkeiba account credentials — fill in before running.
ID = ""
PASSWORD = ""
class browser(object):
    """Selenium Chrome wrapper that logs in to netkeiba on construction.

    Fixes over the original:
    - The driver is created per instance inside ``__init__``; the old
      class-level ``__driver = webdriver.Chrome()`` launched Chrome as a
      side effect of merely defining the class and shared one driver
      across all instances.
    - ``searchRace`` tested the undefined name ``end_mon``, raising
      NameError whenever a non-zero ``end_month`` was passed.
    """

    def __init__(self):
        self.__driver = webdriver.Chrome()
        self.__driver.get(LOGIN_URL)
        self.__driver.find_element_by_name("login_id").send_keys(ID)
        self.__driver.find_element_by_name("pswd").send_keys(PASSWORD)
        self.__driver.find_element_by_name("pswd").send_keys(Keys.RETURN)

    def open(self, url):
        """Navigate the browser to *url*."""
        self.__driver.get(url)

    def searchRace(self, name, start_year=2008, start_month=0, end_year=0, end_month=0):
        """Fill in and submit netkeiba's detailed race-search form.

        A zero year/month leaves that form field at its default; months
        are applied only when in the 1..12 range.
        """
        self.__driver.get(SEARCH_RACE_URL)
        if start_year:
            Select(self.__driver.find_element_by_name("start_year")).select_by_visible_text(str(start_year))
        if end_year:
            Select(self.__driver.find_element_by_name("end_year")).select_by_visible_text(str(end_year))
        self.__driver.find_element_by_id("db_search_detail_form").find_element_by_name("word").send_keys(name)
        if start_month:
            if start_month < 13:
                Select(self.__driver.find_element_by_name("start_mon")).select_by_visible_text(str(start_month))
        if end_month:
            # Bug fix: was "if end_mon < 13" — an undefined name.
            if end_month < 13:
                Select(self.__driver.find_element_by_name("end_mon")).select_by_visible_text(str(end_month))
        # Tick the course checkboxes check_Jyo_01 .. check_Jyo_10.
        for num in range(1, 11):
            str_name = "check_Jyo_" + "{0:02d}".format(num)
            self.__driver.find_element_by_id(str_name).click()
        # Submit the form, then sort the result list.
        self.__driver.find_element_by_name("end_year").send_keys(Keys.RETURN)
        self.__driver.find_element_by_name("sortimage").click()

    def readValue(self, tag, class_name=""):
        """Parse the current page and return all *tag* elements,
        optionally filtered by CSS class *class_name*."""
        html = self.__driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        if class_name == "":
            return soup.find_all(tag)
        return soup.find_all(tag, class_=class_name)

    def click(self, text):
        """Click the link whose visible text equals *text*."""
        self.__driver.find_element_by_link_text(text).click()

    def back(self):
        """Go back one page in the browser history."""
        self.__driver.back()

    def newWindow(self):
        """Open a new, empty browser window."""
        self.__driver.execute_script("window.open('');")

    def screenshot(self, name):
        """Save a screenshot of the current page to file *name*."""
        self.__driver.save_screenshot(name)

    def quit(self):
        """Shut down the driver and close the browser."""
        self.__driver.quit()
def tableParse(no_del=0):
    """Return the ``<tr>`` rows of the first "race_table_01" table on the
    page currently open in the module-level ``browser``, minus the header
    row.

    ``no_del`` is accepted for caller compatibility but is not used.
    """
    table = browser.readValue("table", "race_table_01")[0]
    all_rows = table.find_all("tr")
    # Slicing drops the header without mutating the parsed list in place.
    return all_rows[1:]
def extractHorses(sheet):
    """Write the horse-result table of the currently open race page into
    the xlwt *sheet*.

    Row 0 receives the column headers; each following row is one horse
    from the page's "race_table_01" table. Relies on the module-level
    ``browser`` (via ``tableParse``) pointing at a race-result page.
    """
    rows = tableParse(1)
    headers = ["着順", "枠番", "馬番", "馬名", "性別", "年齢", "斤量", "騎手",
               "タイム", "着差", "タイム指数", "通過順", "上り3F", "単勝オッズ",
               "人気", "馬体重", "備考", "調教師", "馬主"]
    for num, name in enumerate(headers):
        sheet.write(0, num, name)
    for num1, row in enumerate(rows):
        row_td = row.find_all("td")
        # Column 4 holds sex+age (e.g. "牡3"): insert the digits as a new
        # age column at index 5 and strip them from the sex column.
        # Raw strings fix the invalid-escape warnings of the old "\d".
        row_td.insert(5, re.search(r"\d{1,2}", str(row_td[4])).group(0))
        row_td[4] = re.sub(r"\d{1,2}", "", str(row_td[4]))
        # Drop trailing columns the sheet does not track.
        del row_td[16:18]
        row_td.pop()
        for num2, cell in enumerate(row_td):
            # Explicit parser: omitting it makes bs4 pick an
            # installation-dependent one (and warn); matches readValue.
            soup = BeautifulSoup(str(cell), "html.parser")
            if num2 == 5:
                # The inserted age is a plain string, not a <td> tag.
                val = str(cell)
            elif soup.td.string:
                val = str(soup.td.string)
            elif soup.find("a"):
                val = str(soup.a.string)
            else:
                val = None
            if val:
                return_val = val.strip()
                # Store numeric-looking cells as floats.
                if return_val.replace(".", "").isdigit():
                    return_val = float(return_val)
                sheet.write(num1 + 1, num2, return_val)
def _going_code(span_text, track):
    """Return the going code for *track* ("芝" or "ダート") found in
    *span_text*: 良=1, 稍重=2, 重=3, 不良=4, or None if absent.

    Alternatives are ordered longest-first because "稍重" contains "重"
    and "不良" contains "良"; a shorter-first match would misclassify.
    """
    match = re.search(track + u" : (不良|稍重|良|重)", span_text)
    if match is None:
        return None
    return {u"良": 1, u"稍重": 2, u"重": 3, u"不良": 4}[match.group(1)]


def extractRace(race):
    """Extract weather, going, date and venue metadata from the race page
    currently open in the global ``browser``, print them, and append a
    sheet of horse results to the global ``book``.

    *race* is the result page's href, e.g. "/race/201006010411/".
    """
    # レース ID — parsed but currently unused downstream.
    race_id = int(race.replace("/race/", "").replace("/", ""))
    text1 = browser.readValue("dl", "racedata fc")[0]
    text2 = browser.readValue("p", "smalltxt")[0]
    span_text = str(text1.find_all("span"))
    text2_str = str(text2)

    # 天候 (weather) -> 晴=1, 曇=2, 雨=3, 小雨=4, 雪=5, 小雪=6.
    # Bug fix: the original pattern used a character class
    # ("[晴|曇|雨|小雨|雪|小雪]") that can only match ONE character, and
    # tested "雨" before "小雨" (and "雪" before "小雪"), so codes 4 and 6
    # were unreachable. Use real alternation, longest tokens first.
    race_weather = None
    weather_match = re.search(u"天候 : (小雨|小雪|晴|曇|雨|雪)", span_text)
    if weather_match:
        race_weather = {u"晴": 1, u"曇": 2, u"雨": 3,
                        u"小雨": 4, u"雪": 5, u"小雪": 6}[weather_match.group(1)]

    # 馬場状態 (going). Bug fix: the original tested "str.find(...)"
    # without "!= -1" (find returns -1, which is truthy, when absent) and
    # checked "良"/"重" before "不良"/"稍重".
    race_going_turf = _going_code(span_text, u"芝")
    race_going_dirt = _going_code(span_text, u"ダート")

    # 日付 (date), e.g. "2010年1月5日", parsed via japandas.
    date_text = re.search(u"\\d{4}年\\d{1,2}月\\d{1,2}日", text2_str).group(0)
    date = jpd.to_datetime(date_text, format=u'%Y年%m月%d日')
    # 開催回数 (meeting number) and 開催日 (day within the meeting).
    race_num = int(re.search(u"\\d{1,2}回", text2_str).group(0).replace(u"回", ""))
    race_day = int(re.search(u"\\d{1,2}日目", text2_str).group(0).replace(u"日目", ""))

    # 競馬場 (venue) -> 札幌=1 .. 小倉=10. Robustness fix: default to
    # None — the original left race_venue unbound (NameError at the
    # print below) when no known venue name appeared.
    race_venue = None
    venue_names = [u"札幌", u"函館", u"福島", u"新潟", u"東京",
                   u"中山", u"中京", u"京都", u"阪神", u"小倉"]
    for code, venue in enumerate(venue_names, 1):
        if text2_str.find(venue) != -1:
            race_venue = code
            break

    print("天候:", race_weather)
    print("芝状態:", race_going_turf, "ダート状態", race_going_dirt)
    print("日付:", date, "回数:", race_num, "競馬場", race_venue, "開催日", race_day)
    # NOTE(review): the sheet is named with the 4-digit year only, so a
    # second race in the same year makes add_sheet raise on the duplicate
    # name — confirm intended usage before scraping multi-year ranges.
    sheet = book.add_sheet(re.search(u"\\d{4}", text2_str).group(0))
    extractHorses(sheet)
def extractRaces(browser, name, start_year, end_year=0):
    """Search netkeiba for races named *name* between *start_year* and
    *end_year* (0 = no upper bound), scrape every matching result page,
    and save the collected sheets to the module-level workbook.

    Walks the paginated result list via the "次" (next) link.
    """
    # Bug fix: the original passed the literal end_year=0, silently
    # ignoring the caller's end_year argument.
    browser.searchRace(name, start_year, start_month=0,
                       end_year=end_year, end_month=0)
    rows = tableParse()
    while True:
        for row in rows:
            for cell in row.findAll("a"):
                race = cell.get("href")
                if re.match(r"/race/\d{12}/", race):
                    browser.open("http://db.netkeiba.com/" + race)
                    extractRace(race)
                    browser.back()
        pager_str = str(browser.readValue("div", "pager"))
        if re.search("次</a>", pager_str):
            browser.click("次")
            # Bug fix: re-read the table after paging; the original kept
            # re-processing the first page's rows forever.
            rows = tableParse()
        else:
            break
    book.save(xls_file)
if __name__ == "__main__":
    # Scrape all "中山金杯" races from 2010 onward into horse.xls.
    # NOTE: rebinding the name "browser" to an instance shadows the class,
    # but the module-level functions (tableParse, extractRace, ...) read
    # this global, so the name must stay "browser".
    browser = browser()
    extractRaces(browser, "中山金杯", 2010)
    browser.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment