Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
楽天競馬とnetkeiba.comから地方競馬データをスクレイピングするPythonスクリプト
# -*- coding: utf-8 -*-
import urllib.request
import codecs
import time
from datetime import datetime as dt
from collections import Counter
from bs4 import BeautifulSoup
import pandas
import re
# Scrape local (NAR) horse-racing results from Rakuten Keiba, join each runner
# with the popularity/odds columns of the matching netkeiba result table, and
# append one CSV line per runner to an output file (also echoed to stdout).

RAKUTEN_BASE_URL = "http://keiba.rakuten.co.jp"
NETKEIBA_RESULT_URL = "http://nar.netkeiba.com/?pid=race&mode=result&id=c"

# Race numbers tried for every meeting: "01" .. "12".
RACE_NUMBERS = ["{:02d}".format(n) for n in range(1, 13)]

# Course name as it comes out of the Rakuten "racePlace" span -> the number
# this script places after the year when building a netkeiba race id.
COURSE_CODES = {
    "帯広ば": 65,
    "門別": 30,
    "盛岡": 35,
    "水沢": 36,
    "浦和": 42,
    "船橋": 43,
    "大井": 44,
    "川崎": 45,
    "金沢": 46,
    "笠松": 47,
    "名古屋": 48,
    "園田": 50,
    "姫路": 51,
    "福山": 53,
    "高知": 54,
    "佐賀": 55,
    "荒尾": 56,
}

TAG_RE = re.compile(r"<[^>]*?>")
WEIGHT_CHANGE_RE = re.compile(r"\+.*|-.*|±.*")


def strip_tags(node):
    """Return ``str(node)`` with every ``<...>`` markup tag removed."""
    return TAG_RE.sub("", str(node))


def parse_time_seconds(raw):
    """Convert a finishing time such as ``"1:23.4"`` or ``"59.8"`` to seconds.

    Returns a float, or ``""`` when the cell is blank, is the ``"-"``
    placeholder, or is not a recognisable ``[M:]S[.f]`` time, so that one odd
    cell leaves an empty CSV column instead of aborting the whole run.
    """
    text = str(raw).strip()
    if text in ("", "-"):
        return ""
    minutes, colon, seconds = text.rpartition(":")
    whole, _, fraction = seconds.partition(".")
    try:
        base = int(whole)
        if colon:
            base += int(minutes) * 60
        # Same precision as strptime's %f: right-pad to microseconds.
        micro = int(fraction.ljust(6, "0")[:6]) if fraction else 0
    except ValueError:
        return ""
    # Integer part first, then the fraction, so the float result is the same
    # as minute*60 + second + microsecond/1000000.
    return base + micro / 1000000


def split_weight(raw):
    """Split a body-weight cell like ``"450+2"`` into ``(weight, change)``.

    The change keeps a leading ``-`` but drops ``+``/``±``.  A cell holding
    only ``"-"`` (with or without surrounding whitespace) yields ``("", "")``.
    """
    text = str(raw).strip()
    if text == "-":
        return "", ""
    found = WEIGHT_CHANGE_RE.search(text)
    change = re.sub(r"\+|±", "", found.group(0)) if found else ""
    weight = WEIGHT_CHANGE_RE.sub("", text).strip()
    return weight, change


def fetch_soup(url):
    """Download *url* and parse it with BeautifulSoup's lxml parser."""
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, "lxml")


def fetch_odds(race_date, course_name, race_no):
    """Return ``(popularity, odds)`` columns of the netkeiba result table.

    Columns 9 and 10 of the first table on the page are returned with their
    first row dropped, so runner ``i`` (0-based) is looked up under label
    ``i + 1``.  When *course_name* has no entry in ``COURSE_CODES`` two empty
    dicts are returned, which leaves the odds columns blank rather than
    querying a wrong race id.
    """
    course_code = COURSE_CODES.get(course_name)
    if course_code is None:
        return {}, {}
    odds_url = (
        NETKEIBA_RESULT_URL
        + str(race_date.year)
        + str(course_code)
        + str(race_date.month).rjust(2, "0")
        + str(race_date.day).rjust(2, "0")
        + str(race_no).rjust(2, "0")
    )
    result_table = pandas.read_html(odds_url)[0]
    return result_table[9][1:], result_table[10][1:]


def scrape_race(race_soup, race_date, race_no):
    """Build the CSV rows (lists of strings) for one parsed result page.

    Returns an empty list when the page contains no ``td.horse`` cells.
    """
    name_cells = race_soup.find_all("td", class_="horse")
    if not name_cells:
        return []

    # Course name.
    course_name = re.sub(
        r"[\[\]]", "", strip_tags(race_soup.find_all("span", class_="racePlace"))
    )
    # Distance; the character after the list's "[" is the surface type
    # (Morioka also has a turf course).
    distance_raw = strip_tags(race_soup.find_all("li", class_="distance"))
    distance = re.sub(r"[ダ芝,m\[\]]", "", distance_raw)
    distance = re.sub(r"\(.\)", "", distance)
    course_type = distance_raw[1]
    # First-place prize money: drop the leading "[1着" and the last two chars.
    prize_raw = strip_tags(race_soup.find_all("li", text=re.compile("1着")))
    main_prize = re.sub(",", "", prize_raw[3:len(prize_raw) - 2])
    # Track condition is taken from the second <dd> on the page.
    course_cond = strip_tags(race_soup.find_all("dd")[1])

    # Per-runner columns.
    age_cells = race_soup.find_all("td", class_="state")        # sex/age/colour
    number_cells = race_soup.find_all("td", class_="number")    # horse number
    order_cells = race_soup.find_all("td", class_="order")      # finishing order
    carried_cells = race_soup.find_all("td", class_="weightTax")  # weight carried
    weight_cells = race_soup.find_all("td", class_="weight")    # body weight
    jockey_cells = race_soup.find_all("td", class_="jockey")
    time_cells = race_soup.find_all("td", class_="time")
    spurt_cells = race_soup.find_all("td", class_="spurt")      # last 3 furlongs
    trainer_cells = race_soup.find_all("td", class_="tamer")

    # The odds table depends only on the race, so fetch it once per race.
    odds_popularity, odds = fetch_odds(race_date, course_name, race_no)
    date_text = race_date.strftime("%Y-%m-%d")

    rows = []
    for i, name_cell in enumerate(name_cells):
        age_raw = strip_tags(age_cells[i])
        horse_order = strip_tags(order_cells[i])
        if horse_order == "-":
            horse_order = ""
        horse_weight, horse_weight_change = split_weight(
            strip_tags(weight_cells[i])
        )
        jockey = re.sub(r"[\n☆★△▲]", "", strip_tags(jockey_cells[i]))
        # Ban'ei races have no last-3F figure, so the "-" placeholder is removed.
        horse_3f = re.sub("-", "", strip_tags(spurt_cells[i]))
        rows.append([
            date_text,
            course_name,
            race_no,
            course_type,
            distance,
            course_cond,
            main_prize,
            horse_order,
            strip_tags(number_cells[i]),
            strip_tags(name_cell),
            re.sub(r"[^牡牝セ]", "", age_raw),    # sex
            re.sub(r"[^0-9]", "", age_raw),       # age
            re.sub(r".*/", "", age_raw),          # coat colour
            strip_tags(carried_cells[i]),
            jockey,
            horse_weight,
            horse_weight_change,
            # Seconds rather than M:SS.f so time gaps are easy to compute.
            str(parse_time_seconds(strip_tags(time_cells[i]))),
            horse_3f,
            strip_tags(trainer_cells[i]),
            str(odds_popularity.get(i + 1, "")),
            str(odds.get(i + 1, "")),
        ])
    return rows


def main(year=2016, months=range(4, 5), out_path="./nar_race_result_2016.csv"):
    """Scrape every race linked from the Rakuten calendar and append to CSV.

    For each month in *months* the calendar page for *year* is read, every
    meeting link is turned into its result-page URL, and races "01".."12" of
    that meeting are scraped.  Each runner is printed and appended to
    *out_path* (UTF-8) as one comma-joined line.
    """
    meeting_href = re.compile("race_card/list/RACEID/" + str(year) + ".*0$")
    for month in months:
        cal_soup = fetch_soup(
            RAKUTEN_BASE_URL
            + "/calendar?l-id=top_headernavi_2nd_calendar&tYear=" + str(year)
            + "&tMonth=" + str(month)
        )
        for atag in cal_soup.find_all("a", href=meeting_href):
            result_src = re.sub("race_card", "race_performance", atag.get("href"))
            result_url = RAKUTEN_BASE_URL + str(result_src)
            # Race date: characters 30-37 of the link path.
            # NOTE(review): assumes the href is the site-relative path
            # "/race_performance/list/RACEID/<yyyymmdd>..." - confirm.
            race_date = dt.strptime(result_src[30:38], "%Y%m%d")
            for race_no in RACE_NUMBERS:
                race_url = re.sub(r"00$", race_no, result_url)
                rows = scrape_race(fetch_soup(race_url), race_date, race_no)
                if not rows:
                    continue
                # One open per race; the context manager closes the file even
                # if a later row fails.
                with codecs.open(out_path, "a", "utf-8") as out:
                    for row in rows:
                        line = ",".join(row)
                        print(line)
                        out.write(line + "\n")
                # NOTE: requests are issued back to back; a one-second pause
                # between races was left disabled in the original script.


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment