Skip to content

Instantly share code, notes, and snippets.

@who-you-me
Created December 21, 2012 15:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save who-you-me/4353334 to your computer and use it in GitHub Desktop.
Save who-you-me/4353334 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import csv
from BeautifulSoup import BeautifulSoup
from teams import teams, teams_old
PATH = "./html/%s_%s_%s.html"
def scraping(year, mode, team):
    """Extract the per-player stat rows from one saved HTML page.

    The page is read from ``PATH % (year, mode, team)``. Every ``<tr>`` whose
    class is ``ststats`` becomes one output row: ``year``, ``team``, and then
    the ``.string`` of each ``<td>`` in that table row.

    Args:
        year: Season; part of the file name and the first column of each row.
        mode: Page kind; part of the file name (the script passes 'b' or 'p').
        team: Team identifier; part of the file name and the second column
            of each row.

    Returns:
        A list of rows, each of the form ``[year, team, cell, cell, ...]``.
        A cell is whatever BeautifulSoup's ``.string`` yields for that
        ``<td>`` (presumably None when the cell is not a single text node).
    """
    path = PATH % (year, mode, team)
    # Use a context manager so the handle is closed even if reading fails.
    with open(path) as f:
        html = f.read()
    soup = BeautifulSoup(html)
    rows = soup.findAll('tr', {'class': 'ststats'})
    players = []
    for row in rows:
        player = [cell.string for cell in row.findAll('td')]
        # Tag the row with the ``team`` argument itself rather than relying
        # on a same-named global existing in the calling module.
        players.append([year, team] + player)
    return players
def get_header(year, mode, team):
    """Build the CSV header row from one saved HTML page.

    The page is read from ``PATH % (year, mode, team)``. Column names are
    taken from the ``<th>`` elements found under the element whose id is
    ``stdivmaintbl``. For each ``<th>``, the child nodes with a truthy
    ``.string`` are UTF-8 encoded and concatenated, and spaces are removed.

    Args:
        year: Season; part of the file name.
        mode: Page kind; part of the file name (the script passes 'b' or 'p').
        team: Team identifier; part of the file name.

    Returns:
        A list of column labels, starting with the two fixed labels for the
        year and team columns, followed by one label per ``<th>``.
    """
    path = PATH % (year, mode, team)
    # Use a context manager so the handle is closed even if reading fails.
    with open(path) as f:
        html = f.read()
    soup = BeautifulSoup(html)
    # NOTE(review): if the page has no element with this id, find() is
    # expected to return None and the chained call fails -- confirm that
    # every saved page contains the table.
    cols = soup.find(id='stdivmaintbl').findAll('th')
    header = []
    for col in cols:
        # A header cell may hold several child nodes; keep only those that
        # expose text via ``.string`` and glue them together.
        h = ''.join([content.encode("utf8") for content in col.contents
                     if content.string])
        header.append(h.replace(" ", ""))
    # Fixed leading labels ("year", "team") for the two extra leading columns.
    return ["年度", "チーム"] + header
if __name__ == "__main__":
    # Collect batting ('b') and pitching ('p') rows for every team of every
    # season from 2005 through 2012, then dump each set to its own CSV file.
    years = range(2005, 2013)
    stats_b = []
    stats_p = []
    header_flg = False
    for year in years:
        # Progress output; the parenthesised form prints the same text under
        # Python 2 and is also valid Python 3.
        print(year)
        # Seasons before 2012 use the older team list.
        if year < 2012:
            ts = teams_old
        else:
            ts = teams
        for t in ts:
            stats_b += scraping(year, 'b', t)
            stats_p += scraping(year, 'p', t)
            if not header_flg:
                header_b = get_header(year, 'b', t)
                header_p = get_header(year, 'p', t)
                # Headers are only needed once; without setting the flag the
                # header pages were re-parsed for every team of every year.
                header_flg = True

    # Context managers make sure the output is flushed and closed.
    with open("batting.csv", "w") as out:
        writer = csv.writer(out, lineterminator="\n")
        writer.writerow(header_b)
        writer.writerows(stats_b)
    with open("pitching.csv", "w") as out:
        writer = csv.writer(out, lineterminator="\n")
        writer.writerow(header_p)
        writer.writerows(stats_p)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment