Created
August 1, 2015 05:18
-
-
Save moni360/b688c8c8d365b003d977 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys | |
import math | |
import urllib2 | |
from bs4 import BeautifulSoup | |
from bs4 import element as bs4element | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
def get_team_data(team_name):
    """Fetch the roster of the given team and return it as a DataFrame.

    Downloads ``http://bis.npb.or.jp/teams/rst_<team_name>.html``, walks every
    ``<table>`` inside the ``div`` whose id is ``tedivmaintbl`` and turns each
    row of class ``rosterPlayer`` into one DataFrame row.  The text of the
    ``rosterPos`` header cell of the current table is stored in a leading
    ``Position`` column, and that header cell itself is renamed to ``Name``
    so that all tables share the same column names.

    Args:
        team_name: Team code substituted into the roster URL (e.g. ``'d'``).

    Returns:
        pandas.DataFrame with one row per player.  Height and weight columns
        are converted to integers where present, the birth-date column to
        datetimes, and the throwing/batting columns to categoricals.

    Raises:
        SystemExit: with status 1 when the page cannot be downloaded.
        ValueError: when a player row has a different number of cells than
            the header row of its table.
        KeyError: when the expected roster columns are missing from the page.
    """
    req = urllib2.Request('http://bis.npb.or.jp/teams/rst_%s.html' % team_name)
    try:
        response = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first;
        # it is also the only one of the two that carries a status code.
        print('%s %s' % ('Error code: ', e.code))
        sys.exit(1)
    except urllib2.URLError as e:
        # Plain URLError (DNS failure, refused connection, ...) has no
        # ``code`` attribute, only ``reason``.
        print('%s %s' % ('Error reason: ', e.reason))
        sys.exit(1)

    # Always release the connection, even if reading fails.
    try:
        html = response.read()
    finally:
        response.close()

    # NOTE(review): no parser is named, so bs4 picks whichever is installed.
    soup = BeautifulSoup(html)
    # Keep only the part of the page that holds the player tables.
    main_div = soup.find('div', id='tedivmaintbl')
    table_list = main_div.find_all('table')

    records = []
    position = None
    header = None
    for table in table_list:
        for row in table.contents:
            # Blank-line NavigableString objects sit between the rows;
            # only Tag objects are real table rows.
            if not isinstance(row, bs4element.Tag):
                continue
            row_classes = row.get('class') or []
            row_class = row_classes[0] if row_classes else None
            if row_class == 'rosterMainHead':
                pos_cell = row.find('th', class_="rosterPos")
                # Remember the position name shown in this table's header.
                position = pos_cell.string
                # Rename the column so every table can share one schema.
                pos_cell.string.replace_with('Name')
                header = ['Position'] + [x.string for x in row.contents]
            elif row_class == 'rosterPlayer':
                if header is None:
                    # A player row before any header row has no column
                    # names to map onto, so it cannot be used.
                    continue
                data = [position] + [x.string for x in row.contents]
                if len(data) != len(header):
                    raise ValueError(
                        'player row has %d cells but header has %d'
                        % (len(data), len(header)))
                # Header cells without a single string yield None; those
                # columns carry no usable name and are dropped here.
                records.append(
                    {key: value for key, value in zip(header, data)
                     if key is not None})

    # Build the frame once instead of appending row by row.
    df = pd.DataFrame(records)

    # Height and weight: convert the non-missing values to integers.
    df[u'身長'] = df[u'身長'].dropna().astype(int)
    df[u'体重'] = df[u'体重'].dropna().astype(int)
    # Birth date: 'YYYY.MM.DD' -> datetime.
    df[u'生年月日'] = pd.to_datetime(df[u'生年月日'].str.replace('.', '-'))
    # Throwing / batting hand: convert to categorical data.
    df[u'投'] = df[u'投'].dropna().astype('category')
    df[u'打'] = df[u'打'].dropna().astype('category')
    return df
def plot_hist(team_name):
    """Print and plot a histogram of the players' ages for the given team.

    The age is the number of completed years between each player's birth
    date and today.  A text histogram (one ``*`` per player) is printed to
    standard output, then a matplotlib histogram window is shown.

    Args:
        team_name: Team code passed through to ``get_team_data``.
    """
    df = get_team_data(team_name)

    # Today's date as a YYYYMMDD integer (public Timestamp API, not pd.tslib).
    today = int(pd.Timestamp.now().strftime('%Y%m%d'))

    # Rows without a birth date (NaT) cannot be aged, so leave them out.
    birthdays = df[u'生年月日'].dropna()
    birth_as_int = (birthdays.dt.year * 10000
                    + birthdays.dt.month * 100
                    + birthdays.dt.day)
    # Subtracting YYYYMMDD integers and dropping the last four digits gives
    # the number of completed years.
    year = (today - birth_as_int) // 10000

    # Count players per age and order by age.
    hist = year.value_counts().sort_index()

    # Text rendering of the histogram.
    for age, count in zip(hist.index, hist.values):
        print('%3d %s' % (age, '*' * int(count)))

    # Graphical histogram.
    year.plot(kind='hist', bins=20, alpha=0.5, color='b',
              xlim=(15, 50), ylim=(0, 15))
    plt.show()
# Script entry point: show the age histogram for the team with code 'd'.
if __name__ == "__main__":
    plot_hist(team_name='d')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment