Skip to content

Instantly share code, notes, and snippets.

@tosh1ki
Created April 29, 2015 14:39
Show Gist options
  • Save tosh1ki/a071be7a1e22046bca39 to your computer and use it in GitHub Desktop.
Save tosh1ki/a071be7a1e22046bca39 to your computer and use it in GitHub Desktop.
http://hamada.tsukaeru.info/jump/ からデータをスクレイピングする
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import time
import urllib
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_html(url):
    """Fetch *url* with an HTTP GET and return the body as EUC-JP text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded from EUC-JP into a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid EUC-JP.
    """
    r = requests.get(url)
    # Decode the raw bytes directly.  Round-tripping ``r.text`` through
    # ISO-8859-1 only works when requests happened to pick ISO-8859-1 as the
    # response encoding; with any other declared or guessed encoding the
    # re-encode step fails or corrupts the text.
    return r.content.decode('euc-jp')
def _issue_urls(index_url):
    """Return absolute URLs of the './jump...' links found on *index_url*."""
    soup = BeautifulSoup(get_html(index_url), 'html.parser')
    # href=True skips anchors that have no href attribute, which would
    # otherwise yield None and break the startswith() check.
    hrefs = [a['href'] for a in soup.find_all('a', href=True)]
    # Build the list eagerly against the index page so the base URL cannot
    # be changed by later loop variables.
    return [urllib.parse.urljoin(index_url, href)
            for href in hrefs if href.startswith('./jump')]


def _scrape_year(year):
    """Collect one column per issue page of *year*; None if no pages exist."""
    index_url = 'http://hamada.tsukaeru.info/jump/{0}/jump{0}.html'.format(year)
    frames = []
    for n, issue_url in enumerate(_issue_urls(index_url)):
        time.sleep(5)  # wait 5 seconds before each page request
        print(year, n, ':', issue_url)
        tables = pd.read_html(issue_url, header=0, index_col='作品名')
        # Extract only the column for this page's week (second from last).
        # .iloc makes the lookup positional; plain [[-2]] is a label lookup.
        frames.append(tables[0].iloc[:, [-2]])
    if not frames:
        # pd.concat raises on an empty list, so report "nothing found".
        return None
    df = pd.concat(frames, axis=1)
    # For example, when issues 37 and 38 are a combined issue, the column
    # name looks like `37.38`, so turn it into `37`.
    df.columns = df.columns.map(lambda x: int(str(x).split('.')[0]))
    return df.sort_index(axis=1)


if __name__ == '__main__':
    # Scrape every year from 2003 through 2015 and write one CSV per year,
    # transposed so that each row is an issue.
    os.makedirs('data', exist_ok=True)  # to_csv does not create directories
    for year in range(2003, 2016):
        df = _scrape_year(year)
        if df is None:
            print(year, ':', 'no issue pages found, skipping')
            continue
        df.T.to_csv('data/{0}.csv'.format(year))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment