Skip to content

Instantly share code, notes, and snippets.

@elnikkis
Created June 9, 2019 13:03
Show Gist options
  • Save elnikkis/148a22420a4f5dfb2bf4e09c76a84376 to your computer and use it in GitHub Desktop.
Save elnikkis/148a22420a4f5dfb2bf4e09c76a84376 to your computer and use it in GitHub Desktop.
# coding: utf-8
'''
Download the training (tokkun) CP tables for Pokémon: Magikarp Jump (Hanero! Koiking).
'''
import os
import urllib.request
import bs4
CACHE_DIR = 'html_cache'


def get_html(url, name):
    '''Return the raw bytes served at ``url``, using an on-disk cache.

    The cache file is ``<CACHE_DIR>/<name>.html``. When that file already
    exists its contents are returned and no request is made. Otherwise the
    URL is fetched exactly once, the body is written to the cache file, and
    the body is returned.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        name: Cache key; becomes the cache file name (without ``.html``).

    Returns:
        The response body as ``bytes``.
    '''
    filepath = os.path.join(CACHE_DIR, '{}.html'.format(name))
    if os.path.exists(filepath):
        with open(filepath, 'rb') as fp:
            return fp.read()

    # Open the URL a single time and close the response as soon as the body
    # has been read (previously the URL was opened twice and the first
    # response was never read or closed).
    with urllib.request.urlopen(url) as response:
        data = response.read()

    # dirname(filepath) rather than CACHE_DIR so that a ``name`` containing a
    # sub-directory still gets its parent directory created.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, 'wb') as fp:
        fp.write(data)
    return data
def main():
    '''Download the training CP tables and save each one as a TSV file.

    The page is obtained through ``get_html`` (so a cached copy is reused),
    the first 15 ``<h2>`` headings are paired in document order with the
    page's ``<table>`` elements, and every table is written to
    ``outputs/tables/<heading>.tsv`` as tab-separated UTF-8 text. Each
    table's rows are also printed to stdout.

    Returns:
        The parsed ``bs4.BeautifulSoup`` document.

    Raises:
        ValueError: If the page does not contain exactly 15 tables.
    '''
    import csv  # local import: only needed here, for writing the output

    expected_tables = 15
    url = 'https://www.smart-game-blog.net/koiking-training/'
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; consider pinning one once the choice has been verified
    # against this page.
    soup = bs4.BeautifulSoup(get_html(url, 'smart-game-blog'))

    # Collect the list of table titles.
    names = [tag.text for tag in soup.find_all('h2', limit=expected_tables)]
    tables = soup.find_all('table')
    # Explicit check instead of ``assert`` so it still runs under ``-O``.
    if len(tables) != expected_tables:
        raise ValueError(
            'expected {} tables, found {}'.format(expected_tables, len(tables))
        )

    # The output directory is the same for every table; create it once.
    output_dir = os.path.join('outputs', 'tables')
    os.makedirs(output_dir, exist_ok=True)

    for name, table in zip(names, tables):
        rows = table.find_all('tr')

        # Column titles come from the <th> cells of the first row; a table
        # without any rows simply has no titles.
        header_cells = rows[0].find_all('th') if rows else []
        data = [[cell.text.strip() for cell in header_cells]]
        for row in rows:
            values = [cell.text.strip() for cell in row.find_all('td')]
            if values:  # rows without <td> cells (e.g. the header) are skipped
                data.append(values)
        print(data)

        # Save the table. The file is a real TSV: tab-delimited, with the csv
        # module quoting any cell that contains a tab, quote or newline, and
        # UTF-8 encoded regardless of the platform's default encoding.
        filepath = os.path.join(output_dir, '{}.tsv'.format(name))
        with open(filepath, 'w', encoding='utf-8', newline='') as fp:
            writer = csv.writer(fp, delimiter='\t', lineterminator='\n')
            writer.writerows(data)
        print('Saved at {}'.format(filepath))
    return soup
if __name__ == "__main__":
    # Script entry point: run the scraper and bind the parsed document that
    # main() returns to the module-level name ``soup``.
    soup = main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment