Last active
November 4, 2017 17:29
-
-
Save NaoY-2501/4b906e393ac55383a897fb82fa4cfc0a to your computer and use it in GitHub Desktop.
Fetch_M3_2017Fall_Circle_list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
{# Renders one table row per circle. Expects `circles`: an iterable of
   mappings with `space`, `name` and `description` entries. #}
<html lang="ja">
<head>
  <meta charset="utf-8">
  <title>M3 2017 サークルリスト</title>
</head>
<body>
  <table>
    {% for circle in circles %}
    <tr>
      {# Values are scraped from an external site, so escape them explicitly;
         this is safe whether or not autoescaping is enabled. #}
      <td>{{ circle.space | e }}</td>
      <td>{{ circle.name | e }}</td>
      <td>{{ circle.description | e }}</td>
    </tr>
    {% endfor %}
  </table>
</body>
</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import re | |
import time | |
import os | |
from collections import defaultdict | |
import requests | |
from bs4 import BeautifulSoup as bs | |
from jinja2 import Environment, FileSystemLoader | |
# Splits a circle-list page URL into three named groups:
#   other  - everything before the season token (scheme, host, path prefix)
#   season - the run of digits / 'f' / 's' just before the extension,
#            e.g. '2017f' or '2013s'
#   suffix - the literal '.html' extension
PATTERN = re.compile(
    r'^(?P<other>^[a-z3\:\/\.:]+)'
    r'(?P<season>[\dfs]+)'
    r'(?P<suffix>\.html)$'
)

# Matches a URL-like run starting with 'http'; used to strip links that are
# glued onto a circle name.
URL_PATTERN = re.compile(r'http[\w/\.\-:@@=_?\\]+')
def get_content(url, timeout=10):
    """Fetch a page and return its response body as bytes.

    :param url: URL of a circle-list page.
    :param timeout: Seconds to wait for the server before giving up
        (passed straight to ``requests.get``).
    :return: The raw response body, or the integer ``1`` when the request
        fails or the server answers with a status other than 200.
    :rtype: bytes or int
    """
    try:
        # Without a timeout requests may block forever on a stalled server.
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Keep the existing "return 1 on failure" convention so callers that
        # compare against 1 keep working for connection-level errors too.
        print('Request Error: {}'.format(err))
        return 1
    if r.status_code != 200:
        print('Http Error: {}'.format(r.status_code))
        return 1
    return r.content
def get_circle_name(datum):
    """Extract the circle name from one circle's info cell.

    The cell text bundles the circle name together with other items such as
    Twitter or Niconico links. The text is split into fields and only the
    first non-empty field, with any URL and tab characters removed, is
    returned.

    :param datum: Element holding one circle's info; must provide
        ``get_text()``.
    :return: The circle name, or an empty string when the cell has no
        non-empty field.
    :rtype: str
    """
    # Double tabs act as field separators; they are normalised to commas
    # and the text is then split on commas.
    raw_fields = datum.get_text().replace('\t\t', ',').split(',')
    # Build a new list instead of removing items while iterating: the old
    # in-place removal skipped elements and could leave a leading '' behind.
    fields = [field for field in raw_fields if field]
    if not fields:
        return ''
    name = re.sub(URL_PATTERN, '', fields[0])
    return name.replace('\t', '')
def parse_html(content):
    """Parse a circle-list page and return its circles.

    Each circle becomes one dict-like record with the keys ``space``,
    ``name`` and ``description``; all records are collected into a list.

    :param content: Response body (bytes) of a circle-list page.
    :return: One record per circle, in page order.
    :rtype: list
    """
    document = bs(content, 'html.parser')
    circles = []
    for table in document.find_all('table', class_='tblCircleList'):
        space_cells = table.find_all('td', class_='left')
        info_cells = table.find_all('td', class_='center')
        # Only the 2013s page writes <td class=right">, so fall back to that
        # malformed class name when the regular lookup finds nothing.
        description_cells = (
            table.find_all('td', class_='right')
            or table.find_all('td', class_='right"')
        )
        cells = zip(space_cells, info_cells, description_cells)
        for space_cell, info_cell, description_cell in cells:
            record = defaultdict(str)
            record['space'] = space_cell.get_text()
            record['name'] = get_circle_name(info_cell)
            record['description'] = description_cell.get_text()
            circles.append(record)
    return circles
def render_html(circle_list, url):
    """Render a circle list to an HTML file with Jinja2.

    The template ``circle_list.tpl`` is loaded from the current working
    directory and the result is written, UTF-8 encoded, to
    ``circle_list/M3_<season>_circle_list.html``.

    :param circle_list: Circle records to pass to the template as
        ``circles``.
    :param url: Circle-list page URL; the season part of the output file
        name is extracted from it.
    :return: None
    """
    # Templates are looked up in the current working directory.
    env = Environment(loader=FileSystemLoader('.'))
    template = env.get_template('circle_list.tpl')
    season = get_season(url)
    file_name = 'circle_list/M3_{season}_circle_list.html'.format(
        season=season
    )
    # open() does not create missing directories, so make sure the output
    # directory exists before writing.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, mode='w', encoding='utf-8') as f:
        f.write(template.render(season=season, circles=circle_list))
def create_csv(all_season_dict):
    """Write every collected circle to ``circle_list.csv``.

    Each row has two columns: the season and the circle name. No header row
    is written.

    :param all_season_dict: Mapping of circle-list page URL to the list of
        circle records parsed from that page; each record must have a
        ``name`` entry.
    :return: None
    """
    file_name = 'circle_list.csv'
    fieldnames = ['season', 'name']
    # The csv module requires newline='' on the file object; without it
    # extra blank lines appear between rows on platforms that translate
    # '\n' to '\r\n'.
    with open(file_name, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        for url, circles in all_season_dict.items():
            season = get_season(url)
            for circle in circles:
                writer.writerow({'season': season, 'name': circle['name']})
def get_season(url):
    """Extract the season token from a circle-list page URL.

    :param url: Circle-list page URL.
    :return: The text captured by the ``season`` group of ``PATTERN``.
    :rtype: str
    :raises ValueError: If ``url`` does not match ``PATTERN``.
    """
    m = PATTERN.match(url)
    if m is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('Unrecognized circle list URL: {}'.format(url))
    return m.group('season')
def get_urls():
    """Build the list of circle-list page URLs to fetch.

    The list starts with the 2017 fall attendance page, followed by the
    spring (``s``) and fall (``f``) event pages for each year from 2010
    through 2017.

    :return: Page URLs in fetch order.
    :rtype: list
    """
    event_templates = (
        'http://www.m3net.jp/event/{year}s.html',
        'http://www.m3net.jp/event/{year}f.html',
    )
    urls = ['http://www.m3net.jp/attendance/circle2017f.html']
    urls.extend(
        template.format(year=year)
        for year in range(2010, 2018)
        for template in event_templates
    )
    return urls
def main():
    """Fetch every circle-list page, parse it, and export the result as CSV.

    Pages whose download fails (``get_content`` returns ``1``) are skipped.
    The loop pauses for one second after each page.
    """
    circles_by_url = defaultdict()
    for page_url in get_urls():
        print(page_url)
        body = get_content(page_url)
        if body != 1:
            circles_by_url[page_url] = parse_html(body)
        time.sleep(1)
    create_csv(circles_by_url)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment