Skip to content

Instantly share code, notes, and snippets.

@NaoY-2501
Last active November 4, 2017 17:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save NaoY-2501/4b906e393ac55383a897fb82fa4cfc0a to your computer and use it in GitHub Desktop.
Save NaoY-2501/4b906e393ac55383a897fb82fa4cfc0a to your computer and use it in GitHub Desktop.
Fetch_M3_2017Fall_Circle_list
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>M3 2017 サークルリスト</title>
</head>
<body>
{# Renders one table row per entry of `circles`; each entry provides
   `space`, `name` and `description`. The values are scraped text, so they
   are escaped explicitly in case the environment has autoescaping off. #}
<table>
{% for circle in circles %}
<tr>
<td>{{ circle.space|e }}</td>
<td>{{ circle.name|e }}</td>
<td>{{ circle.description|e }}</td>
</tr>
{% endfor %}
</table>
</body>
</html>
import csv
import re
import time
import os
from collections import defaultdict
import requests
from bs4 import BeautifulSoup as bs
from jinja2 import Environment, FileSystemLoader
# Matches a whole circle-list page URL ending in ".html" and captures the
# run of digits / "f" / "s" just before the suffix (e.g. "2017f", "2013s")
# in the named group ``season``.
PATTERN = re.compile(
r'^(?P<other>^[a-z3\:\/\.:]+)(?P<season>[\dfs]+)(?P<suffix>\.html)$'
)
# Matches an "http..." URL embedded in free text; used to strip links out of
# a circle's name field.
URL_PATTERN = re.compile(r'http[\w/\.\-:@@=_?\\]+')
def get_content(url, timeout=10):
    """Fetch a page and return its response body as bytes.

    :param url: URL of the circle-list page.
    :param timeout: Seconds to wait for the server before giving up.
    :return: The raw response body, or the integer ``1`` when the request
        fails or the status code is not 200 (callers compare against ``1``).
    :rtype: bytes or int
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable page must not abort the whole crawl; report it the
        # same way as a bad status code.
        print('Http Error: {}'.format(exc))
        return 1
    if r.status_code != 200:
        print('Http Error: {}'.format(r.status_code))
        return 1
    return r.content
def get_circle_name(datum):
    """Extract the circle name from one circle's information cell.

    The cell text bundles the circle name together with other items such as
    Twitter or Niconico account URLs. The text is split into fields on
    double tabs and commas, and the first non-empty field is taken as the
    name, with any embedded URL and remaining tabs stripped.

    :param datum: Element holding one circle's information; must provide
        ``get_text()``.
    :return: The circle name, or an empty string when the cell has no
        non-empty field.
    :rtype: str
    """
    fields = datum.get_text().replace('\t\t', ',').split(',')
    # Pick the first non-empty field directly. Removing '' entries from the
    # list while iterating over it stops early and can leave empty fields at
    # the front (or an empty list, which then fails on indexing).
    first_field = next((field for field in fields if field), '')
    return re.sub(URL_PATTERN, '', first_field).replace('\t', '')
def parse_html(content):
    """Parse a circle-list page into a list of circle records.

    Every ``table.tblCircleList`` on the page is scanned; its ``left``,
    ``center`` and ``right`` cells are paired up positionally to form one
    record per circle.

    :param content: Response body of the circle-list page, as bytes.
    :return: One mapping per circle with the keys ``space``, ``name`` and
        ``description``.
    :rtype: list
    """
    soup = bs(content, 'html.parser')
    circles = []
    for table in soup.find_all('table', class_='tblCircleList'):
        space_cells = table.find_all('td', class_='left')
        info_cells = table.find_all('td', class_='center')
        description_cells = table.find_all('td', class_='right')
        if not description_cells:
            # Only the 2013s page has the malformed markup <td class=right">,
            # so fall back to the class name including the stray quote.
            description_cells = table.find_all('td', class_='right"')
        for space_cell, info_cell, description_cell in zip(
                space_cells, info_cells, description_cells):
            circle = defaultdict(str)
            circle['space'] = space_cell.get_text()
            circle['name'] = get_circle_name(info_cell)
            circle['description'] = description_cell.get_text()
            circles.append(circle)
    return circles
def render_html(circle_list, url):
    """Render a circle list to an HTML file with Jinja2.

    The template ``circle_list.tpl`` is loaded from the current directory
    and the result is written to
    ``circle_list/M3_<season>_circle_list.html``.

    :param circle_list: Circle records to render.
    :param url: Circle-list page URL; the season in the file name is
        extracted from it.
    :return: None
    """
    # Load templates from the current working directory.
    env = Environment(loader=FileSystemLoader('.'))
    template = env.get_template('circle_list.tpl')
    season = get_season(url)
    file_name = 'circle_list/M3_{season}_circle_list.html'.format(
        season=season
    )
    # The output directory is not guaranteed to exist on a fresh checkout;
    # create it so that open() does not fail with FileNotFoundError.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, mode='w', encoding='utf-8') as f:
        f.write(template.render(season=season, circles=circle_list))
def create_csv(all_season_dict):
    """Write every circle of every season to ``circle_list.csv``.

    Each row has two columns, in this order: season, circle name. No header
    row is written.

    :param all_season_dict: Mapping of circle-list page URL to the list of
        circle records parsed from that page.
    :return: None
    """
    file_name = 'circle_list.csv'
    fieldnames = ['season', 'name']
    # newline='' lets the csv module control line endings itself; without it
    # an extra blank line appears after every row on Windows.
    with open(file_name, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        for url, circles in all_season_dict.items():
            season = get_season(url)
            writer.writerows(
                {'season': season, 'name': circle['name']}
                for circle in circles
            )
def get_season(url):
    """Extract the season identifier from a circle-list page URL.

    :param url: Circle-list page URL.
    :return: The text captured by the ``season`` group of ``PATTERN``
        (e.g. ``'2017f'``).
    :rtype: str
    :raises ValueError: If the URL does not match ``PATTERN``.
    """
    m = PATTERN.match(url)
    if m is None:
        # Fail with a message naming the offending URL instead of an opaque
        # AttributeError on None.
        raise ValueError('Unrecognized circle list URL: {}'.format(url))
    return m.group('season')
def get_urls(first_year=2010, last_year=2017):
    """Build the list of circle-list page URLs to crawl.

    The list starts with the 2017f attendance page, followed by the ``s``
    and ``f`` event pages of every year from ``first_year`` to
    ``last_year`` inclusive (2010 through 2017 by default, i.e. 8 years).

    :param first_year: First year whose event pages are included.
    :param last_year: Last year whose event pages are included.
    :return: URLs in crawl order.
    :rtype: list
    """
    urls = ['http://www.m3net.jp/attendance/circle2017f.html']
    for year in range(first_year, last_year + 1):
        for term in ('s', 'f'):
            urls.append(
                'http://www.m3net.jp/event/{year}{term}.html'.format(
                    year=year, term=term
                )
            )
    return urls
def main():
    """Crawl every circle-list page and export the circle names as CSV.

    Pages that could not be fetched (``get_content`` returned ``1``) are
    skipped. The crawl pauses one second after each request.
    """
    all_season_dict = defaultdict()
    for url in get_urls():
        print(url)
        content = get_content(url)
        if content != 1:
            all_season_dict[url] = parse_html(content)
        # Pause between requests regardless of the outcome.
        time.sleep(1)
    create_csv(all_season_dict)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment