Skip to content

Instantly share code, notes, and snippets.

@ommadawn46
Last active December 3, 2018 09:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ommadawn46/a59ff0febf2d9ae48f07137530b1832d to your computer and use it in GitHub Desktop.
Save ommadawn46/a59ff0febf2d9ae48f07137530b1832d to your computer and use it in GitHub Desktop.
from html.parser import HTMLParser
from urllib.request import urlopen
from datetime import datetime
import json
import csv
import sys
import re
# Path of the error log file
ERRORLOG = 'error.log'
# HTMLParser subclass that extracts the author-page URL and award information
class InformationParser(HTMLParser):
    """Collect an author link and award-cell texts from the HTML fed to it.

    After ``feed()`` the results are available as instance attributes:
    ``author_url`` (href of the last ``<a rel="author">`` that had a
    non-empty href, ``''`` if none) and ``awards`` (list of the first
    non-blank text found inside each ``<td>`` carrying an ``itemprop``
    attribute, with runs of whitespace collapsed to single spaces).
    """

    def __init__(self):
        super().__init__()
        # href taken from <a rel="author">; stays '' until one is seen
        self.author_url = ''
        # collected cell texts, in document order
        self.awards = []
        # True while waiting for the first text inside a <td itemprop=...>
        self.in_award = False

    def handle_starttag(self, tag, attrs):
        """Record author links and note the start of an award cell."""
        attrs = dict(attrs)
        if tag == 'a':
            if attrs.get('rel') == 'author':
                # A missing or valueless href must not raise or store None;
                # keep whatever was found before instead.
                href = attrs.get('href')
                if href:
                    self.author_url = href
        elif tag == 'td':
            if 'itemprop' in attrs:
                self.in_award = True

    def handle_endtag(self, tag):
        """Any closing tag ends the wait for award text."""
        self.in_award = False

    def handle_data(self, data):
        """Store the first non-blank text that follows an award cell start."""
        if not self.in_award:
            return
        text = re.sub(r'\s+', ' ', data).strip()
        if not text:
            # Indentation/newlines between the <td> and a nested element:
            # keep waiting instead of recording an empty award.
            return
        self.awards.append(text)
        self.in_award = False
def load_json_data(paths):
    """Read the JSON files listed in ``paths`` and build the manga data list.

    Every file is expected to hold a JSON array whose items have a
    ``['user']['id']`` value; the arrays are concatenated in the order the
    paths are given.

    Returns:
        A new list of all items, sorted by ``item['user']['id']``.

    Raises:
        FileNotFoundError: a path does not exist.
        json.JSONDecodeError: a file is not valid JSON.
    """
    data = []
    for path in paths:
        # Decode explicitly as UTF-8 (the standard JSON encoding) instead of
        # relying on the platform's locale-dependent default.
        with open(path, 'r', encoding='utf-8') as f:
            data.extend(json.load(f))
    return sorted(data, key=lambda x: x['user']['id'])
def _read_page(url):
    """Download ``url`` and return its body decoded as UTF-8."""
    # The context manager closes the HTTP response even if read/decode fails.
    with urlopen(url) as response:
        return response.read().decode('utf-8')


def append_information(fav_manga_data):
    """Fetch the author-page URL and award info and add them to the manga data.

    For every manga entry that has a ``'mangapedia_url'`` key, the title page
    is downloaded to find the author link, then the author page is downloaded
    to collect award texts. The entry is updated in place with
    ``'mangapedia_author_url'`` and ``'awards_of_author'`` (award texts
    joined with ``','``). Entries without ``'mangapedia_url'`` are left
    untouched.

    Returns:
        None; ``fav_manga_data`` is modified in place.
    """
    for data in fav_manga_data:
        for manga in data['user']['favorite_manga']:
            if 'mangapedia_url' not in manga:
                continue
            parser = InformationParser()
            parser.feed(_read_page(manga['mangapedia_url']))
            author_url = 'https://mangapedia.com' + parser.author_url
            # The same parser instance receives the author page, so any cells
            # it already collected from the title page stay in the result.
            parser.feed(_read_page(author_url))
            parser.close()
            manga['mangapedia_author_url'] = author_url
            manga['awards_of_author'] = ','.join(parser.awards)
def write_column_names_to_tsv(f, names):
    """Write the given column names to file ``f`` as one TSV header row.

    Fields are tab-separated, the row ends with CRLF, and the csv module's
    default (minimal) quoting is used.
    """
    csv.writer(f, lineterminator='\r\n', delimiter='\t').writerow(names)
def write_manga_data_to_tsv(f, fav_manga_data):
    """Format the manga data as TSV rows and write them to file ``f``.

    One row is written per favorite manga, ordered by manga ``'id'`` within
    each user. Every field is quoted, fields are tab-separated and rows end
    with CRLF. The columns are: user id, user login, manga id, title, author,
    title-page URL, author-page URL, awards.

    The three URL/award fields are optional: an entry that lacks them is
    written with empty strings there instead of raising ``KeyError``.
    """
    writer = csv.writer(f, delimiter='\t', lineterminator='\r\n', quoting=csv.QUOTE_ALL)
    for data in fav_manga_data:
        user = data['user']
        for manga in sorted(user['favorite_manga'], key=lambda x: x['id']):
            writer.writerow([user['id'],
                             user['login'],
                             manga['id'],
                             manga['title'],
                             manga['author'],
                             manga.get('mangapedia_url', ''),
                             manga.get('mangapedia_author_url', ''),
                             manga.get('awards_of_author', '')])
def _log_error(message):
    """Append ``message`` to the ERRORLOG file."""
    # newline='' stops text mode from translating the explicit '\r\n'
    # sequences in the messages a second time on Windows.
    with open(ERRORLOG, 'a', newline='') as f:
        f.write(message)


def create_tsv_from_json(tsv_path, json_paths):
    """Create a new TSV file at ``tsv_path`` from the given JSON files.

    Loads the manga data from ``json_paths``, adds the author URL and award
    information, then writes a header row followed by the data rows.

    Errors are not propagated: JSON decoding errors, missing files and any
    other ``Exception`` are appended to the ERRORLOG file with a timestamp.
    ``KeyboardInterrupt`` and ``SystemExit`` are not intercepted.
    """
    try:
        fav_manga_data = load_json_data(json_paths)
        append_information(fav_manga_data)
        # The csv module writes its own line terminators, so the file must be
        # opened with newline='' to avoid '\r\r\n' line endings on Windows.
        with open(tsv_path, 'w', newline='') as f:
            write_column_names_to_tsv(f, ('user_id',
                                          'user_login',
                                          'manga_id',
                                          'manga_title',
                                          'manga_author',
                                          'mangapedia_title_url',
                                          'mangapedia_author_url',
                                          'awards_of_author'))
            write_manga_data_to_tsv(f, fav_manga_data)
    except json.decoder.JSONDecodeError as e:
        _log_error('%s\r\nJSONDecodeError: %s - %s\r\n' % (datetime.now(), e.args, json_paths))
    except FileNotFoundError as e:
        # The exception must be bound here; without "as e" the e.args lookup
        # below raises NameError from inside the handler.
        _log_error('%s\r\nFileNotFoundError: %s\r\n' % (datetime.now(), e.args))
    except Exception:
        _log_error('%s\r\n%s\r\n' % (datetime.now(), sys.exc_info()))
def main():
    """Build ``favorite_manga.tsv`` from the two favorite-manga JSON files."""
    sources = ['favorite_manga_001.json', 'favorite_manga_002.json']
    destination = 'favorite_manga.tsv'
    create_tsv_from_json(destination, sources)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment