Last active
December 3, 2018 09:06
-
-
Save ommadawn46/a8d3ee76889c82aa6d3eef63469b4ef9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from html.parser import HTMLParser | |
from urllib.request import urlopen | |
from datetime import datetime | |
import json | |
import csv | |
import sys | |
import re | |
import os | |
# Path of the error log file.
ERRORLOG = 'error.log'
# HTMLParser subclass that extracts the author-page URL and award information.
class InformationParser(HTMLParser):
    """Collect the author link and award cells from the HTML fed to it.

    Attributes:
        author_url: ``href`` of the most recent ``<a rel="author">`` that
            carried a non-empty ``href``; ``''`` when none was seen.
        awards: one entry per ``<td>`` that has an ``itemprop`` attribute and
            is followed by a text chunk before any closing tag. The text has
            runs of whitespace collapsed to a single space and is stripped
            (a whitespace-only chunk is therefore recorded as ``''``).
        in_award: internal flag, True only between such a ``<td>`` start tag
            and the next text chunk or end tag.
    """

    def __init__(self):
        super().__init__()
        self.author_url = ''
        self.awards = []
        self.in_award = False

    def handle_starttag(self, tag, attrs):
        """Remember author links and arm award capture on ``<td itemprop=...>``."""
        attrs = dict(attrs)
        if tag == 'a':
            # An author anchor without a usable href used to raise KeyError;
            # it is now ignored and any previously found URL is kept.
            if attrs.get('rel') == 'author' and attrs.get('href'):
                self.author_url = attrs['href']
        elif tag == 'td':
            if 'itemprop' in attrs:
                self.in_award = True

    def handle_endtag(self, tag):
        """Disarm award capture on any closing tag."""
        self.in_award = False

    def handle_data(self, data):
        """Record the first text chunk after an armed ``<td>`` as an award."""
        if self.in_award:
            self.awards.append(re.sub(r'(\s+)', ' ', data).strip())
            # Only the first chunk is taken for each cell.
            self.in_award = False
def load_json_data(paths):
    """Load the JSON files listed in *paths* into one manga data list.

    The decoded content of every file is appended, in the given order, to a
    single list, which is then returned sorted by ``entry['user']['id']``.
    Each file is therefore expected to hold a JSON array of such entries.

    Args:
        paths: iterable of JSON file paths.

    Returns:
        A new list of all entries, sorted by user id.

    Raises:
        FileNotFoundError: if a path does not exist.
        json.decoder.JSONDecodeError: if a file is not valid JSON.
    """
    data = []
    for path in paths:
        # Decode explicitly as UTF-8 so that non-ASCII titles and names are
        # read identically regardless of the platform's locale encoding.
        with open(path, 'r', encoding='utf-8') as f:
            data += json.load(f)
    return sorted(data, key=lambda x: x['user']['id'])
def append_information(fav_manga_data):
    """Fetch the author-page URL and award information and add them in place.

    For every manga that has a ``'mangapedia_url'`` key, the manga page is
    downloaded and parsed for the author link, then the author page
    (``'https://mangapedia.com'`` + that link) is downloaded and parsed too.
    Two keys are then set on the manga dict:

    * ``'mangapedia_author_url'``: the author page URL that was fetched.
    * ``'awards_of_author'``: the collected award texts joined with ``','``.

    Manga without ``'mangapedia_url'`` are left untouched.

    Args:
        fav_manga_data: list of entries shaped like
            ``{'user': {'favorite_manga': [manga_dict, ...]}}``.

    Raises:
        Whatever ``urlopen`` or UTF-8 decoding raises for an unreachable or
        undecodable page.
    """
    for data in fav_manga_data:
        for manga in data['user']['favorite_manga']:
            if 'mangapedia_url' not in manga:
                continue
            parser = InformationParser()
            # Use the responses as context managers so the underlying
            # connections are closed promptly instead of being leaked.
            with urlopen(manga['mangapedia_url']) as response:
                parser.feed(response.read().decode('utf-8'))
            author_url = 'https://mangapedia.com' + parser.author_url
            # NOTE: the same parser instance is fed both pages, so award cells
            # found on the manga page are included alongside the author's.
            with urlopen(author_url) as response:
                parser.feed(response.read().decode('utf-8'))
            awards = parser.awards
            parser.close()
            manga['mangapedia_author_url'] = author_url
            manga['awards_of_author'] = ','.join(awards)
def uniform_diff_data(diff_data):
    """Reshape diff data into TSV column tuples, split by action.

    Every manga of every entry (visited in ascending manga ``'id'`` order per
    user) becomes one 8-tuple: user id, user login, manga id, title, author,
    mangapedia URL, mangapedia author URL, awards of author. A missing key
    yields ``''`` in its slot.

    Returns:
        ``(insert, delete)``: rows whose manga ``'action'`` equals
        ``'insert'``, and all remaining rows.
    """
    user_fields = ('id', 'login')
    manga_fields = ('id', 'title', 'author', 'mangapedia_url',
                    'mangapedia_author_url', 'awards_of_author')
    inserted = []
    deleted = []
    for entry in diff_data:
        user = entry['user']
        user_part = tuple(user.get(name, '') for name in user_fields)
        for manga in sorted(user['favorite_manga'], key=lambda m: m['id']):
            row = user_part + tuple(manga.get(name, '') for name in manga_fields)
            # Anything that is not an explicit insert counts as a delete.
            bucket = inserted if manga['action'] == 'insert' else deleted
            bucket.append(row)
    return inserted, deleted
def insert_diff(fav_manga_data, insert_data):
    """Append the rows of *insert_data* to *fav_manga_data*, skipping duplicates.

    Two rows are duplicates when columns 0 and 2 (user id and manga id in the
    TSV layout) match. Both columns are compared as strings on both sides,
    because rows read back from a TSV hold strings while freshly built diff
    rows may hold the raw JSON values (e.g. ints). Previously only the user
    id was normalised, so integer manga ids never matched.

    Args:
        fav_manga_data: list of existing rows; modified in place.
        insert_data: iterable of candidate rows; each accepted row is
            appended unchanged.
    """
    def key_of(row):
        return str(row[0]), str(row[2])

    # A set of keys gives O(1) duplicate checks and also catches duplicates
    # inside insert_data itself.
    seen = {key_of(row) for row in fav_manga_data}
    for new_data in insert_data:
        key = key_of(new_data)
        if key not in seen:
            seen.add(key)
            fav_manga_data.append(new_data)
def delete_diff(fav_manga_data, delete_data):
    """Remove from *fav_manga_data* every row matched by *delete_data*.

    A row matches when columns 0 and 2 (user id and manga id in the TSV
    layout) are equal after converting both sides to strings. Previously only
    the user id was normalised, so integer manga ids coming from JSON never
    matched the string ids read from the TSV and nothing was deleted.

    Args:
        fav_manga_data: list of rows; filtered in place.
        delete_data: iterable of rows identifying what to remove.
    """
    targets = {(str(row[0]), str(row[2])) for row in delete_data}
    if not targets:
        return
    # Slice assignment keeps the caller's list object while dropping all
    # matching rows in a single pass.
    fav_manga_data[:] = [row for row in fav_manga_data
                         if (str(row[0]), str(row[2])) not in targets]
def write_column_names_to_tsv(f, names):
    """Write the column names *names* to file *f* as a single TSV header row.

    Fields are tab-separated, the row ends with CRLF, and the csv module's
    default (minimal) quoting applies.
    """
    header_writer = csv.writer(f, lineterminator='\r\n', delimiter='\t')
    header_writer.writerow(names)
def write_manga_rows_to_tsv(f, fav_manga_rows):
    """Write already TSV-shaped manga rows to file *f*.

    Each row is emitted tab-separated, terminated by CRLF, with every field
    quoted.
    """
    csv.writer(
        f,
        delimiter='\t',
        lineterminator='\r\n',
        quoting=csv.QUOTE_ALL,
    ).writerows(fav_manga_rows)
def write_manga_data_to_tsv(f, fav_manga_data):
    """Shape manga data into TSV rows and write them to file *f*.

    One row is written per manga, users in the given order and each user's
    manga in ascending ``'id'`` order. Columns: user id, user login, manga id,
    title, author, mangapedia URL, mangapedia author URL, awards of author.
    Rows are tab-separated, CRLF-terminated and fully quoted.

    Missing keys are written as empty fields instead of raising KeyError.
    This matters for manga without ``'mangapedia_url'``, which never receive
    the author URL or award keys, and it matches how diff rows are built
    elsewhere in this file.

    Args:
        f: writable text file object.
        fav_manga_data: list of entries shaped like
            ``{'user': {'id': ..., 'login': ..., 'favorite_manga': [...]}}``.
    """
    writer = csv.writer(f, delimiter='\t', lineterminator='\r\n', quoting=csv.QUOTE_ALL)
    for data in fav_manga_data:
        user = data['user']
        for manga in sorted(user['favorite_manga'], key=lambda x: x['id']):
            writer.writerow([user.get('id', ''),
                             user.get('login', ''),
                             manga.get('id', ''),
                             manga.get('title', ''),
                             manga.get('author', ''),
                             manga.get('mangapedia_url', ''),
                             manga.get('mangapedia_author_url', ''),
                             manga.get('awards_of_author', '')])
def create_tsv_from_json(tsv_path, json_paths):
    """Create a new TSV file at *tsv_path* from the given JSON files.

    The JSON data is loaded, enriched with author URL and award information,
    and written as a header row followed by one row per manga.

    Ordinary errors are not propagated: each one is appended to the error log
    (``ERRORLOG``) with a timestamp and the function returns normally.

    Args:
        tsv_path: path of the TSV file to create (overwritten if it exists).
        json_paths: list of JSON file paths to read.
    """
    try:
        fav_manga_data = load_json_data(json_paths)
        append_information(fav_manga_data)
        # newline='' is required for csv output; otherwise the explicit
        # '\r\n' terminator is translated again on platforms whose line
        # separator is not '\n'.
        with open(tsv_path, 'w', newline='') as f:
            write_column_names_to_tsv(f, ('user_id',
                                          'user_login',
                                          'manga_id',
                                          'manga_title',
                                          'manga_author',
                                          'mangapedia_title_url',
                                          'mangapedia_author_url',
                                          'awards_of_author'))
            write_manga_data_to_tsv(f, fav_manga_data)
    except json.decoder.JSONDecodeError as e:
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\nJSONDecodeError: %s - %s\r\n' % (datetime.now(), e.args, json_paths))
    except FileNotFoundError as e:
        # The exception must be bound here: the handler formats e.args, and
        # without "as e" it failed with NameError instead of logging.
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\nFileNotFoundError: %s\r\n' % (datetime.now(), e.args))
    except Exception:
        # Catch-all for ordinary errors only, so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\n%s\r\n' % (datetime.now(), sys.exc_info()))
def update_tsv_with_json(tsv_path, new_tsv_path, json_paths):
    """Apply the JSON diff files to the TSV at *tsv_path*, writing *new_tsv_path*.

    The existing rows are read, rows marked for insertion are added (without
    duplicates), rows marked for deletion are removed, and the result is
    sorted and written to *new_tsv_path* under the original header row.

    Ordinary errors are appended to the error log (``ERRORLOG``) rather than
    propagated. In that case, if *tsv_path* exists, it is copied unchanged to
    *new_tsv_path* so that a file for this step is still produced.

    Args:
        tsv_path: path of the TSV file to start from.
        new_tsv_path: path of the TSV file to write.
        json_paths: list of JSON diff file paths.
    """
    try:
        # newline='' is what the csv module requires for both reading and
        # writing, so that line endings are handled by csv itself.
        with open(tsv_path, "r", newline='') as f:
            fav_manga_rows = list(csv.reader(f, delimiter='\t'))
        column_names = fav_manga_rows.pop(0)
        diff_data = load_json_data(json_paths)
        append_information(diff_data)
        insert, delete = uniform_diff_data(diff_data)
        insert_diff(fav_manga_rows, insert)
        delete_diff(fav_manga_rows, delete)
        # Ordering is lexicographic on the "<user id>_<manga id>" string.
        fav_manga_rows.sort(key=lambda x: '{}_{}'.format(x[0], x[2]))
        with open(new_tsv_path, 'w', newline='') as f:
            write_column_names_to_tsv(f, column_names)
            write_manga_rows_to_tsv(f, fav_manga_rows)
    except json.decoder.JSONDecodeError as e:
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\nJSONDecodeError: %s - %s\r\n' % (datetime.now(), e.args, json_paths))
    except FileNotFoundError as e:
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\nFileNotFoundError: %s\r\n' % (datetime.now(), e.args))
    except Exception:
        # Catch-all for ordinary errors only, so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\n%s\r\n' % (datetime.now(), sys.exc_info()))
    else:
        return
    if os.path.exists(tsv_path):
        # After a failed update, create the new file as an unchanged copy.
        with open(new_tsv_path, 'wb') as f1:
            with open(tsv_path, "rb") as f2:
                f1.write(f2.read())
def main():
    """Build the initial TSV, then apply the four daily diffs in order.

    Each update reads the snapshot produced by the previous step and writes
    the next dated snapshot.
    """
    snapshots = ('favorite_manga.tsv',
                 'favorite_manga_20160515.tsv',
                 'favorite_manga_20160516.tsv',
                 'favorite_manga_20160517.tsv',
                 'favorite_manga_20160518.tsv')
    diffs = ('favorite_manga_diff_20160515.json',
             'favorite_manga_diff_20160516.json',
             'favorite_manga_diff_20160517.json',
             'favorite_manga_diff_20160518.json')
    create_tsv_from_json(snapshots[0], ['favorite_manga_001.json', 'favorite_manga_002.json'])
    # Pair every snapshot with its successor and the diff leading to it.
    for previous, current, diff in zip(snapshots, snapshots[1:], diffs):
        update_tsv_with_json(previous, current, [diff])
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment