Skip to content

Instantly share code, notes, and snippets.

@ommadawn46
Last active December 3, 2018 09:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ommadawn46/a8d3ee76889c82aa6d3eef63469b4ef9 to your computer and use it in GitHub Desktop.
Save ommadawn46/a8d3ee76889c82aa6d3eef63469b4ef9 to your computer and use it in GitHub Desktop.
from html.parser import HTMLParser
from urllib.request import urlopen
from datetime import datetime
import json
import csv
import sys
import re
import os
# Path of the error log file
ERRORLOG = 'error.log'
class InformationParser(HTMLParser):
    """HTML parser that extracts the author-page URL and award information.

    The attributes below are filled in while ``feed()`` consumes markup:

    * ``author_url`` -- ``href`` of the most recent ``<a rel="author">`` tag
      that carried an ``href``; ``''`` if no such tag has been seen.
    * ``awards`` -- one whitespace-normalized string for each ``<td>`` that
      has an ``itemprop`` attribute, taken from the first text node that
      follows the opening tag before any end tag.
    """

    def __init__(self):
        super().__init__()
        self.author_url = ''
        self.awards = []
        # True only between an opening <td itemprop=...> and the next text
        # node or end tag, whichever comes first.
        self.in_award = False

    def handle_starttag(self, tag, attrs):
        """Record author links and note the start of an award cell."""
        attrs = dict(attrs)
        if tag == 'a':
            if attrs.get('rel') == 'author':
                # A missing href has no key and a bare ``href`` maps to None;
                # neither is a usable URL, so keep the previous value.
                href = attrs.get('href')
                if href is not None:
                    self.author_url = href
        if tag == 'td':
            if 'itemprop' in attrs:
                self.in_award = True

    def handle_endtag(self, tag):
        """Any end tag closes a pending award cell."""
        self.in_award = False

    def handle_data(self, data):
        """Store the text of a pending award cell with whitespace collapsed."""
        if self.in_award:
            self.awards.append(re.sub(r'(\s+)', ' ', data).strip())
            self.in_award = False
def load_json_data(paths):
    """Load the JSON files listed in ``paths`` and build the manga data list.

    The top-level items of every file are concatenated into a single list,
    which is returned sorted by ``item['user']['id']``.

    Raises whatever ``open`` / ``json.load`` raise (for example
    ``FileNotFoundError`` or ``json.JSONDecodeError``), and ``KeyError`` if an
    item has no ``user.id``.
    """
    data = []
    for path in paths:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which breaks non-ASCII text on some systems.
        with open(path, 'r', encoding='utf-8') as f:
            data.extend(json.load(f))
    return sorted(data, key=lambda record: record['user']['id'])
def append_information(fav_manga_data):
    """Fetch the author-page URL and award info and add them to each manga.

    For every manga dict under ``data['user']['favorite_manga']`` that has a
    ``'mangapedia_url'`` key, the title page and then the author page are
    downloaded and parsed, and two keys are set on the manga in place:

    * ``'mangapedia_author_url'`` -- ``'https://mangapedia.com'`` plus the
      author link found on the title page.
    * ``'awards_of_author'`` -- the collected award strings joined by ``','``.

    Manga without ``'mangapedia_url'`` are left untouched. Network and
    decoding errors propagate to the caller.
    """
    for data in fav_manga_data:
        for manga in data['user']['favorite_manga']:
            if 'mangapedia_url' not in manga:
                continue
            parser = InformationParser()
            # Close each HTTP response as soon as its body has been read.
            with urlopen(manga['mangapedia_url']) as response:
                parser.feed(response.read().decode('utf-8'))
            author_url = 'https://mangapedia.com' + parser.author_url
            # The same parser instance also consumes the author page, so
            # award cells found on either page end up in ``parser.awards``.
            with urlopen(author_url) as response:
                parser.feed(response.read().decode('utf-8'))
            parser.close()
            manga['mangapedia_author_url'] = author_url
            manga['awards_of_author'] = ','.join(parser.awards)
def uniform_diff_data(diff_data):
    """Reshape diff records into TSV-column tuples, split by action.

    Every manga under ``data['user']['favorite_manga']`` (visited in order of
    its ``'id'``) becomes one 8-tuple: user id, user login, manga id, title,
    author, mangapedia URL, mangapedia author URL, awards. A missing column
    is filled with ``''``.

    Returns ``(insert, delete)``: rows whose ``manga['action']`` equals
    ``'insert'`` and all remaining rows, respectively.
    """
    user_keys = ('id', 'login')
    manga_keys = ('id', 'title', 'author', 'mangapedia_url',
                  'mangapedia_author_url', 'awards_of_author')
    insert, delete = [], []
    for data in diff_data:
        user = data['user']
        user_columns = tuple(user.get(key, '') for key in user_keys)
        for manga in sorted(user['favorite_manga'], key=lambda m: m['id']):
            row = user_columns + tuple(manga.get(key, '') for key in manga_keys)
            bucket = insert if manga['action'] == 'insert' else delete
            bucket.append(row)
    return insert, delete
def insert_diff(fav_manga_data, insert_data):
    """Append the rows of ``insert_data`` to ``fav_manga_data``, skipping duplicates.

    Two rows are duplicates when their user id (column 0) and manga id
    (column 2) match. Both ids are compared as strings, because rows read
    back from a TSV file hold strings while rows built from JSON may hold
    ints. ``fav_manga_data`` is modified in place; nothing is returned.
    """
    # Rows too short to carry both ids (e.g. blank lines) cannot collide.
    seen = {(str(row[0]), str(row[2]))
            for row in fav_manga_data if len(row) > 2}
    for new_data in insert_data:
        key = (str(new_data[0]), str(new_data[2]))
        if key in seen:
            continue
        # Remember the key so repeats inside insert_data are rejected too.
        seen.add(key)
        fav_manga_data.append(new_data)
def delete_diff(fav_manga_data, delete_data):
    """Remove from ``fav_manga_data`` every row listed in ``delete_data``.

    A row matches when its user id (column 0) and manga id (column 2) equal
    those of a delete target. Both ids are compared as strings, because rows
    read back from a TSV file hold strings while rows built from JSON may
    hold ints. ``fav_manga_data`` is modified in place; nothing is returned.
    """
    targets = {(str(target[0]), str(target[2])) for target in delete_data}
    if not targets:
        return
    # Slice assignment keeps the caller's list object while dropping matches.
    fav_manga_data[:] = [
        row for row in fav_manga_data
        if len(row) < 3 or (str(row[0]), str(row[2])) not in targets
    ]
def write_column_names_to_tsv(f, names):
    """Write the given column names to ``f`` as a single TSV header row.

    Fields are tab-separated, the row ends with ``'\\r\\n'`` and the csv
    module's default (minimal) quoting is used.
    """
    header_writer = csv.writer(f, lineterminator='\r\n', delimiter='\t')
    header_writer.writerow(names)
def write_manga_rows_to_tsv(f, fav_manga_rows):
    """Write manga rows that are already in TSV column order to ``f``.

    Each row becomes one tab-separated line ending in ``'\\r\\n'``, with
    every field quoted.
    """
    row_writer = csv.writer(
        f,
        delimiter='\t',
        lineterminator='\r\n',
        quoting=csv.QUOTE_ALL,
    )
    row_writer.writerows(fav_manga_rows)
def write_manga_data_to_tsv(f, fav_manga_data):
    """Shape the manga data into TSV rows and write them to ``f``.

    One row is written per (user, manga) pair, manga ordered by ``'id'``
    within each user. The columns are user id, user login, manga id, title,
    author, mangapedia URL, mangapedia author URL and awards; every field is
    quoted and rows end with ``'\\r\\n'``.

    A column whose key is absent is written as an empty string, matching
    ``uniform_diff_data``. This matters for manga without a
    ``'mangapedia_url'``: ``append_information`` skips them, so they carry
    neither the author URL nor the awards key.
    """
    user_keys = ('id', 'login')
    manga_keys = ('id', 'title', 'author', 'mangapedia_url',
                  'mangapedia_author_url', 'awards_of_author')
    writer = csv.writer(f, delimiter='\t', lineterminator='\r\n', quoting=csv.QUOTE_ALL)
    for data in fav_manga_data:
        user = data['user']
        user_columns = [user.get(key, '') for key in user_keys]
        for manga in sorted(user['favorite_manga'], key=lambda x: x['id']):
            writer.writerow(user_columns + [manga.get(key, '') for key in manga_keys])
def create_tsv_from_json(tsv_path, json_paths):
    """Create a new TSV file at ``tsv_path`` from the given JSON files.

    Loads and merges ``json_paths``, adds author URL and award information
    to the manga entries, then writes a header row followed by the manga
    rows.

    Errors are not propagated: any ``Exception`` raised along the way is
    appended to ``ERRORLOG`` together with a timestamp, and the function
    returns normally. ``KeyboardInterrupt`` and ``SystemExit`` pass through.
    """
    try:
        fav_manga_data = load_json_data(json_paths)
        append_information(fav_manga_data)
        # newline='' stops the text layer from translating the '\r\n'
        # terminator that the csv writer emits itself.
        with open(tsv_path, 'w', newline='') as f:
            write_column_names_to_tsv(f, ('user_id',
                                          'user_login',
                                          'manga_id',
                                          'manga_title',
                                          'manga_author',
                                          'mangapedia_title_url',
                                          'mangapedia_author_url',
                                          'awards_of_author'))
            write_manga_data_to_tsv(f, fav_manga_data)
    except json.decoder.JSONDecodeError as e:
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\nJSONDecodeError: %s - %s\r\n' % (datetime.now(), e.args, json_paths))
    except FileNotFoundError as e:
        # The exception must be bound here; without ``as e`` the handler
        # itself failed with NameError when formatting the message.
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\nFileNotFoundError: %s\r\n' % (datetime.now(), e.args))
    except Exception:
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\n%s\r\n' % (datetime.now(), sys.exc_info()))
def _tsv_row_sort_key(row):
    """Return a sort key ordering rows by (user id, manga id).

    Ids made only of decimal digits compare by numeric value (so 2 sorts
    before 10) and come before any non-numeric id, which compares as text.
    """
    def component(value):
        text = str(value)
        if text.isdecimal():
            return (0, int(text), '')
        return (1, 0, text)

    return component(row[0]), component(row[2])


def update_tsv_with_json(tsv_path, new_tsv_path, json_paths):
    """Update the TSV file at ``tsv_path`` using the diff JSON files.

    Reads the existing rows, applies the inserts and deletes described by
    ``json_paths``, sorts the rows by user id and manga id, and writes the
    result to ``new_tsv_path`` under the original header row.

    Errors are not propagated: any ``Exception`` is appended to ``ERRORLOG``
    with a timestamp. In that case, if ``tsv_path`` exists, it is copied
    unchanged to ``new_tsv_path`` so that the output file exists anyway.
    """
    normal_end = False
    try:
        # newline='' is what the csv module expects for both reading and
        # writing; it keeps line endings under the csv layer's control.
        with open(tsv_path, 'r', newline='') as f:
            # Blank lines are parsed as empty rows; they hold no ids.
            fav_manga_rows = [row for row in csv.reader(f, delimiter='\t') if row]
        column_names = fav_manga_rows.pop(0)
        diff_data = load_json_data(json_paths)
        append_information(diff_data)
        insert, delete = uniform_diff_data(diff_data)
        insert_diff(fav_manga_rows, insert)
        delete_diff(fav_manga_rows, delete)
        # A joined 'user_manga' string key would put '12_3' before '1_23'.
        fav_manga_rows.sort(key=_tsv_row_sort_key)
        with open(new_tsv_path, 'w', newline='') as f:
            write_column_names_to_tsv(f, column_names)
            write_manga_rows_to_tsv(f, fav_manga_rows)
    except json.decoder.JSONDecodeError as e:
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\nJSONDecodeError: %s - %s\r\n' % (datetime.now(), e.args, json_paths))
    except FileNotFoundError as e:
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\nFileNotFoundError: %s\r\n' % (datetime.now(), e.args))
    except Exception:
        with open(ERRORLOG, 'a') as f:
            f.write('%s\r\n%s\r\n' % (datetime.now(), sys.exc_info()))
    else:
        normal_end = True
    if not normal_end and os.path.exists(tsv_path):
        # On abnormal termination, create the new file without any changes.
        with open(new_tsv_path, 'wb') as f1:
            with open(tsv_path, "rb") as f2:
                f1.write(f2.read())
def main():
    """Build the initial TSV, then apply the four daily diffs in order.

    Each update step reads the TSV produced by the previous step and writes
    a new, dated TSV.
    """
    create_tsv_from_json('favorite_manga.tsv', ['favorite_manga_001.json', 'favorite_manga_002.json'])
    update_steps = (
        ('favorite_manga.tsv', 'favorite_manga_20160515.tsv', ['favorite_manga_diff_20160515.json']),
        ('favorite_manga_20160515.tsv', 'favorite_manga_20160516.tsv', ['favorite_manga_diff_20160516.json']),
        ('favorite_manga_20160516.tsv', 'favorite_manga_20160517.tsv', ['favorite_manga_diff_20160517.json']),
        ('favorite_manga_20160517.tsv', 'favorite_manga_20160518.tsv', ['favorite_manga_diff_20160518.json']),
    )
    for source_tsv, target_tsv, diff_paths in update_steps:
        update_tsv_with_json(source_tsv, target_tsv, diff_paths)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment