Created
October 20, 2016 21:37
-
-
Save okusama27/66d184a5e7091b1e24948af66c205106 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from html.parser import HTMLParser | |
import pprint | |
from datetime import datetime as dt | |
# Root directory of the locally saved blog; input and output directories
# are both resolved relative to it.
base_path = '../html/blog.iron-hot.com'
# Sub-directory of base_path whose files are converted one by one.
base_dir = 'p'
# Sub-directory of base_path that receives the generated Markdown files.
save_dir = 'change_md'
# Suffix removed from each page's <title> text to obtain the post title.
main_title = ' – 覚えたことを復唱してみる。だって、忘れっぽいんですもの。'
class MyHTMLParser(HTMLParser):
    """Flatten an HTML fragment into a flat list of parse events.

    After ``feed()``, ``res`` holds, in document order:

    * for every start tag: the tag name, then its attribute list
      (``[(name, value), ...]`` as delivered by ``HTMLParser``);
    * for every text node: the text itself;
    * for closing list tags only: ``'/ul'`` or ``'/ol'``.

    Other end tags are not recorded.
    """

    def __init__(self):
        super().__init__()
        self.res = []

    def handle_starttag(self, tag, attrs):
        """Record the tag name followed by its attribute list."""
        self.res.append(tag)
        self.res.append(attrs)

    def handle_data(self, data):
        """Record a text node.

        A text node equal to ``'Tweet'`` discards everything collected so
        far (presumably the label of a share button preceding the real
        content -- confirm against the source pages).
        """
        if data == 'Tweet':
            self.res.clear()
        else:
            self.res.append(data)

    def handle_endtag(self, tag):
        """Record only list end tags, as ``'/ul'`` or ``'/ol'``."""
        if tag in ('ul', 'ol'):
            self.res.append('/' + tag)
def survay_categories(cat_list):
    """Collect the link texts from a flattened parse list.

    ``cat_list`` is a list in the layout produced by ``MyHTMLParser.res``:
    a start tag name is followed by its attribute list and then, usually,
    by the text inside the tag.

    Returns a list with the element that follows each ``a`` tag's
    attribute list.  An ``a`` tag with nothing after its attributes is
    skipped instead of raising ``IndexError``, and a text node that merely
    equals ``'a'`` is not mistaken for a tag (a real tag is always followed
    by a list of attributes).
    """
    cat = []
    last_index = len(cat_list) - 1
    for idx, item in enumerate(cat_list):
        if item != 'a':
            continue
        if idx + 2 > last_index:
            # Anchor without any following text: nothing to collect.
            continue
        if not isinstance(cat_list[idx + 1], list):
            # A text node 'a', not an <a> start tag.
            continue
        cat.append(cat_list[idx + 2])
    return cat
def _image_markdown(attrs):
    """Return Markdown image syntax for an ``img`` tag's attribute list.

    ``alt`` falls back to ``'img'`` when the attribute is absent; a missing
    ``src`` raises ``KeyError``.
    """
    img_attrs = dict(attrs)
    src = img_attrs['src']
    return '![{alt}]({src})'.format(alt=img_attrs.get('alt', 'img'), src=src)


def create_write_data(options, bodies):
    """Build the Markdown lines for one blog post.

    Parameters:
        options: mapping with the keys ``'entry-date'``, ``'title'`` and
            ``'cat-links'`` (an iterable of strings).
        bodies: list of rows; each row is a flat list in the layout of
            ``MyHTMLParser.res`` (tag name, attribute list, text, ...).
            The first row is skipped.

    Returns:
        A list of strings: a heading line, a categories line, an empty
        line, then one string per remaining row.

    Raises:
        KeyError: if a required key is missing from ``options``, or an
            ``a`` / ``img`` tag lacks ``href`` / ``src``.

    Each processed row is also printed to stdout (debug trace kept from
    the original implementation).
    """
    res_text = [
        # Date and title
        '# [{date}] {title}'.format(date=options['entry-date'],
                                    title=options['title']),
        # Categories
        '_{}_'.format(', '.join(options['cat-links'])),
        # Blank line before the body
        '',
    ]

    # Body.  ul_no is the current list nesting depth (0 for a top-level
    # list); it persists across rows because lists span several rows.
    ul_no = -1
    for idx, row in enumerate(bodies):
        if idx == 0:
            continue
        row_text = ''
        flg_a = 0      # number of following columns already consumed by <a>
        flg_img = 0    # 1 while the attribute list of an <img> is pending
        flg_p = False  # row started with <p>
        print(row)
        for col_index, col in enumerate(row):
            if col in ('ul', 'ol'):
                ul_no += 1
            if col in ('/ul', '/ol'):
                ul_no -= 1

            if col == 'a':
                href = dict(row[col_index + 1])['href']
                try:
                    data = row[col_index + 2]
                except IndexError:
                    # Anchor with no content after its attributes.
                    data = ''
                if data == 'img':
                    # Linked image: render the image as the link content.
                    data = _image_markdown(row[col_index + 3])
                    flg_img = 1
                row_text += '<a href="{href}" target="_blank">{data}</a>'.format(
                    href=href, data=data)
                # Skip the anchor's attribute list and its content column.
                flg_a = 2
                continue

            if flg_img == 0 and col == 'img':
                row_text += _image_markdown(row[col_index + 1])
                flg_img = 1
                continue

            if flg_a > 0:
                flg_a -= 1
                continue
            if flg_img > 0:
                flg_img = 0
                continue

            if col_index == 0:
                if col == 'li':
                    row_text += ' '*ul_no + '- '
                    continue
                if col == 'p':
                    flg_p = True
                    continue
                if isinstance(col, str):
                    row_text += col
                    continue
            if col_index == 1 and len(col) == 0:
                # Empty attribute list of the leading tag.
                continue
            if col_index >= 2 and isinstance(col, str) and col != 'br':
                row_text += col

        if flg_p:
            # Paragraph rows get an extra newline.
            row_text += '\n'
        # Strip this literal residue (presumably leaked tag names from
        # social-button markup -- confirm against the source pages).
        res_text.append(row_text.replace('brdivg:plusone p',''))
    return res_text
def create_md(file_name, p_base_path, p_base_dir):
    """Convert one saved blog HTML page into a Markdown file.

    The page ``p_base_path/p_base_dir/file_name`` is read line by line as
    UTF-8.  Each line is parsed on its own with ``MyHTMLParser``; the title,
    entry date and category links are picked out, and every line between
    ``<section class="entry-content">`` and ``<div class="entry-links">``
    is collected as body content.  The result is written as UTF-8 to
    ``p_base_path/<save_dir>/<YYYY_MM_DD>.md`` (the directory is created
    when missing), and the output file name is printed.

    Raises:
        KeyError: if the page has no title, entry date or category links.
        ValueError: if the entry date is not in ``%Y-%m-%d`` format.

    NOTE(review): the output name is derived from the entry date alone, so
    two pages with the same date write to the same file.
    """
    flg_contents = False
    mds = {}
    body_list = []
    src_path = os.path.join(p_base_path, p_base_dir, file_name)
    with open(src_path, encoding='utf-8') as f:
        for rr in f:
            row = rr.rstrip().replace('<strong>', '**').replace('</strong>', '**')
            if 'amazon-adsystem' in row:
                # Such lines are kept as raw HTML, minus the <p> wrapper.
                amazon = row.replace('<p>', '').replace('</p>', '')
                body_list.append(['amazon', [], amazon, '\n'])
                continue

            parser = MyHTMLParser()
            parser.feed(row)
            res = parser.res
            parser.close()
            if not res:
                continue

            if res[0] in ('/ul', '/ol'):
                # NOTE(review): inside the content section this row is
                # appended again below, so it appears twice -- confirm
                # whether that is intended.
                body_list.append(res)
            if res[0] == 'title':
                mds['title'] = res[2].replace(main_title, '')
                continue

            attrs = {}
            if len(res) > 1 and isinstance(res[1], list):
                attrs = dict(res[1])
            css_class = attrs.get('class')

            if res[0] == 'span':
                if css_class == 'entry-date':
                    mds['entry-date'] = res[2]
                    continue
                if css_class == 'cat-links':
                    mds['cat-links'] = survay_categories(res)
                    continue
            if res[0] == 'section' and css_class == 'entry-content':
                flg_contents = True
            if res[0] == 'div' and css_class == 'entry-links':
                flg_contents = False
            if flg_contents:
                body_list.append(res)

    # Build the text to write
    write_text = create_write_data(mds, body_list)
    # Build the output file name
    w_file_name = dt.strptime(mds['entry-date'], '%Y-%m-%d').strftime('%Y_%m_%d') + '.md'
    print(w_file_name)
    # Write; the encoding is explicit so non-ASCII text does not depend on
    # the platform's default locale.
    out_dir = os.path.join(p_base_path, save_dir)
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, w_file_name), 'w', encoding='utf-8') as fw:
        fw.writelines(w_row + '\n' for w_row in write_text)
if __name__ == '__main__':
    # Report the input root, whether it exists, and what it contains.
    print(base_path)
    print(os.path.exists(base_path))
    print(os.listdir(base_path))

    # Convert every entry found in the page directory.
    b_dir = os.path.join(base_path, base_dir)
    for file_name in os.listdir(b_dir):
        print(file_name)
        create_md(file_name, base_path, base_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment