Skip to content

Instantly share code, notes, and snippets.

@okusama27
Created October 20, 2016 21:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save okusama27/66d184a5e7091b1e24948af66c205106 to your computer and use it in GitHub Desktop.
Save okusama27/66d184a5e7091b1e24948af66c205106 to your computer and use it in GitHub Desktop.
import os
from html.parser import HTMLParser
import pprint
from datetime import datetime as dt
# Root directory; the input and output directories are resolved under it.
base_path = '../html/blog.iron-hot.com'
# Subdirectory of base_path whose files are converted.
base_dir = 'p'
# Subdirectory of base_path that receives the generated .md files.
save_dir = 'change_md'
# Text removed from each page's <title> content.
main_title = ' – 覚えたことを復唱してみる。だって、忘れっぽいんですもの。'
class MyHTMLParser(HTMLParser):
    """Flatten an HTML fragment into a list of tags, attributes and text.

    After ``feed()`` the ``res`` list holds, in document order:

    * for every start tag, the tag name followed by its attribute list
      (the ``(name, value)`` tuples delivered by ``HTMLParser``);
    * every text node, as a string;
    * ``'/ul'`` or ``'/ol'`` for a closing list tag. No other closing tag
      is recorded.

    A text node that is exactly ``'Tweet'`` empties ``res``, discarding
    everything collected before it.
    """

    # Closing tags that are recorded (as '/<tag>'); all others are dropped.
    _RECORDED_END_TAGS = ('ul', 'ol')

    def __init__(self):
        super().__init__()
        self.res = []

    def handle_starttag(self, tag, attrs):
        """Record the tag name, then its attribute list."""
        self.res.extend((tag, attrs))

    def handle_data(self, data):
        """Record a text node, or reset the result on the 'Tweet' marker."""
        if data == 'Tweet':
            # Cleared in place so existing references to ``res`` see it too.
            self.res.clear()
        else:
            self.res.append(data)

    def handle_endtag(self, tag):
        """Record only the closing tags of lists."""
        if tag in self._RECORDED_END_TAGS:
            self.res.append('/' + tag)
def survay_categories(cat_list):
    """Return the link texts found in a flattened tag/attribute/text list.

    ``cat_list`` is a flat sequence in which a start tag is stored as its
    name followed by its attribute list, and text nodes are plain strings.
    For every ``'a'`` tag the item two positions later (the text right
    after the attribute list) is collected.

    Args:
        cat_list: flat list of tag names, attribute lists and text.

    Returns:
        The collected link texts, in order of appearance. Empty when
        ``cat_list`` holds no usable anchor.
    """
    categories = []
    last = len(cat_list) - 1
    for idx, item in enumerate(cat_list):
        # An anchor needs two more items: its attribute list and its text.
        if item != 'a' or idx + 2 > last:
            continue
        # A text node that is literally "a" is not followed by attributes.
        if not isinstance(cat_list[idx + 1], list):
            continue
        text = cat_list[idx + 2]
        # Only strings can be joined by the caller; skip anything else.
        if isinstance(text, str):
            categories.append(text)
    return categories
def _attrs_at(row, index):
    """Return ``row[index]`` as a dict if it is an attribute list, else None."""
    if index < len(row) and isinstance(row[index], list):
        return dict(row[index])
    return None


def _image_markdown(attrs):
    """Build Markdown image syntax from an ``img`` tag's attribute dict."""
    return '![{alt}]({src})'.format(alt=attrs.get('alt', 'img'),
                                    src=attrs.get('src', ''))


def create_write_data(options, bodies):
    """Convert parsed rows into the lines of a Markdown document.

    Args:
        options: mapping with the keys ``'entry-date'`` and ``'title'``
            (formatted into the heading) and ``'cat-links'`` (an iterable
            of strings, joined with ``', '``).
        bodies: list of rows. Each row is a flat list in which a tag is
            stored as its name followed by its attribute list and text
            nodes are plain strings. The first row is always skipped.

    Returns:
        A list of strings: heading, category line, an empty string, then
        one string per remaining row. A row that starts with ``'p'`` ends
        with an extra ``'\\n'``.

    Raises:
        KeyError: if ``options`` lacks one of the three keys.

    Conversion rules, per row item:

    * ``'ul'``/``'ol'`` and ``'/ul'``/``'/ol'`` raise and lower the list
      depth used to indent ``'li'`` bullets.
    * ``'a'`` followed by an attribute list becomes an HTML link that
      opens in a new tab. Its text is the item after the attributes, or a
      Markdown image when that item is an ``'img'`` tag. An anchor
      without ``href`` contributes its text only.
    * ``'img'`` followed by an attribute list becomes a Markdown image.
    * In first position ``'li'`` emits a bullet, ``'p'`` marks the row as
      a paragraph, and any other string is copied as-is, whether it is
      text or a tag name.
    * The item in second position is never emitted.
    * From the third position on, every string except ``'br'`` is copied.

    Each processed row is also printed to stdout.
    """
    res_text = [
        # Date and title
        '# [{date}] {title}'.format(date=options['entry-date'],
                                    title=options['title']),
        # Categories
        '_{}_'.format(', '.join(options['cat-links'])),
        # Blank line before the body
        '',
    ]

    # Body. The depth is -1 outside any list, so the first level gets
    # no indentation.
    list_depth = -1
    for idx, row in enumerate(bodies):
        if idx == 0:
            continue
        row_text = ''
        skip_after_anchor = 0   # items still to skip after an anchor
        skip_img_attrs = 0      # 1 while an image's attribute list is pending
        is_paragraph = False
        print(row)
        for col_index, col in enumerate(row):
            if col == 'ul' or col == 'ol':
                list_depth += 1
            if col == '/ul' or col == '/ol':
                list_depth -= 1

            # A tag is always followed by its attribute list; text that
            # merely reads "a" or "img" is not, and is handled as text.
            tag_attrs = None
            if col == 'a' or col == 'img':
                tag_attrs = _attrs_at(row, col_index + 1)

            if col == 'a' and tag_attrs is not None:
                data = row[col_index + 2] if col_index + 2 < len(row) else ''
                if data == 'img':
                    img_attrs = _attrs_at(row, col_index + 3)
                    if img_attrs is not None:
                        data = _image_markdown(img_attrs)
                        skip_img_attrs = 1
                href = tag_attrs.get('href')
                if href is None:
                    # Named anchor: there is nothing to link to.
                    row_text += '{}'.format(data)
                else:
                    row_text += (
                        '<a href="{href}" target="_blank">{data}</a>'
                        .format(href=href, data=data)
                    )
                # Skip the anchor's attribute list and its text item.
                skip_after_anchor = 2
            elif col == 'img' and skip_img_attrs == 0 and tag_attrs is not None:
                row_text += _image_markdown(tag_attrs)
                skip_img_attrs = 1
            else:
                if skip_after_anchor > 0:
                    skip_after_anchor -= 1
                    continue
                if skip_img_attrs > 0:
                    skip_img_attrs = 0
                    continue
                if col_index == 0:
                    if col == 'li':
                        row_text += ' ' * list_depth + '- '
                    elif col == 'p':
                        is_paragraph = True
                    elif isinstance(col, str):
                        row_text += col
                    continue
                if col_index >= 2 and isinstance(col, str) and col != 'br':
                    row_text += col
        if is_paragraph:
            row_text += '\n'
        # Drops one specific sequence of leaked tag names.
        res_text.append(row_text.replace('brdivg:plusone p',''))
    return res_text
def create_md(file_name, p_base_path, p_base_dir):
    """Convert one HTML file into a Markdown file named after its date.

    The input ``p_base_path/p_base_dir/file_name`` is read as UTF-8, one
    line at a time. Each line is parsed on its own with ``MyHTMLParser``
    and the flattened result is inspected:

    * ``title``: its text, minus ``main_title``, becomes the title;
    * ``span class="entry-date"``: its text becomes the date;
    * ``span class="cat-links"``: its link texts become the categories;
    * ``section class="entry-content"`` starts the body and
      ``div class="entry-links"`` ends it; rows in between are kept.

    Lines containing ``amazon-adsystem`` are kept verbatim (without
    ``<p>``/``</p>``) whether or not the body has started. Closing list
    tags are kept wherever they occur.

    The result is written as UTF-8 to
    ``p_base_path/save_dir/YYYY_MM_DD.md``; the directory is created when
    missing.

    Args:
        file_name: name of the HTML file inside the input directory.
        p_base_path: root directory for input and output.
        p_base_dir: input subdirectory of ``p_base_path``.

    Raises:
        KeyError: if the page yields no date, title or categories.
        ValueError: if the date text is not in ``%Y-%m-%d`` form.
    """
    flg_contents = False
    mds = dict()
    body_list = list()
    src_path = os.path.join(p_base_path, p_base_dir, file_name)
    with open(src_path, encoding='utf-8') as f:
        for rr in f:
            row = rr.rstrip().replace('<strong>', '**').replace('</strong>', '**')
            if 'amazon-adsystem' in row:
                amazon = row.replace('<p>', '').replace('</p>', '')
                body_list.append(['amazon', [], amazon, '\n'])
                continue
            parser = MyHTMLParser()
            parser.feed(row)
            res = parser.res
            # close() may still deliver buffered text; it lands in the same
            # list object that ``res`` refers to.
            parser.close()
            if len(res) < 1:
                continue
            if res[0] == '/ul' or res[0] == '/ol':
                # Kept exactly once. Without the ``continue`` the row was
                # appended a second time below while inside the body,
                # lowering the list depth twice per closing tag.
                body_list.append(res)
                continue
            if res[0] == 'title':
                # The text may be missing when it sits on a later line.
                if len(res) > 2:
                    mds['title'] = res[2].replace(main_title, '')
                continue
            attrs = dict()
            if len(res) > 1 and isinstance(res[1], list):
                attrs = dict(res[1])
            css_class = attrs.get('class')
            if res[0] == 'span':
                if css_class == 'entry-date':
                    if len(res) > 2:
                        mds['entry-date'] = res[2]
                    continue
                if css_class == 'cat-links':
                    mds['cat-links'] = survay_categories(res)
                    continue
            if res[0] == 'section' and css_class == 'entry-content':
                flg_contents = True
            if res[0] == 'div' and css_class == 'entry-links':
                flg_contents = False
            if flg_contents:
                body_list.append(res)

    # Fail with the file name instead of a bare key from deeper down.
    missing = [key for key in ('entry-date', 'title', 'cat-links')
               if key not in mds]
    if missing:
        raise KeyError('{}: no {} found in the page'.format(
            file_name, ', '.join(missing)))

    # Build the text to write.
    write_text = create_write_data(mds, body_list)
    # Build the output file name.
    # NOTE(review): the name comes from the date alone, so two pages with
    # the same date write to the same file and the later one wins.
    w_file_name = dt.strptime(mds['entry-date'], '%Y-%m-%d').strftime('%Y_%m_%d') + '.md'
    print(w_file_name)
    # Write the file, in the same encoding the input was read with.
    out_dir = os.path.join(p_base_path, save_dir)
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, w_file_name), 'w', encoding='utf-8') as fw:
        for w_row in write_text:
            fw.write(w_row)
            fw.write('\n')
if __name__ == '__main__':
    # Convert every file found in base_path/base_dir.
    b_dir = os.path.join(base_path, base_dir)
    # Diagnostics: show where the script is looking and what it sees.
    print(base_path)
    print(os.path.exists(base_path))
    print(os.listdir(base_path))
    # Sorted so runs are reproducible; listdir order is arbitrary.
    for file_name in sorted(os.listdir(b_dir)):
        # listdir also returns subdirectories, which cannot be opened.
        if not os.path.isfile(os.path.join(b_dir, file_name)):
            continue
        print(file_name)
        create_md(file_name, base_path, base_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment