okusama27/change_md.py

## change_md.py
import os
from html.parser import HTMLParser
import pprint
from datetime import datetime as dt

# Root directory of the saved blog; input and output paths are joined onto it.
base_path = '../html/blog.iron-hot.com'
# Sub-directory of base_path whose files are listed and converted.
base_dir = 'p'
# Sub-directory (joined onto the base path in create_md) that receives the .md files.
save_dir = 'change_md'
# Text removed from each page's <title> content to obtain the post title.
main_title = ' – 覚えたことを復唱してみる。だって、忘れっぽいんですもの。'


class MyHTMLParser(HTMLParser):
    """Flatten fed HTML into one list of tag names, attribute lists and text.

    Every start tag contributes two consecutive entries to ``res``: the tag
    name and the ``attrs`` list handed over by ``HTMLParser``.  Text adds a
    single entry.  End tags are dropped, except ``ul``/``ol`` which are
    recorded as ``'/ul'``/``'/ol'``.
    """

    def __init__(self):
        super().__init__()
        self.res = []

    def handle_starttag(self, tag, attrs):
        """Record the tag name immediately followed by its attribute list."""
        self.res.extend((tag, attrs))

    def handle_data(self, data):
        """Record a text node; the text 'Tweet' instead empties ``res``."""
        if data != 'Tweet':
            self.res.append(data)
            return
        # Everything collected before the 'Tweet' text is thrown away.
        self.res.clear()

    def handle_endtag(self, tag):
        """Record list-closing tags only, prefixed with '/'."""
        if tag in ('ul', 'ol'):
            self.res.append('/' + tag)


def survay_categories(cat_list):
    """Collect the link texts of all anchors in a flattened parser result.

    ``cat_list`` is the flat list built by ``MyHTMLParser``: a tag name is
    always followed by its attribute list, and the anchor's text (if any)
    comes right after that.

    Args:
        cat_list: Flat list of tag names (str), attribute lists (list) and
            text (str).

    Returns:
        A list with the entry found two positions after each ``'a'`` tag.
        An ``'a'`` entry that is not followed by an attribute list and a
        string entry is skipped; this covers text that merely equals ``'a'``
        and anchors at the very end of the list, both of which previously
        raised ``IndexError``.
    """
    categories = []
    for idx, item in enumerate(cat_list):
        if item != 'a':
            continue
        following = cat_list[idx + 1:idx + 3]
        # A real <a> tag is followed by [attrs list, text]; anything else is
        # either plain text equal to 'a' or an anchor without text.
        if (len(following) == 2
                and isinstance(following[0], list)
                and isinstance(following[1], str)):
            categories.append(following[1])
    return categories


def _tag_attrs(row, index):
    """Return the attributes of the tag at ``row[index]`` as a dict.

    In the flattened parser output a tag name is immediately followed by its
    attribute list.  If the next entry is missing or is not a list, the entry
    at ``index`` is plain text and ``None`` is returned.
    """
    nxt = index + 1
    if nxt < len(row) and isinstance(row[nxt], list):
        return dict(row[nxt])
    return None


def _image_markdown(attrs):
    """Render an ``<img>`` attribute dict as Markdown image syntax."""
    return '![{alt}]({src})'.format(alt=attrs.get('alt', 'img'),
                                    src=attrs.get('src', ''))


def create_write_data(options, bodies):
    """Build the Markdown lines for one post.

    Args:
        options: Mapping with the keys ``'entry-date'``, ``'title'`` and
            ``'cat-links'`` (an iterable of category names).
        bodies: Sequence of flattened rows (lists of tag names, attribute
            lists and text).  The first row is skipped.

    Returns:
        A list of strings: a heading line, an italic category line, an empty
        line, then one string per body row.

    Raises:
        KeyError: If ``options`` lacks one of the three keys.

    Notes:
        * Every processed row is printed to stdout, as before.
        * Anchors are emitted as raw ``<a ... target="_blank">`` HTML, images
          as Markdown.  A missing ``href``/``src`` now yields an empty value
          instead of raising ``KeyError``.
        * ``'a'``/``'img'`` entries that are not followed by an attribute
          list are treated as ordinary text rather than as tags.
        * Any other string, including unhandled tag names, is copied to the
          output unchanged (``'br'`` is dropped from position 2 onward).
    """
    res_text = [
        # Date and title
        '# [{date}] {title}'.format(date=options['entry-date'],
                                    title=options['title']),
        # Categories
        '_{}_'.format(', '.join(options['cat-links'])),
        # Blank separator line
        '',
    ]

    # Body.  ul_no is the list nesting depth; -1 means "outside any list",
    # so the first level of <li> gets no indentation.
    ul_no = -1
    for idx, row in enumerate(bodies):
        if idx == 0:
            continue
        row_text = ''
        flg_a = 0      # entries still to skip after an anchor (attrs + text)
        flg_img = 0    # 1 while an image's attribute list is still to skip
        flg_p = False  # row starts with <p>: terminate it with a newline
        print(row)
        for col_index, col in enumerate(row):
            if col in ('ul', 'ol'):
                ul_no += 1
            elif col in ('/ul', '/ol'):
                ul_no -= 1

            attrs = _tag_attrs(row, col_index) if col in ('a', 'img') else None

            if col == 'a' and attrs is not None:
                href = attrs.get('href', '')
                has_data = col_index + 2 < len(row)
                data = row[col_index + 2] if has_data else ''
                if data == 'img':
                    img_attrs = _tag_attrs(row, col_index + 2)
                    if img_attrs is not None:
                        # Linked image: embed the Markdown image as link text.
                        data = _image_markdown(img_attrs)
                        flg_img = 1
                row_text += '<a href="{href}" target="_blank">{data}</a>'.format(
                    href=href, data=data)
                flg_a = 2
                continue

            if col == 'img' and flg_img == 0 and attrs is not None:
                row_text += _image_markdown(attrs)
                flg_img = 1
                continue

            # Entries already consumed by the anchor / image handling above.
            if flg_a > 0:
                flg_a -= 1
                continue
            if flg_img > 0:
                flg_img = 0
                continue

            if not isinstance(col, str):
                continue
            if col_index == 0:
                if col == 'li':
                    row_text += '  ' * ul_no + '- '
                elif col == 'p':
                    flg_p = True
                else:
                    row_text += col
            elif col_index >= 2 and col != 'br':
                row_text += col

        if flg_p:
            row_text += '\n'
        # Strip one known run of leftover tag names from the generated text.
        res_text.append(row_text.replace('brdivg:plusone p', ''))

    return res_text


def create_md(file_name, p_base_path, p_base_dir):
    """Convert one saved blog page to Markdown and write it under ``save_dir``.

    The HTML file is processed one physical line at a time.  Each line is fed
    to a fresh ``MyHTMLParser``; the first entry of the flattened result
    decides what the line is used for (title, entry date, categories, start
    or end of the post body).  Lines seen while inside the post body are
    collected and handed to ``create_write_data``.

    Args:
        file_name: Name of the HTML file inside ``p_base_path/p_base_dir``.
        p_base_path: Root directory; the output directory ``save_dir`` is
            created below it if it does not exist yet.
        p_base_dir: Sub-directory of ``p_base_path`` that holds the file.

    Raises:
        KeyError: If no title, entry date or category line was found.
        ValueError: If the entry date is not in ``YYYY-MM-DD`` form.
    """
    flg_contents = False
    mds = {}
    body_list = []
    src_path = os.path.join(p_base_path, p_base_dir, file_name)
    with open(src_path, encoding='utf-8') as f:
        for raw_line in f:
            # <strong> becomes Markdown bold before the line is parsed.
            row = raw_line.rstrip().replace('<strong>', '**').replace('</strong>', '**')
            if 'amazon-adsystem' in row:
                # Amazon widget lines are kept as raw HTML, minus the <p> wrapper.
                amazon = row.replace('<p>', '').replace('</p>', '')
                body_list.append(['amazon', [], amazon, '\n'])
                continue

            parser = MyHTMLParser()
            parser.feed(row)
            parser.close()
            res = parser.res
            if not res:
                continue

            # NOTE(review): a closing list tag is appended here regardless of
            # flg_contents, and appended a second time below when inside the
            # post body.  Left unchanged; confirm whether this is intended.
            if res[0] in ('/ul', '/ol'):
                body_list.append(res)

            if res[0] == 'title':
                mds['title'] = res[2].replace(main_title, '')
                continue

            has_attrs = len(res) > 1 and isinstance(res[1], list)
            attrs = dict(res[1]) if has_attrs else {}
            css_class = attrs.get('class')

            if res[0] == 'span':
                if css_class == 'entry-date':
                    mds['entry-date'] = res[2]
                    continue
                if css_class == 'cat-links':
                    mds['cat-links'] = survay_categories(res)
                    continue

            # <section class="entry-content"> opens the post body and
            # <div class="entry-links"> closes it.
            if res[0] == 'section' and css_class == 'entry-content':
                flg_contents = True
            if res[0] == 'div' and css_class == 'entry-links':
                flg_contents = False

            if flg_contents:
                body_list.append(res)

    # Build the text to write.
    write_text = create_write_data(mds, body_list)

    # Build the output file name from the entry date.
    # NOTE(review): the name depends on the date only and the file is opened
    # in 'w' mode, so two posts with the same date overwrite each other.
    w_file_name = dt.strptime(mds['entry-date'], '%Y-%m-%d').strftime('%Y_%m_%d') + '.md'
    print(w_file_name)

    # Write the file.  The input is read as UTF-8, so the output is written
    # as UTF-8 too instead of relying on the platform default encoding.
    out_dir = os.path.join(p_base_path, save_dir)
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, w_file_name), 'w', encoding='utf-8') as fw:
        fw.writelines(w_row + '\n' for w_row in write_text)

if __name__ == '__main__':
    # Show the input root, whether it exists, and what it contains.
    print(base_path)
    print(os.path.exists(base_path))
    print(os.listdir(base_path))

    # Convert every entry listed in the post directory, echoing each name.
    b_dir = os.path.join(base_path, base_dir)
    for file_name in os.listdir(b_dir):
        print(file_name)
        create_md(file_name, base_path, base_dir)
	import os
	from html.parser import HTMLParser
	import pprint
	from datetime import datetime as dt

	# NOTE(review): these four assignments repeat the constants defined near the
	# top of this file, here with a leading tab on every line.
	base_path = '../html/blog.iron-hot.com'
	base_dir = 'p'
	save_dir = 'change_md'
	main_title = ' – 覚えたことを復唱してみる。だって、忘れっぽいんですもの。'


	# NOTE(review): second copy of MyHTMLParser with its indentation flattened to
	# one tab per line, so the class/method nesting is not expressed here.  An
	# indented definition of the same class appears earlier in this file.
	class MyHTMLParser(HTMLParser):

	def __init__(self):
	HTMLParser.__init__(self)
	# Flat result list filled by the handler methods.
	self.res = list()

	def handle_starttag(self, tag, attrs):
	# A start tag adds two entries: its name, then its attribute list.
	self.res.append(tag)
	self.res.append(attrs)

	def handle_data(self, data):
	# The text 'Tweet' empties the result list; any other text is appended.
	if data == 'Tweet':
	self.res.clear()
	else:
	self.res.append(data)

	def handle_endtag(self, tag):

	# Only closing list tags are recorded, prefixed with '/'.
	if tag == 'ul' or tag == 'ol':
	# print('/' + tag)
	self.res.append('/' + tag)


	# NOTE(review): second copy of survay_categories with flattened indentation;
	# an indented definition of the same function appears earlier in this file.
	def survay_categories(cat_list):
	cat = list()
	for idx, item in enumerate(cat_list):
	# print(item)
	# For an 'a' entry, the element two positions later is collected.
	if item == 'a':
	cat.append(cat_list[idx + 2])
	return cat


	# NOTE(review): second copy of create_write_data with flattened indentation;
	# the nesting of the statements below cannot be read from this copy.  An
	# indented definition of the same function appears earlier in this file.
	def create_write_data(options, bodies):
	res_text = list()
	# Date and title
	line1 = '# [{date}] {title}'.format(date=options['entry-date'],
	title=options['title'])
	res_text.append(line1)
	# Categories
	line2 = '_{}_'.format(', '.join(options['cat-links']))
	res_text.append(line2)
	# Blank line
	res_text.append('')

	# Body
	ul_no = -1
	# pprint.pprint(bodies)
	for idx, row in enumerate(bodies):
	if idx == 0:
	continue
	# print(row)
	row_text = ''
	flg_a = 0
	flg_img = 0
	flg_p = False
	print(row)
	for col_index, col in enumerate(row):
	if col == 'ul' or col == 'ol':
	ul_no += 1
	if col == '/ul' or col == '/ol':
	ul_no -= 1
	if col == 'a':
	col_attrs = dict(row[col_index+1])
	href = col_attrs['href']
	try:
	data = row[col_index+2]
	except:
	data = ''
	if data == 'img':
	col_attrs = dict(row[col_index + 3])
	src = col_attrs['src']
	if 'alt' in col_attrs:
	alt = col_attrs['alt']
	else:
	alt = 'img'
	data = '![{alt}]({src})'.format(alt=alt, src=src)
	flg_img = 1
	a_txt = '<a href="{href}" target="_blank">{data}</a>'.format(href=href,
	data=data)
	row_text += a_txt
	flg_a = 2
	elif flg_img == 0 and col == 'img':
	col_attrs = dict(row[col_index + 1])
	src = col_attrs['src']
	if 'alt' in col_attrs:
	alt = col_attrs['alt']
	else:
	alt='img'
	img_txt = '![{alt}]({src})'.format(alt=alt, src=src)
	row_text += img_txt
	flg_img = 1

	else:
	if flg_a > 0:
	flg_a -= 1
	continue
	if flg_img > 0:
	flg_img = 0
	continue
	if col_index == 0:
	if col == 'li':
	# print(ul_no)
	# NOTE(review): the list indent here is a single space per level, whereas
	# the earlier copy of this function uses two spaces.
	row_text += ' '*ul_no + '- '
	continue
	if col == 'p':
	flg_p = True
	continue
	if type(col) == str:
	row_text += col
	continue
	if col_index == 1:
	if len(col) == 0:
	continue
	else:
	pass
	if col_index >= 2 and type(col) == str:
	if col != 'br':
	row_text += col
	if flg_p:
	row_text += '\n'
	flg_p = False
	# The literal 'brdivg:plusone p' is removed from the row text before storing it.
	res_text.append(row_text.replace('brdivg:plusone p',''))


	# pprint.pprint(res_text)
	return res_text


	# NOTE(review): second copy of create_md with flattened indentation; the
	# nesting of the statements below cannot be read from this copy.  An indented
	# definition of the same function appears earlier in this file.
	def create_md(file_name, p_base_path, p_base_dir):
	flg_contents = False
	mds = dict()
	body_list = list()
	with open(os.path.join(p_base_path, p_base_dir, file_name), encoding='utf-8') as f:
	for rr in f:
	# print(rr)
	# NOTE(review): <strong> tags are removed here, whereas the earlier copy of
	# this function replaces them with '**'.
	row = rr.rstrip().replace('<strong>', '').replace('</strong>', '')
	# print(row)
	if 'amazon-adsystem' in row:
	amazon = row.replace('<p>', '').replace('</p>', '')
	body_list.append(['amazon', [], amazon, '\n'])
	continue
	parser = MyHTMLParser()
	parser.feed(row)
	res = parser.res
	parser.close()
	attrs = dict()
	# print(res)
	if len(res) < 1:
	continue
	if res[0] == '/ul' or res[0] == '/ol':
	body_list.append(res)

	if res[0] == 'title':
	mds['title'] = res[2].replace(main_title, '')
	continue
	if len(res) > 1 and type(res[1]) == list:
	attrs = dict(res[1])
	if res[0] == 'span':
	if 'class' in attrs.keys():
	if attrs['class'] == 'entry-date':
	mds['entry-date'] = res[2]
	continue
	if attrs['class'] == 'cat-links':
	mds['cat-links'] = survay_categories(res)
	continue
	if res[0] == 'section' and 'class' in attrs.keys():
	if attrs['class'] == 'entry-content':
	flg_contents = True
	if res[0] == 'div' and 'class' in attrs.keys():
	if attrs['class'] == 'entry-links':
	flg_contents = False

	# if flg_contents == False:
	# continue

	# print(res)

	if flg_contents:
	# print(res)
	body_list.append(res)

	# Build the text to write
	# pprint.pprint(body_list)
	write_text = create_write_data(mds, body_list)

	# Build the output file name

	w_file_name = dt.strptime(mds['entry-date'], '%Y-%m-%d').strftime('%Y_%m_%d') + '.md'
	print(w_file_name)

	# Write the file
	with open(os.path.join(p_base_path, save_dir, w_file_name), 'w') as fw:
	for w_row in write_text:
	fw.write(w_row)
	fw.write('\n')

	# NOTE(review): second copy of the script entry point with flattened
	# indentation; an indented version appears earlier in this file.
	if __name__=='__main__':
	# parser = MyHTMLParser()

	# file_name = '228.html'
	b_dir = os.path.join(base_path, base_dir)
	print(base_path)
	print(os.path.exists(base_path))
	print(os.listdir(base_path))
	# b_dir = '../html/'

	# Each name listed in b_dir is printed and passed to create_md.
	for file_name in os.listdir(b_dir):
	print(file_name)
	create_md(file_name, base_path, base_dir)
	# create_md(file_name, b_dir)