# k5trismegistus/mt_parse.py

## mt_parse.py
from dateutil.parser import parse
from bs4 import BeautifulSoup

# Template placeholders: replace each <...> token with a string literal before
# running the script (as written, these two lines are not valid Python).
# BLOG_TITLE is interpolated into the first line of every file save_text() writes.
BLOG_TITLE = <ブログ名>  # placeholder text reads "blog name"
# EXPORTED is the path opened by the __main__ block as the export file to parse.
EXPORTED = <エクスポートファイル名>  # placeholder text reads "export file name"

class MtArticle():
    """Plain record for one article read from the export file.

    All fields start out empty and are filled in by ``MtParser`` while it
    walks the file line by line.
    """

    def __init__(self):
        # Text following 'TITLE: ' on the header line, stored as read
        # (so it normally still carries the trailing newline).
        self.title = None
        # Value produced by dateutil's ``parse`` from the 'DATE: ' header line.
        self.date = None
        # Concatenation of every line in the BODY section; None if no body line was seen.
        self.body = None
        # Becomes True when a 'STATUS: Publish' header line is seen.
        self.published = False

    def __str__(self):
        """Return the title as stored, or '' when no title has been set.

        ``__str__`` must return a ``str``; returning the initial ``None``
        would make ``str(article)`` raise ``TypeError``.
        """
        return self.title if self.title is not None else ''


class MtParser():
    """Line-oriented parser for a blog export file (Movable Type style layout).

    The parser is a small state machine driven by ``self.mode``:

    * ``'header'``  -- ``KEY: value`` metadata lines of the current article
    * ``'select'``  -- after a ``-----`` separator, waiting for a section label
    * ``'body'``    -- lines are appended to the current article's body
    * ``'comment'`` / ``'excerpt'`` -- section content is ignored

    A line starting with ``--------`` closes the current article, which is
    then appended to ``self.contents``.
    """

    def __init__(self):
        self.mode = 'header'
        self.contents = []

    def parse(self, file):
        """Read articles from *file* and append them to ``self.contents``.

        Args:
            file: Object with a ``readline()`` method returning text lines
                and an empty string at end of input (e.g. an open text file).
        """
        # Always start from a clean state, even if an earlier call stopped
        # in the middle of a section.
        self.mode = 'header'
        content = MtArticle()
        line = file.readline()
        while line:
            # The 8-dash check must come first: such a line also starts with 5 dashes.
            if line.startswith('--------'):
                self.contents.append(content)
                content = MtArticle()
                self.mode = 'header'
            elif line.startswith('-----'):
                self.mode = 'select'
            elif self.mode == 'header':
                self.__parse_header(content, line)
            elif self.mode == 'body':
                self.__parse_body(content, line)
            elif self.mode == 'select':
                self.__select_mode(line)
            # 'comment' and 'excerpt' lines are intentionally skipped.
            line = file.readline()

        # Keep a final article that was not followed by a '--------' line
        # instead of silently dropping it.
        if (content.title is not None or content.date is not None
                or content.body is not None or content.published):
            self.contents.append(content)

    def save_text(self, only_published=True):
        """Write each article as a plain-text file under ``export/``.

        The HTML body is reduced to text with BeautifulSoup. Files are named
        ``<year>-<month>-<day>_<title>.txt`` with ``/`` removed from the title.

        Args:
            only_published: When True (default), skip articles whose
                ``published`` flag is not set.
        """
        import os  # local import: only this method needs it

        if only_published:
            to_save_contents = [c for c in self.contents if c.published]
        else:
            to_save_contents = self.contents

        if to_save_contents:
            # The output directory may not exist yet on a first run.
            os.makedirs('export', exist_ok=True)

        for content in to_save_contents:
            # An article without a BODY section has body None; treat it as empty.
            body_soup = BeautifulSoup(content.body or '', 'html.parser')
            body_text = body_soup.get_text()

            # The stored title still ends with the newline read from the file;
            # strip it so the newline does not end up inside the file name.
            title = (content.title or '').strip()

            if content.date is None:
                prefix = 'undated'
            else:
                prefix = f'{content.date.year}-{content.date.month}-{content.date.day}'

            path = 'export/' + prefix + '_' + title.replace('/', '') + '.txt'
            # Explicit UTF-8 so non-ASCII text is written the same way on every platform.
            with open(path, 'w', encoding='utf-8') as f:
                f.write(f'{BLOG_TITLE} というブログの筆者は、 「{title}」 という記事を書きました。\n\n')
                f.write('---------\n\n')
                f.write(body_text)

    def __select_mode(self, line):
        """Switch ``self.mode`` when *line* is a recognised section label.

        Unrecognised lines leave the mode unchanged.
        """
        if line.startswith('BODY:'):
            self.mode = 'body'
        if line.startswith('COMMENT:'):
            self.mode = 'comment'
        if line.startswith('EXCERPT:'):
            self.mode = 'excerpt'

    def __parse_header(self, content, line):
        """Copy a recognised header *line* (TITLE, STATUS, DATE) into *content*."""
        if line.startswith('TITLE: '):
            # Stored as read, including the trailing newline.
            content.title = line[7:]
        if line.startswith('STATUS: Publish'):
            content.published = True
        if line.startswith('DATE: '):
            content.date = parse(line[6:])

    def __parse_body(self, content, line):
        """Append *line* to the body of *content*, starting it if still empty."""
        if content.body is None:
            content.body = line
        else:
            content.body += line

if __name__ == '__main__':
    # Script entry point: parse the configured export file, then write the
    # published articles out as text files while the input is still open.
    with open(EXPORTED) as export_file:
        exporter = MtParser()
        exporter.parse(export_file)
        exporter.save_text()
	from dateutil.parser import parse
	from bs4 import BeautifulSoup

	# Template placeholders: replace each <...> token with a string literal before
	# running the script (as written, these two lines are not valid Python).
	# BLOG_TITLE is interpolated into the first line of every file save_text() writes.
	BLOG_TITLE = <ブログ名>  # placeholder text reads "blog name"
	# EXPORTED is the path opened by the __main__ block as the export file to parse.
	EXPORTED = <エクスポートファイル名>  # placeholder text reads "export file name"

	class MtArticle():
	    """Plain record for one article read from the export file.

	    All fields start out empty and are filled in by ``MtParser`` while it
	    walks the file line by line.
	    """

	    def __init__(self):
	        # Text following 'TITLE: ' on the header line, stored as read
	        # (so it normally still carries the trailing newline).
	        self.title = None
	        # Value produced by dateutil's ``parse`` from the 'DATE: ' header line.
	        self.date = None
	        # Concatenation of every line in the BODY section; None if no body line was seen.
	        self.body = None
	        # Becomes True when a 'STATUS: Publish' header line is seen.
	        self.published = False

	    def __str__(self):
	        """Return the title as stored, or '' when no title has been set.

	        ``__str__`` must return a ``str``; returning the initial ``None``
	        would make ``str(article)`` raise ``TypeError``.
	        """
	        return self.title if self.title is not None else ''


	class MtParser():
	    """Line-oriented parser for a blog export file (Movable Type style layout).

	    The parser is a small state machine driven by ``self.mode``:

	    * ``'header'``  -- ``KEY: value`` metadata lines of the current article
	    * ``'select'``  -- after a ``-----`` separator, waiting for a section label
	    * ``'body'``    -- lines are appended to the current article's body
	    * ``'comment'`` / ``'excerpt'`` -- section content is ignored

	    A line starting with ``--------`` closes the current article, which is
	    then appended to ``self.contents``.
	    """

	    def __init__(self):
	        self.mode = 'header'
	        self.contents = []

	    def parse(self, file):
	        """Read articles from *file* and append them to ``self.contents``.

	        Args:
	            file: Object with a ``readline()`` method returning text lines
	                and an empty string at end of input (e.g. an open text file).
	        """
	        # Always start from a clean state, even if an earlier call stopped
	        # in the middle of a section.
	        self.mode = 'header'
	        content = MtArticle()
	        line = file.readline()
	        while line:
	            # The 8-dash check must come first: such a line also starts with 5 dashes.
	            if line.startswith('--------'):
	                self.contents.append(content)
	                content = MtArticle()
	                self.mode = 'header'
	            elif line.startswith('-----'):
	                self.mode = 'select'
	            elif self.mode == 'header':
	                self.__parse_header(content, line)
	            elif self.mode == 'body':
	                self.__parse_body(content, line)
	            elif self.mode == 'select':
	                self.__select_mode(line)
	            # 'comment' and 'excerpt' lines are intentionally skipped.
	            line = file.readline()

	        # Keep a final article that was not followed by a '--------' line
	        # instead of silently dropping it.
	        if (content.title is not None or content.date is not None
	                or content.body is not None or content.published):
	            self.contents.append(content)

	    def save_text(self, only_published=True):
	        """Write each article as a plain-text file under ``export/``.

	        The HTML body is reduced to text with BeautifulSoup. Files are named
	        ``<year>-<month>-<day>_<title>.txt`` with ``/`` removed from the title.

	        Args:
	            only_published: When True (default), skip articles whose
	                ``published`` flag is not set.
	        """
	        import os  # local import: only this method needs it

	        if only_published:
	            to_save_contents = [c for c in self.contents if c.published]
	        else:
	            to_save_contents = self.contents

	        if to_save_contents:
	            # The output directory may not exist yet on a first run.
	            os.makedirs('export', exist_ok=True)

	        for content in to_save_contents:
	            # An article without a BODY section has body None; treat it as empty.
	            body_soup = BeautifulSoup(content.body or '', 'html.parser')
	            body_text = body_soup.get_text()

	            # The stored title still ends with the newline read from the file;
	            # strip it so the newline does not end up inside the file name.
	            title = (content.title or '').strip()

	            if content.date is None:
	                prefix = 'undated'
	            else:
	                prefix = f'{content.date.year}-{content.date.month}-{content.date.day}'

	            path = 'export/' + prefix + '_' + title.replace('/', '') + '.txt'
	            # Explicit UTF-8 so non-ASCII text is written the same way on every platform.
	            with open(path, 'w', encoding='utf-8') as f:
	                f.write(f'{BLOG_TITLE} というブログの筆者は、「{title}」という記事を書きました。\n\n')
	                f.write('---------\n\n')
	                f.write(body_text)

	    def __select_mode(self, line):
	        """Switch ``self.mode`` when *line* is a recognised section label.

	        Unrecognised lines leave the mode unchanged.
	        """
	        if line.startswith('BODY:'):
	            self.mode = 'body'
	        if line.startswith('COMMENT:'):
	            self.mode = 'comment'
	        if line.startswith('EXCERPT:'):
	            self.mode = 'excerpt'

	    def __parse_header(self, content, line):
	        """Copy a recognised header *line* (TITLE, STATUS, DATE) into *content*."""
	        if line.startswith('TITLE: '):
	            # Stored as read, including the trailing newline.
	            content.title = line[7:]
	        if line.startswith('STATUS: Publish'):
	            content.published = True
	        if line.startswith('DATE: '):
	            content.date = parse(line[6:])

	    def __parse_body(self, content, line):
	        """Append *line* to the body of *content*, starting it if still empty."""
	        if content.body is None:
	            content.body = line
	        else:
	            content.body += line

	if __name__ == '__main__':
	    # Script entry point: parse the configured export file, then write the
	    # published articles out as text files while the input is still open.
	    with open(EXPORTED) as f:
	        mp = MtParser()
	        mp.parse(f)
	        mp.save_text()