Skip to content

Instantly share code, notes, and snippets.

@k5trismegistus
Created September 7, 2023 02:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save k5trismegistus/064bb6e973d6a618a1a543c265794499 to your computer and use it in GitHub Desktop.
Save k5trismegistus/064bb6e973d6a618a1a543c265794499 to your computer and use it in GitHub Desktop.
A script that splits blog data exported in MT (Movable Type) format into one text file per article, for use as training data with llama-index.
import os

from bs4 import BeautifulSoup
from dateutil.parser import parse
# Blog name inserted into the first sentence of every generated text file.
# Placeholder: replace <ブログ名> ("blog name") with a quoted string before running.
BLOG_TITLE = <ブログ名>
# Path of the MT-format export file opened when the script is run directly.
# Placeholder: replace <エクスポートファイル名> ("export file name") with a quoted path.
EXPORTED = <エクスポートファイル名>
class MtArticle():
    """One blog entry read from a Movable Type (MT) export file.

    All fields start out empty and are filled in one header/body line at a
    time while the export file is being parsed.
    """

    def __init__(self):
        # Text following the ``TITLE: `` header, or None if not seen yet.
        self.title = None
        # Value parsed from the ``DATE: `` header, or None if not seen yet.
        self.date = None
        # Accumulated lines of the BODY section, or None if no body was read.
        self.body = None
        # True once a ``STATUS: Publish`` header has been seen.
        self.published = False

    def __str__(self):
        """Return the article title, or an empty string when it is unset."""
        # __str__ must return a str; handing back None raises TypeError.
        return self.title if self.title is not None else ''
class MtParser():
    """Line-oriented parser for blog exports in Movable Type (MT) format.

    Entries are separated by a line of eight dashes and the sections inside
    an entry by a line of five dashes.  Only the TITLE, STATUS and DATE
    headers and the BODY section are kept; every other section is ignored.

    Attributes:
        mode: Name of the section currently being read ('header', 'select',
            'body', 'comment', 'excerpt' or 'skip').
        contents: ``MtArticle`` objects collected so far, in file order.
    """

    _ENTRY_SEPARATOR = '--------'
    _SECTION_SEPARATOR = '-----'
    _OUTPUT_DIR = 'export'

    def __init__(self):
        self.mode = 'header'
        self.contents = []

    def parse(self, file):
        """Read every entry from *file* and append it to ``self.contents``.

        Args:
            file: An open text-mode file object; only ``readline()`` is used.
        """
        content = MtArticle()
        # Start from a known state so the parser can be reused for a second file.
        self.mode = 'header'
        for line in iter(file.readline, ''):
            # Separators must match the whole line; a prefix test would let
            # body text that merely begins with dashes end the section.
            marker = line.rstrip()
            if marker == self._ENTRY_SEPARATOR:
                self.contents.append(content)
                content = MtArticle()
                self.mode = 'header'
            elif marker == self._SECTION_SEPARATOR:
                self.mode = 'select'
            elif self.mode == 'header':
                self.__parse_header(content, line)
            elif self.mode == 'body':
                self.__parse_body(content, line)
            elif self.mode == 'select':
                self.__select_mode(line)
            # 'comment', 'excerpt' and 'skip' sections are deliberately ignored.

        # Keep a final entry that is not followed by the entry separator.
        has_data = (
            content.published
            or content.title is not None
            or content.date is not None
            or content.body is not None
        )
        if has_data:
            self.contents.append(content)

    def save_text(self, only_published=True):
        """Write each collected article to ``export/<date>_<title>.txt``.

        Each file starts with a one-sentence Japanese preamble naming the
        blog (``BLOG_TITLE``) and the article title, then a dashed rule, then
        the body with its HTML markup removed.  Articles that share a date
        and title overwrite one another.

        Args:
            only_published: When true, skip articles whose status was not
                ``Publish``.
        """
        if only_published:
            articles = [c for c in self.contents if c.published]
        else:
            articles = self.contents

        # open() does not create missing directories.
        os.makedirs(self._OUTPUT_DIR, exist_ok=True)

        for content in articles:
            # An entry may lack a body, title or date; do not crash on it.
            body_text = BeautifulSoup(content.body or '', 'html.parser').get_text()
            title = (content.title or '').strip()
            if content.date is None:
                prefix = 'undated'
            else:
                prefix = f'{content.date.year}-{content.date.month}-{content.date.day}'
            filename = prefix + '_' + title.replace('/', '') + '.txt'
            path = os.path.join(self._OUTPUT_DIR, filename)
            # Explicit UTF-8 so Japanese text is written identically on every
            # platform instead of depending on the locale's default encoding.
            with open(path, 'w', encoding='utf-8') as f:
                f.write(f'{BLOG_TITLE} というブログの筆者は、 「{title}」 という記事を書きました。\n\n')
                f.write('---------\n\n')
                f.write(body_text)

    def __select_mode(self, line):
        """Choose the section mode from the first line after a section separator."""
        if not line.strip():
            # Tolerate blank lines before the section name.
            return
        if line.startswith('BODY:'):
            self.mode = 'body'
        elif line.startswith('COMMENT:'):
            self.mode = 'comment'
        elif line.startswith('EXCERPT:'):
            self.mode = 'excerpt'
        else:
            # Unknown section: stop inspecting its lines so that content
            # beginning with e.g. "BODY:" is not mistaken for a section name.
            self.mode = 'skip'

    def __parse_header(self, content, line):
        """Copy a recognised ``KEY: value`` header line into *content*."""
        if line.startswith('TITLE: '):
            # Drop the trailing newline so it cannot end up inside file names.
            content.title = line[7:].strip()
        if line.startswith('STATUS: Publish'):
            content.published = True
        if line.startswith('DATE: '):
            content.date = parse(line[6:])

    def __parse_body(self, content, line):
        """Append one raw line (newline included) to the article body."""
        if content.body is None:
            content.body = line
        else:
            content.body += line
if __name__ == '__main__':
    # Script entry point: parse the export file, then write one text file
    # per published article.
    # The encoding is given explicitly so that reading does not depend on the
    # platform's default locale encoding.
    # NOTE(review): assumes the export file is UTF-8 — confirm for your blog service.
    with open(EXPORTED, encoding='utf-8') as f:
        mp = MtParser()
        mp.parse(f)
    # The input file is already closed here; writing needs only the parsed data.
    mp.save_text()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment