Skip to content

Instantly share code, notes, and snippets.

@suminb
Last active November 5, 2018 16:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save suminb/0c9f9961727639261009b4a5f078501c to your computer and use it in GitHub Desktop.
Save suminb/0c9f9961727639261009b4a5f078501c to your computer and use it in GitHub Desktop.
WordPress 게시물을 .rst 형식으로 변환하는 스크립트
"""Converts old blog posts (from WordPress) to .rst files."""
import os
import re
import warnings
import pypandoc
import yaml
# Directory that main() scans for the exported WordPress posts (``*.html``).
SOURCE_PATH = 'posts.bak'
# Directory that receives the generated ``.rst`` files and the per-post image
# directories; image paths written into the posts are relative to it.
TARGET_PATH = 'posts.rst'
def process_file(filename):
    """Converts a single exported post file.

    :param filename: Path of the post; the file name must contain a date
        (yyyy-mm-dd), which is used to name the output
    :return: Whatever :func:`convert` returns, i.e. a tuple of
        (reST text, target file name)
    :raises ValueError: If no date can be found in the file name
    """
    datetime = extract_datetime(filename)
    # Decode explicitly instead of relying on the locale's default encoding,
    # which varies between machines and breaks on non-ASCII post content.
    # NOTE(review): assumes the exported posts are UTF-8 encoded -- confirm.
    with open(filename, encoding='utf-8') as fin:
        return convert(fin.read(), datetime)
def convert(source, datetime):
    """Converts a document.

    The result is a reST document consisting of a title section
    (``<datetime> <title>``), the post body converted from HTML, and a
    trailing reST comment that preserves the original metadata.

    :param source: A WordPress document (front matter followed by HTML)
    :param datetime: String representation of datetime (yyyy-mm-dd)
    :return: A tuple of (reST text, target file name)
    """
    metadata_raw, html = split_document(source)
    # safe_load() only builds plain Python objects. Calling yaml.load()
    # without an explicit Loader is deprecated (PyYAML 5.1) and fails outright
    # on PyYAML 6.
    metadata = yaml.safe_load(metadata_raw)

    rst = pypandoc.convert_text(html, 'rst', format='html')
    rst = migrate_all_images(rst, datetime)

    metadata_text = yaml.dump(
        metadata, default_flow_style=False, allow_unicode=True)
    # Indent *every* metadata line so that the whole dump stays inside the
    # reST comment opened by the '.. (Metadata ...)' line below.
    metadata_block = '\n'.join(
        f'   {line}'.rstrip() for line in metadata_text.splitlines())

    # YAML may parse a title such as ``2006`` as a non-string value.
    title = str(metadata['title'])
    heading = f'{datetime} {title}'
    # The underline must be at least as wide as the rendered heading text.
    headerline = '=' * (len(datetime) + 1 + width(title))

    # Blank lines separate the title, the body and the comment block so that
    # the comment is not absorbed into the last paragraph of the body.
    document = '\n'.join([
        heading,
        headerline,
        '',
        rst.strip(),
        '',
        '.. (Metadata from the original post)',
        metadata_block,
    ])
    return document.strip(), f'{datetime}.rst'
def split_document(source):
    """Splits a document into its front matter and its body.

    The first three characters are skipped (they are expected to be the
    opening ``---`` but are not checked); everything up to the next ``---``
    is the front matter and everything after it is the body.

    :param source: Full text of the document
    :return: A tuple of (front matter, body), both stripped of surrounding
        whitespace
    :raises ValueError: If no closing ``---`` is found
    """
    delimiter = '---'
    skip = len(delimiter)
    closing = source.index(delimiter, skip)
    front_matter = source[skip:closing]
    body = source[closing + skip:]
    return front_matter.strip(), body.strip()
def width(string):
    """Returns the display width of a string.

    ASCII characters count as one column and every other character counts
    as two.
    """
    # One column per character, plus one extra for each non-ASCII character.
    extra_columns = sum(not char.isascii() for char in string)
    return len(string) + extra_columns
def extract_datetime(path):
    """Extracts datetime (yyyy-mm-dd) from a filename (or path).

    Only the last path component is searched; the first match wins.

    :param path: A file name or a path ending in one
    :return: The matched date string
    :raises ValueError: If the file name contains no date
    """
    basename = os.path.basename(path)
    found = re.search(r'\d{4}-\d{2}-\d{2}', basename)
    if found is None:
        raise ValueError(f'No date was found in {path}')
    return found.group(0)
def migrate_image(src_path, dest_dir):
    """Moves an image file into the image directory of a post.

    A warning is issued (and nothing is moved) when the source file does not
    exist; the destination path is returned either way.

    :param src_path: e.g., wp-content/uploads/2006/02/image.jpg
    :param dest_dir: e.g., posts.rst/2006-02-18
    :return: Path of the image inside ``dest_dir``
    """
    filename = os.path.basename(src_path)

    # Create the directory together with any missing parents, and tolerate
    # one that already exists (no separate existence check needed).
    os.makedirs(dest_dir, exist_ok=True)

    dest_path = os.path.join(dest_dir, filename)
    if os.path.exists(src_path):
        os.rename(src_path, dest_path)
    else:
        warnings.warn(f'{src_path} does not exist')
    return dest_path
def migrate_all_images(source, datetime):
    """Moves every uploaded image referenced by a post and rewrites the links.

    Each ``wp-content/uploads/...`` reference is passed to
    :func:`migrate_image` and replaced by the new location of the image,
    expressed relative to ``TARGET_PATH``.

    :param source: Text of the post
    :param datetime: Date of the post (yyyy-mm-dd); names the image directory
    :return: ``source`` with the image references rewritten
    """
    # Pre-processing: drop the old blog domains so absolute image URLs turn
    # into site-relative paths.
    for old_domain in (
            'http://blog...(old domain 1)',
            'http://blog...(old domain 2)',
            'http://blog...(old domain 3)'):
        source = source.replace(old_domain, '')

    # The optional preceding '/' is consumed by the match but is not part of
    # the on-disk path. The character class keeps a match inside a single
    # reference instead of running on to the last '.word' of the line.
    pattern = re.compile(r'/?(wp-content/uploads/[^\s<>"\'`]+\.\w+)')
    dest_dir = os.path.join(TARGET_PATH, datetime)
    migrated = {}

    def relocate(match):
        path = match.group(1)
        # An image referenced several times is moved only once; a second
        # attempt would find the source gone and warn spuriously.
        if path not in migrated:
            dest_path = migrate_image(path, dest_dir)
            # Resolve the path difference between a post and an image
            migrated[path] = os.path.relpath(dest_path, TARGET_PATH)
        return migrated[path]

    # A single substitution pass avoids carrying a search offset across
    # replacements that change the length of the text.
    return pattern.sub(relocate, source)
def main():
    """Converts every ``.html`` post in SOURCE_PATH into TARGET_PATH."""
    # The output directory may not exist yet on a first run.
    os.makedirs(TARGET_PATH, exist_ok=True)

    for filename in os.listdir(SOURCE_PATH):
        if not filename.endswith('.html'):
            continue
        print(f'Converting {filename}...')
        rst, target_file = process_file(os.path.join(SOURCE_PATH, filename))
        # NOTE(review): the target name is derived from the post date only,
        # so two posts from the same day would overwrite each other.
        target_path = os.path.join(TARGET_PATH, target_file)
        # Write UTF-8 explicitly; the locale default may not be able to
        # encode non-ASCII text.
        with open(target_path, 'w', encoding='utf-8') as fout:
            fout.write(rst)
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment