# suminb/convert.py

## convert.py
"""Converts old blog posts (from WordPress) to .rst files."""
import os
import re
import warnings

import pypandoc
import yaml


# Directory that main() scans for exported posts (files ending in .html)
SOURCE_PATH = 'posts.bak'
# Directory that receives the converted .rst files; migrated images are
# placed in per-post subdirectories of it
TARGET_PATH = 'posts.rst'


def process_file(filename):
    """Convert a single exported post.

    :param filename: Path of the post; the file name must contain the
        post date (yyyy-mm-dd)
    :returns: Whatever ``convert`` returns for the file's contents, i.e.
        a ``(rst_text, target_filename)`` tuple
    :raises ValueError: If no date is found in the file name
    """
    datetime = extract_datetime(filename)
    # Posts can contain non-ASCII text, so do not depend on the locale's
    # default encoding. NOTE(review): assumes the export is UTF-8 encoded.
    with open(filename, encoding='utf-8') as fin:
        return convert(fin.read(), datetime)


def convert(source, datetime):
    """Converts a document.

    :param source: A WordPress document (YAML front matter followed by
        the HTML body)
    :param datetime: String representation of datetime (yyyy-mm-dd)
    :returns: A ``(text, filename)`` tuple: the reStructuredText document
        and the name of the file it belongs in (``<datetime>.rst``)
    :raises KeyError: If the front matter has no ``title`` entry
    """
    metadata_raw, html = split_document(source)

    # Calling yaml.load() without a Loader is deprecated since PyYAML 5.1
    # and a TypeError on PyYAML 6+. safe_load only builds plain Python
    # data, which is presumably all a post's front matter contains.
    metadata = yaml.safe_load(metadata_raw)
    rst = pypandoc.convert_text(html, 'rst', format='html')

    rst = migrate_all_images(rst, datetime)

    metadata_text = yaml.dump(
        metadata, default_flow_style=False, allow_unicode=True)
    # Indent continuation lines so the whole dump stays inside the
    # rst comment block emitted below
    metadata_text = metadata_text.replace('\n', '\n   ')

    # YAML may parse a title such as "2006" as a number; width() needs text
    title = str(metadata['title'])
    # The underline must be at least as wide as the "<date> <title>" line
    headerline = '=' * (len(datetime) + 1 + width(title))

    document = '\n'.join([
        f'{datetime} {title}',
        headerline,
        '',
        rst,
        '',
        '.. (Metadata from the original post)',
        f'   {metadata_text}',
    ])
    return document.strip(), f'{datetime}.rst'


def split_document(source):
    """Split a post into its front matter and its body.

    The document is expected to open with a ``---`` delimiter, followed by
    the metadata, a second ``---`` and then the body.

    :param source: Full text of the post
    :returns: ``(metadata_text, body_text)``, both stripped of surrounding
        whitespace
    :raises ValueError: If the opening or the closing delimiter is missing
    """
    delimiter = '---'
    # Tolerate a byte-order mark or blank lines ahead of the front matter
    text = source.lstrip('\ufeff \t\r\n')
    if not text.startswith(delimiter):
        # Blindly dropping the first three characters would silently
        # corrupt the metadata, so fail loudly instead
        raise ValueError('Document does not start with a --- delimiter')

    start = len(delimiter)
    try:
        end = text.index(delimiter, start)
    except ValueError:
        raise ValueError(
            'Front matter is not terminated by a --- delimiter') from None

    return text[start:end].strip(), text[end + len(delimiter):].strip()


def width(string):
    """Estimate how many columns a string occupies.

    Every ASCII character counts as one column and every other character
    as two (presumably to allow for wide East Asian glyphs).
    """
    columns = 0
    for char in string:
        columns += 1 if char.isascii() else 2
    return columns


def extract_datetime(path):
    """Return the first yyyy-mm-dd date in the last component of a path.

    Only the file name is searched; dates in parent directories are
    ignored.

    :raises ValueError: If the file name contains no such date
    """
    basename = os.path.basename(path)
    found = re.search(r'\d{4}-\d{2}-\d{2}', basename)

    if found is None:
        raise ValueError(f'No date was found in {path}')
    return found.group(0)


def migrate_image(src_path, dest_dir):
    """Move an image file into a post's image directory.

    :param src_path: e.g., wp-content/uploads/2006/02/image.jpg
    :param dest_dir: e.g., posts.rst/2006-02-18
    :returns: Path of the image inside ``dest_dir``. It is returned even
        when ``src_path`` is missing; in that case nothing is moved
    """
    dest_path = os.path.join(dest_dir, os.path.basename(src_path))

    # Creates missing parent directories as well, and does not fail when
    # the directory already exists
    os.makedirs(dest_dir, exist_ok=True)

    if os.path.exists(src_path):
        os.rename(src_path, dest_path)
    elif not os.path.exists(dest_path):
        # Stay quiet when the image is already in place (e.g. on a re-run)
        warnings.warn(f'{src_path} does not exist')

    return dest_path


def migrate_all_images(source, datetime):
    """Move every referenced WordPress upload and rewrite its reference.

    :param source: Document text that may reference files under
        ``wp-content/uploads/``
    :param datetime: Post date (yyyy-mm-dd); names the image directory
        below ``TARGET_PATH``
    :returns: ``source`` with the old domain prefixes removed and every
        upload reference (with or without a leading '/') replaced by the
        image's new path relative to ``TARGET_PATH``
    """
    # Pre-processing: drop the old domains so that absolute URLs turn into
    # site-relative paths
    for old_domain in (
            'http://blog...(old domain 1)',
            'http://blog...(old domain 2)',
            'http://blog...(old domain 3)'):
        source = source.replace(old_domain, '')

    # One reference per match: stop at whitespace, quotes, backticks and
    # angle brackets rather than running greedily to the last ".ext" on
    # the line. The optional leading '/' is consumed so it gets replaced.
    pattern = re.compile(r'/?wp-content/uploads/[^\s<>"\'`]+\.\w+')
    dest_dir = os.path.join(TARGET_PATH, datetime)
    relocated = {}  # upload path -> new reference; move each file once

    def relink(match):
        path = match.group(0).lstrip('/')
        if path not in relocated:
            dest_path = migrate_image(path, dest_dir)
            # Resolve the path difference between a post and an image
            relocated[path] = os.path.relpath(dest_path, TARGET_PATH)
        return relocated[path]

    # A single substitution pass: no search offset has to be tracked
    # across replacements that change the length of the text
    return pattern.sub(relink, source)


def main():
    """Convert every ``.html`` post in ``SOURCE_PATH`` into ``TARGET_PATH``.

    NOTE(review): the output name is derived from the post date only, so
    two posts sharing a date end up in the same target file.
    """
    # Make sure the output directory exists before anything is written
    os.makedirs(TARGET_PATH, exist_ok=True)

    # listdir() order is arbitrary; sort for a reproducible run
    for filename in sorted(os.listdir(SOURCE_PATH)):
        if not filename.endswith('.html'):
            continue

        print(f'Converting {filename}...')
        rst, target_file = process_file(os.path.join(SOURCE_PATH, filename))

        # The text may contain non-ASCII characters (metadata is dumped
        # with allow_unicode), so do not rely on the locale's encoding
        target = os.path.join(TARGET_PATH, target_file)
        with open(target, 'w', encoding='utf-8') as fout:
            fout.write(rst)


# Run the conversion only when this file is executed as a script
if __name__ == '__main__':
    main()
	"""Converts old blog posts (from WordPress) to .rst files."""
	import os
	import re
	import warnings

	import pypandoc
	import yaml


	# Directory that main() scans for exported posts (files ending in .html)
	SOURCE_PATH = 'posts.bak'
	# Directory that receives the converted .rst files; migrated images are
	# placed in per-post subdirectories of it
	TARGET_PATH = 'posts.rst'


	def process_file(filename):
	    """Convert a single exported post.

	    :param filename: Path of the post; the file name must contain the
	        post date (yyyy-mm-dd)
	    :returns: Whatever ``convert`` returns for the file's contents, i.e.
	        a ``(rst_text, target_filename)`` tuple
	    :raises ValueError: If no date is found in the file name
	    """
	    datetime = extract_datetime(filename)
	    # Posts can contain non-ASCII text, so do not depend on the locale's
	    # default encoding. NOTE(review): assumes the export is UTF-8 encoded.
	    with open(filename, encoding='utf-8') as fin:
	        return convert(fin.read(), datetime)


	def convert(source, datetime):
	    """Converts a document.

	    :param source: A WordPress document (YAML front matter followed by
	        the HTML body)
	    :param datetime: String representation of datetime (yyyy-mm-dd)
	    :returns: A ``(text, filename)`` tuple: the reStructuredText document
	        and the name of the file it belongs in (``<datetime>.rst``)
	    :raises KeyError: If the front matter has no ``title`` entry
	    """
	    metadata_raw, html = split_document(source)

	    # Calling yaml.load() without a Loader is deprecated since PyYAML 5.1
	    # and a TypeError on PyYAML 6+. safe_load only builds plain Python
	    # data, which is presumably all a post's front matter contains.
	    metadata = yaml.safe_load(metadata_raw)
	    rst = pypandoc.convert_text(html, 'rst', format='html')

	    rst = migrate_all_images(rst, datetime)

	    metadata_text = yaml.dump(
	        metadata, default_flow_style=False, allow_unicode=True)
	    # Indent continuation lines so the whole dump stays inside the
	    # rst comment block emitted below
	    metadata_text = metadata_text.replace('\n', '\n   ')

	    # YAML may parse a title such as "2006" as a number; width() needs text
	    title = str(metadata['title'])
	    # The underline must be at least as wide as the "<date> <title>" line
	    headerline = '=' * (len(datetime) + 1 + width(title))

	    # Assembled from a list so that source indentation cannot leak into
	    # the generated document
	    document = '\n'.join([
	        f'{datetime} {title}',
	        headerline,
	        '',
	        rst,
	        '',
	        '.. (Metadata from the original post)',
	        f'   {metadata_text}',
	    ])
	    return document.strip(), f'{datetime}.rst'


	def split_document(source):
	    """Split a post into its front matter and its body.

	    The document is expected to open with a ``---`` delimiter, followed by
	    the metadata, a second ``---`` and then the body.

	    :param source: Full text of the post
	    :returns: ``(metadata_text, body_text)``, both stripped of surrounding
	        whitespace
	    :raises ValueError: If the opening or the closing delimiter is missing
	    """
	    delimiter = '---'
	    # Tolerate a byte-order mark or blank lines ahead of the front matter
	    text = source.lstrip('\ufeff \t\r\n')
	    if not text.startswith(delimiter):
	        # Blindly dropping the first three characters would silently
	        # corrupt the metadata, so fail loudly instead
	        raise ValueError('Document does not start with a --- delimiter')

	    start = len(delimiter)
	    try:
	        end = text.index(delimiter, start)
	    except ValueError:
	        raise ValueError(
	            'Front matter is not terminated by a --- delimiter') from None

	    return text[start:end].strip(), text[end + len(delimiter):].strip()


	def width(string):
	    """Estimate how many columns a string occupies.

	    Every ASCII character counts as one column and every other character
	    as two (presumably to allow for wide East Asian glyphs).
	    """
	    return sum(1 if char.isascii() else 2 for char in string)


	def extract_datetime(path):
	    """Extracts datetime (yyyy-mm-dd) from a filename (or path).

	    Only the last path component is searched; dates in parent
	    directories are ignored.

	    :raises ValueError: If the file name contains no such date
	    """
	    filename = os.path.basename(path)
	    found = re.search(r'\d{4}-\d{2}-\d{2}', filename)

	    if found is None:
	        raise ValueError(f'No date was found in {path}')
	    return found.group(0)


	def migrate_image(src_path, dest_dir):
	    """Move an image file into a post's image directory.

	    :param src_path: e.g., wp-content/uploads/2006/02/image.jpg
	    :param dest_dir: e.g., posts.rst/2006-02-18
	    :returns: Path of the image inside ``dest_dir``. It is returned even
	        when ``src_path`` is missing; in that case nothing is moved
	    """
	    dest_path = os.path.join(dest_dir, os.path.basename(src_path))

	    # Creates missing parent directories as well, and does not fail when
	    # the directory already exists
	    os.makedirs(dest_dir, exist_ok=True)

	    if os.path.exists(src_path):
	        os.rename(src_path, dest_path)
	    elif not os.path.exists(dest_path):
	        # Stay quiet when the image is already in place (e.g. on a re-run)
	        warnings.warn(f'{src_path} does not exist')

	    return dest_path


	def migrate_all_images(source, datetime):
	    """Move every referenced WordPress upload and rewrite its reference.

	    :param source: Document text that may reference files under
	        ``wp-content/uploads/``
	    :param datetime: Post date (yyyy-mm-dd); names the image directory
	        below ``TARGET_PATH``
	    :returns: ``source`` with the old domain prefixes removed and every
	        upload reference (with or without a leading '/') replaced by the
	        image's new path relative to ``TARGET_PATH``
	    """
	    # Pre-processing: drop the old domains so that absolute URLs turn into
	    # site-relative paths
	    for old_domain in (
	            'http://blog...(old domain 1)',
	            'http://blog...(old domain 2)',
	            'http://blog...(old domain 3)'):
	        source = source.replace(old_domain, '')

	    # One reference per match: stop at whitespace, quotes, backticks and
	    # angle brackets rather than running greedily to the last ".ext" on
	    # the line. The optional leading '/' is consumed so it gets replaced.
	    pattern = re.compile(r'/?wp-content/uploads/[^\s<>"\'`]+\.\w+')
	    dest_dir = os.path.join(TARGET_PATH, datetime)
	    relocated = {}  # upload path -> new reference; move each file once

	    def relink(match):
	        path = match.group(0).lstrip('/')
	        if path not in relocated:
	            dest_path = migrate_image(path, dest_dir)
	            # Resolve the path difference between a post and an image
	            relocated[path] = os.path.relpath(dest_path, TARGET_PATH)
	        return relocated[path]

	    # A single substitution pass: no search offset has to be tracked
	    # across replacements that change the length of the text
	    return pattern.sub(relink, source)


	def main():
	    """Convert every ``.html`` post in ``SOURCE_PATH`` into ``TARGET_PATH``.

	    NOTE(review): the output name is derived from the post date only, so
	    two posts sharing a date end up in the same target file.
	    """
	    # Make sure the output directory exists before anything is written
	    os.makedirs(TARGET_PATH, exist_ok=True)

	    # listdir() order is arbitrary; sort for a reproducible run
	    for filename in sorted(os.listdir(SOURCE_PATH)):
	        if not filename.endswith('.html'):
	            continue

	        print(f'Converting {filename}...')
	        rst, target_file = process_file(os.path.join(SOURCE_PATH, filename))

	        # The text may contain non-ASCII characters (metadata is dumped
	        # with allow_unicode), so do not rely on the locale's encoding
	        target = os.path.join(TARGET_PATH, target_file)
	        with open(target, 'w', encoding='utf-8') as fout:
	            fout.write(rst)


	# Run the conversion only when this file is executed as a script
	if __name__ == '__main__':
	    main()