trentrichardson/wp-to-md.py

## wp-to-md.py
import sys
import os
import io
import time
import datetime
import dateutil.parser
import re

# for xml parsing
from bs4 import BeautifulSoup

# for converting to markdown
import html2text

# for downloading attachments
import wget


# for converting various wordpress code tags to markdown
coderegex1 = re.compile(r'\[sourcecode language=\"[a-zA-Z0-9]*\"\](.*?)\[\/sourcecode\]', re.DOTALL)
coderegex2 = re.compile(r'\[code language=\"[a-zA-Z0-9]*\"\](.*?)\[\/code\]', re.DOTALL)
coderegex3 = re.compile(r'\[code lang=[a-zA-Z0-9]*\](.*?)\[\/code\]', re.DOTALL)

# for replacing http://www.example.com/wp-content/uploads/ with just /uploads/
uploadregex = re.compile(r'(https?\:\/\/[a-zA-Z0-9\-\.]+\/wp-content\/uploads\/)')

class Post:

	def __init__(self, title, author, date, content, category, status, slug, tags):
		self.title = title
		self.author = author
		self.date = dateutil.parser.parse(date)
		self.content = content
		self.category = category
		self.status = status
		self.slug = slug
		self.tags = tags


def load_doc(filename):

	print("> Loading document!")

	doc = ""
	with io.open(filename, 'r', encoding='UTF-8') as f:
		doc = f.read()

	return doc


def parse_doc(doc):

	print("> Parsing document!")

	posts = []
	attachments = []
	soup = BeautifulSoup(doc, 'html.parser')

	for item in soup.find_all('item'):

		if item.find('wp:post_type').string == "post":

			# get the tags
			tags = []
			for tag in item.findAll('category', { 'domain': 'post_tag'}):
				tags.append(tag['nicename'])

			# fix urls to not use wp-content nor full url
			content = uploadregex.sub('{attach}images/',item.find('content:encoded').string)

			posts.append(Post(
				item.find('title').string,
				item.find('dc:creator').string,
				item.find('wp:post_date').string,
				content,
				item.find('category', { 'domain': 'category'})['nicename'],
				item.find('wp:status').string,
				item.find('wp:post_name').string,
				'; '.join(tags) ))

		elif item.find('wp:post_type').string == "attachment":
			attachments.append(item.guid.string)

	return posts, attachments


def gen_markdown(post):

	h = html2text.HTML2Text()
	h.unicode_snob = 1
	h.body_width = 0
	h.dash_unordered_list = True

	title = post.title.translate(str.maketrans({"\"": "&#34;", ":": "&#58;"}))
	body = post.content

	header ="""Title: %s
Date: %s
Category: %s
Tags: %s
Slug: %s
Author: %s

"""%(title, post.date.strftime("%Y-%m-%d %H:%M:%S"), post.category, post.tags, post.slug, post.author)

	body = re.sub(coderegex1, r"<pre>\1</pre>", body, re.U)
	body = re.sub(coderegex2, r"<pre>\1</pre>", body, re.U)
	body = re.sub(coderegex3, r"<pre>\1</pre>", body, re.U)

	body = h.handle(body)

	return header + body


def save_posts(output, posts):

	print("> Saving posts!")

	out = ""
	directory = ""

	for p in posts:
		if p.status == "publish":
			directory = output + "_posts/"

		elif p.status == "draft":
			directory = output + "_drafts/"

		else:
			directory = output + "_other/"

		if not os.path.exists(directory):
			os.makedirs(directory)

		print("Saving", directory + p.date.strftime("%Y-%m-%d") + "-" + p.slug + ".md")

		with io.open(directory + p.date.strftime("%Y-%m-%d") + "-" + p.slug + ".md", 'w', encoding='UTF-8') as f:
			f.write(gen_markdown(p))


def download_attachments(output, attachments):

	print("> Wget'ing attachments!")
	# todo


def main():

	output = "./"

	if len(sys.argv) == 1:
		print("Parameters: filename for wordpress .xml export file, optional output directory")
		return

	elif len(sys.argv) == 2:
		filename = sys.argv[1]

	elif len(sys.argv) == 3:
		filename = sys.argv[1]
		output = sys.argv[2]

	doc = ""
	posts = []
	attachments = []

	doc = load_doc(filename)
	posts, attachments = parse_doc(doc)

	save_posts(output, posts)
	download_attachments(output, attachments)


if __name__ == '__main__':
	main()
	import sys
	import os
	import io
	import time
	import datetime
	import dateutil.parser
	import re

	# for xml parsing
	from bs4 import BeautifulSoup

	# for converting to markdown
	import html2text

	# for downloading attachments
	import wget


	# for converting various wordpress code tags to markdown
	coderegex1 = re.compile(r'\[sourcecode language=\"[a-zA-Z0-9]\"\](.?)\[\/sourcecode\]', re.DOTALL)
	coderegex2 = re.compile(r'\[code language=\"[a-zA-Z0-9]\"\](.?)\[\/code\]', re.DOTALL)
	coderegex3 = re.compile(r'\[code lang=[a-zA-Z0-9]\](.?)\[\/code\]', re.DOTALL)

	# for replacing http://www.example.com/wp-content/uploads/ with just /uploads/
	uploadregex = re.compile(r'(https?\:\/\/[a-zA-Z0-9\-\.]+\/wp-content\/uploads\/)')

	class Post:

	def __init__(self, title, author, date, content, category, status, slug, tags):
	self.title = title
	self.author = author
	self.date = dateutil.parser.parse(date)
	self.content = content
	self.category = category
	self.status = status
	self.slug = slug
	self.tags = tags


	def load_doc(filename):

	print("> Loading document!")

	doc = ""
	with io.open(filename, 'r', encoding='UTF-8') as f:
	doc = f.read()

	return doc


	def parse_doc(doc):

	print("> Parsing document!")

	posts = []
	attachments = []
	soup = BeautifulSoup(doc, 'html.parser')

	for item in soup.find_all('item'):

	if item.find('wp:post_type').string == "post":

	# get the tags
	tags = []
	for tag in item.findAll('category', { 'domain': 'post_tag'}):
	tags.append(tag['nicename'])

	# fix urls to not use wp-content nor full url
	content = uploadregex.sub('{attach}images/',item.find('content:encoded').string)

	posts.append(Post(
	item.find('title').string,
	item.find('dc:creator').string,
	item.find('wp:post_date').string,
	content,
	item.find('category', { 'domain': 'category'})['nicename'],
	item.find('wp:status').string,
	item.find('wp:post_name').string,
	'; '.join(tags) ))

	elif item.find('wp:post_type').string == "attachment":
	attachments.append(item.guid.string)

	return posts, attachments


	def gen_markdown(post):

	h = html2text.HTML2Text()
	h.unicode_snob = 1
	h.body_width = 0
	h.dash_unordered_list = True

	title = post.title.translate(str.maketrans({"\"": """, ":": ":"}))
	body = post.content

	header ="""Title: %s
	Date: %s
	Category: %s
	Tags: %s
	Slug: %s
	Author: %s

	"""%(title, post.date.strftime("%Y-%m-%d %H:%M:%S"), post.category, post.tags, post.slug, post.author)

	body = re.sub(coderegex1, r"<pre>\1</pre>", body, re.U)
	body = re.sub(coderegex2, r"<pre>\1</pre>", body, re.U)
	body = re.sub(coderegex3, r"<pre>\1</pre>", body, re.U)

	body = h.handle(body)

	return header + body


	def save_posts(output, posts):

	print("> Saving posts!")

	out = ""
	directory = ""

	for p in posts:
	if p.status == "publish":
	directory = output + "_posts/"

	elif p.status == "draft":
	directory = output + "_drafts/"

	else:
	directory = output + "_other/"

	if not os.path.exists(directory):
	os.makedirs(directory)

	print("Saving", directory + p.date.strftime("%Y-%m-%d") + "-" + p.slug + ".md")

	with io.open(directory + p.date.strftime("%Y-%m-%d") + "-" + p.slug + ".md", 'w', encoding='UTF-8') as f:
	f.write(gen_markdown(p))


	def download_attachments(output, attachments):

	print("> Wget'ing attachments!")
	# todo


	def main():

	output = "./"

	if len(sys.argv) == 1:
	print("Parameters: filename for wordpress .xml export file, optional output directory")
	return

	elif len(sys.argv) == 2:
	filename = sys.argv[1]

	elif len(sys.argv) == 3:
	filename = sys.argv[1]
	output = sys.argv[2]

	doc = ""
	posts = []
	attachments = []

	doc = load_doc(filename)
	posts, attachments = parse_doc(doc)

	save_posts(output, posts)
	download_attachments(output, attachments)


	if __name__ == '__main__':
	main()