Skip to content

Instantly share code, notes, and snippets.

@wofeiwo
Created May 11, 2013 19:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wofeiwo/5561030 to your computer and use it in GitHub Desktop.
Save wofeiwo/5561030 to your computer and use it in GitHub Desktop.
Converts all blog entries exported from phpweblog.net to Markdown format, one file per entry, and downloads all files and images referenced by the entries.
#!/usr/bin/env python
#coding=utf-8
from xml.dom import minidom as md
from datetime import datetime
import re
from urllib2 import urlopen
from os.path import basename
from socket import setdefaulttimeout
# Output layout: one Markdown file per post plus the downloaded media.
path = 'posts/'
media_path = 'media/'
files_path = media_path + 'files/'
images_path = media_path + 'images/'

# YAML front matter followed by the post body.  Placeholders, in order:
# timestamp, title, permalink date, permalink slug, body.
content = """---
date: %s
layout: post
title: '%s'
permalink: '/%s/%s.html'
---
%s
"""

# Literal fragments removed from (or replaced in) every post body, applied in
# this order.
# NOTE(review): the non-breaking-space entries replace a literal that read as
# "space -> space" (a no-op) in the reviewed copy; presumably the original
# character/entity was lost in transit -- confirm against the export data.
_TEXT_FIXES = (
    ('"BACKGROUND-COLOR: #ffffff"', ''),
    ('background-color: #ffffff;', ''),
    (u'\xa0', ' '),
    ('&nbsp;', ' '),
    ('<!--<br><br>Code highlighting produced by Actipro CodeHighlighter (freeware)<br>http://www.CodeHighlighter.com/<br><br>-->', ''),
    ('<!--<br /><br />Code highlighting produced by Actipro CodeHighlighter (freeware)<br />http://www.CodeHighlighter.com/<br /><br />-->', ''),
    ('<img alt="" src="http://www.phpweblog.net/Images/dot.gif" />', '...'),
    ('<img alt="" src="/Images/dot.gif" />', '...'),
)


def _field_text(entry, tag, default=None):
    """Return the value of the first child node of the first <tag> under entry.

    Returns ``default`` when the element is missing, empty, or its first child
    carries no value, instead of raising IndexError.
    """
    nodes = entry.getElementsByTagName(tag)
    if not nodes or not nodes[0].childNodes:
        return default
    value = nodes[0].childNodes[0].nodeValue
    return default if value is None else value


def _normalize_spaces(text):
    """Replace non-breaking spaces (raw character or HTML entity) with ' '."""
    return text.replace(u'\xa0', ' ').replace('&nbsp;', ' ')


def _clean_text(text):
    """Apply every literal replacement in _TEXT_FIXES to a post body."""
    for fragment, replacement in _TEXT_FIXES:
        text = text.replace(fragment, replacement)
    return text


def _yaml_quote(value):
    """Escape value for use inside a single-quoted YAML scalar."""
    return value.replace("'", "''")


def _fetch(url, site_url):
    """Return the raw bytes behind url; URLs starting with '/' are resolved
    against site_url."""
    if url.startswith('/'):
        url = site_url + url
    response = urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _localize(text, urls, attr, save_dir, web_dir, site_url):
    """Download each URL into save_dir and rewrite attr="url" to web_dir.

    A URL that cannot be downloaded is reported and left untouched in the
    text, so the post keeps pointing at the original location.
    Returns the updated text.
    """
    seen = set()
    for url in urls:
        if url in seen:
            continue  # already downloaded and rewritten everywhere
        seen.add(url)
        name = basename(url)
        if not name:
            # e.g. a URL ending in '/': there is no file name to save under.
            print('%s skipped: no file name in URL' % url)
            continue
        try:
            data = _fetch(url, site_url)
        except Exception as exc:
            # Best effort: report and move on rather than abort the export.
            print('%s %s' % (url, exc))
            continue
        # Downloads are arbitrary bytes, so the file must be opened in
        # binary mode.
        with open(save_dir + name, 'wb') as out:
            out.write(data)
        # Plain string replacement: the URL is literal text, not a regex.
        text = text.replace('%s="%s"' % (attr, url),
                            '%s="%s%s"' % (attr, web_dir, name))
    return text


def main(xml_path='MyBlogData.xml', site_url='http://www.phpweblog.net'):
    """Convert every blog_Content entry of the export into a Markdown post.

    For each entry the body text is cleaned, every src="..." target and every
    href="/Files..." target is downloaded into the media directories, and the
    links are rewritten to the local copies.  Entries whose PostType is 1 are
    written to ``posts/<ID>.md`` with a YAML front matter header.

    xml_path -- the exported XML file to read.
    site_url -- prefix used to resolve URLs that start with '/'.
    """
    import os

    setdefaulttimeout(3)
    for directory in (path, files_path, images_path):
        if not os.path.isdir(directory):
            os.makedirs(directory)

    xml = md.parse(xml_path)
    for entry in xml.getElementsByTagName('blog_Content'):
        blog_id = int(_field_text(entry, 'ID'))
        blog_type = int(_field_text(entry, 'PostType'))
        title = _normalize_spaces(_field_text(entry, 'Title', ''))
        created = datetime.strptime(_field_text(entry, 'DateAdded'),
                                    '%Y-%m-%dT%H:%M:%S')
        text = _clean_text(_field_text(entry, 'Text', ''))

        # Collect both URL lists before any rewriting touches the text.
        image_urls = re.findall(r'src="(.+?)"', text)
        file_urls = re.findall(r'href="(/Files.+?)"', text)
        text = _localize(text, image_urls, 'src', images_path, '/images/',
                         site_url)
        text = _localize(text, file_urls, 'href', files_path, '/files/',
                         site_url)

        # Only PostType 1 is written out (presumably regular blog posts --
        # confirm against the export format).
        if blog_type != 1:
            continue

        print('%s %s' % (title, created))
        document = content % (
            created.strftime('%Y-%m-%d %H:%M:%S'),
            _yaml_quote(title),
            created.strftime('%Y-%m-%d'),
            _yaml_quote(title.replace(' ', '-')),
            text,
        )
        # Encode once at the I/O boundary and write the bytes as they are.
        with open(path + str(blog_id) + '.md', 'wb') as post:
            post.write(document.encode('utf-8'))
    print('Done.')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment