Skip to content

Instantly share code, notes, and snippets.

@pklaus
Last active December 14, 2016 18:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save pklaus/4546743 to your computer and use it in GitHub Desktop.
Save pklaus/4546743 to your computer and use it in GitHub Desktop.
A Tool to easily backup Wordpress posts to your local filesystem.

Back Up Your WordPress Blog Using Python

A tool to easily back up WordPress posts to your local filesystem.

I use this together with local-blog to host a local copy of my own blog on my laptop (for use on the road etc.).

Usage

usage: backup-wordpress-blog.py [-h] [--username USERNAME]
                                [--password PASSWORD] [--folder FOLDER]
                                [--number NUMBER] [--long-filenames]
                                [--no-meta] [--media] [-d] [-e EXTENSION]
                                BLOG_URL

Backing up the blog posts of a Wordpress blog to your local file system.

positional arguments:
  BLOG_URL              The URL of your Wordpress Blog

optional arguments:
  -h, --help            show this help message and exit
  --username USERNAME, -u USERNAME
                        Username used to login to your blog.
  --password PASSWORD, -p PASSWORD
                        Password used to login to your blog.
  --folder FOLDER, -f FOLDER
                        Folder to store the backups of the blog posts (./).
  --number NUMBER, -n NUMBER
                        Number of blog posts to back up (default is 4000).
  --long-filenames, -l  Use extended filenames for the blog post backup files.
  --no-meta             Don't store any meta information (such as tags) in the
                        backup files.
  --media               Also backup media files and their metadata.
  -d, --debug           Run in debug mode (used by the developer).
  -e EXTENSION, --extension EXTENSION
                        The file extension of the backed up blog post files
                        (default is txt).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
(c) 2013,2015 by Philipp Klaus <philipp.l.klaus →AT→ web.de>.
Check <https://github.com/pklaus/backup-wordpress-blog> for newer versions.
"""
try:
from wordpress_xmlrpc import Client
from wordpress_xmlrpc.methods import posts
from wordpress_xmlrpc.methods import users
from wordpress_xmlrpc.methods import media
from wordpress_xmlrpc.exceptions import InvalidCredentialsError
import unidecode
except ImportError:
print("You need to install python-wordpress-xmlrpc and unidecode using pip first. Exiting")
import sys; sys.exit(1)
import datetime as dt
import json
from urllib.parse import urlparse
from urllib.request import urlopen
import shutil
import argparse, os, errno, sys, time, re
def login(site, user, password=None):
    """Build an XML-RPC client for *site*, prompting for missing credentials.

    Args:
        site: URL handed to ``Client`` as the endpoint address.
        user: Username; requested interactively when falsy.
        password: Password; requested interactively when falsy.

    Returns:
        The ``Client`` object constructed from site, user and password.
    """
    # Local import keeps this function self-contained. getpass reads the
    # password without echoing it to the terminal, which input() would do.
    import getpass

    if not user:
        user = input('Please enter the username for the blog %s: ' % site)
    if not password:
        password = getpass.getpass('Please enter the password for the user %s: ' % user)
    return Client(site, user, password)
def sanitize_url(url):
    """Turn a blog address into the full URL of its ``xmlrpc.php`` endpoint.

    A missing scheme defaults to ``http://``. ``xmlrpc.php`` is appended
    (with a separating slash when needed) unless the URL already ends with it.
    """
    has_scheme = url.startswith(('http://', 'https://'))
    if not has_scheme:
        url = 'http://' + url
    if url.endswith('xmlrpc.php'):
        return url
    separator = '' if url.endswith('/') else '/'
    return url + separator + 'xmlrpc.php'
def ensure_folder_exists(folder):
    """Create *folder* (including missing parent directories) if needed.

    An already existing directory is not an error.

    Raises:
        FileExistsError: If *folder* exists but is not a directory.
        OSError: For any other failure reported by ``os.makedirs``.
    """
    # exist_ok=True tolerates an existing directory but still raises when the
    # path is occupied by a regular file, which a plain EEXIST check would hide.
    os.makedirs(folder, exist_ok=True)
def post_file_name(post, short=True, extension='txt'):
    """Build the backup file name for a blog post.

    Args:
        post: Object providing ``post_status``, ``slug``, ``title`` and
            ``date`` (a ``datetime``) attributes.
        short: If true, abbreviate known statuses, use a date-only stamp
            (``YYYYMMDD``) and cut the slug to 22 characters.
        extension: File extension without the leading dot.

    Returns:
        A string of the form ``<status>_<when>_<slug>.<extension>``.
    """
    status = post.post_status
    # Treat a missing (None) slug like an empty one instead of failing on len().
    slug = post.slug or ''
    if not slug and post.title:
        # Derive a slug from the title:
        # http://stackoverflow.com/a/8366771/183995
        slug = re.sub(r'\W+', '-', unidecode.unidecode(post.title).lower())
    if short:
        abbreviations = {'draft': 'd', 'private': 'pr', 'publish': 'p'}
        # Statuses without an abbreviation are kept verbatim.
        status = abbreviations.get(status, status)
        when = post.date.date().isoformat().replace('-', '')
        return '%s_%s_%s.%s' % (status, when, slug[:22], extension)
    when = post.date.isoformat().replace('T', '_').replace(':', '-')
    return '%s_%s_%s.%s' % (status, when, slug, extension)
if __name__ == '__main__':
    # Command-line entry point: parse the options, log in, optionally back up
    # the media library, then write one file per blog post.
    parser = argparse.ArgumentParser(description='Backing up the blog posts of a Wordpress blog to your local file system.')
    parser.add_argument('url', metavar='BLOG_URL',
                        help='The URL of your Wordpress Blog')
    parser.add_argument('--username', '-u',
                        help='Username used to login to your blog.')
    parser.add_argument('--password', '-p',
                        help='Password used to login to your blog.')
    parser.add_argument('--folder', '-f', default='./',
                        help='Folder to store the backups of the blog posts (./).')
    parser.add_argument('--number', '-n', type=int, default=4000,
                        help='Number of blog posts to back up (default is 4000).')
    parser.add_argument('--long-filenames', '-l', action='store_true',
                        help='Use extended filenames for the blog post backup files.')
    parser.add_argument('--no-meta', action='store_true',
                        help="Don't store any meta information (such as tags) in the backup files.")
    parser.add_argument('--media', action='store_true',
                        help='Also backup media files and their metadata.')
    parser.add_argument('-d', '--debug', action='store_true',
                        help='Run in debug mode (used by the developer).')
    parser.add_argument('-e', '--extension', default='txt',
                        help='The file extension of the backed up blog post files (default is txt).')
    args = parser.parse_args()

    site = sanitize_url(args.url)
    wp = login(site, args.username, args.password)

    # Make one authenticated call up front so that bad credentials are
    # reported before anything is written to disk.
    try:
        wp.call(users.GetUserInfo())
    except InvalidCredentialsError:
        print("Invalid credentials")
        sys.exit(1)

    folder = os.path.abspath(args.folder)
    ensure_folder_exists(folder)

    if args.media:
        # Metadata goes to <folder>/media/<id>.json; the media file itself is
        # stored below <folder> under the path component of its URL.
        media_path = os.path.join(folder, 'media')
        ensure_folder_exists(media_path)
        if args.debug:
            print("Fetching media library list...")
        media_list = wp.call(media.GetMediaLibrary({}))
        if args.debug:
            print("The media library contains {} items. Saving them now...".format(len(media_list)))
        for m in media_list:
            m_path = '.' + urlparse(m.link).path
            m_full_path = os.path.join(folder, m_path)
            m_dict = {
                'id': m.id,
                'parent': m.parent,
                'title': m.title,
                'description': m.description,
                'caption': m.caption,
                'date_created': m.date_created.isoformat(),
                'link': m.link,
                'thumbnail': m.thumbnail,
                'metadata': m.metadata,
                'path': m_path,
            }
            # The context manager closes the file even if serialising fails.
            with open(os.path.join(media_path, '{}.json'.format(m.id)), 'w', encoding='utf-8') as m_file:
                m_file.write(json.dumps(m_dict))
            ensure_folder_exists(os.path.dirname(m_full_path))
            if args.debug:
                print("Downloading media file {}...".format(m.link))
            with urlopen(m.link) as response, open(m_full_path, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)

    if args.debug:
        print("Fetching posts now...")
    # A distinct name keeps the imported `posts` module from being shadowed.
    blog_posts = wp.call(posts.GetPosts({'number': args.number}))
    if args.debug:
        print("Fetched {} posts. Saving them now...".format(len(blog_posts)))
    for post in blog_posts:
        tags = [t.name for t in post.terms if t.taxonomy == 'post_tag']
        categories = [t.name for t in post.terms if t.taxonomy == 'category']
        fname = post_file_name(post, short=(not args.long_filenames), extension=args.extension)
        fname = os.path.join(folder, fname)
        # An explicit UTF-8 encoding makes non-ASCII post content writable
        # regardless of the platform's default locale encoding.
        with open(fname, 'w', encoding='utf-8') as f:
            if not args.no_meta:
                # Or YAML Style Frontmatter
                # inspired by http://egonschiele.github.io/mdpress/
                # See http://jekyllrb.com/docs/frontmatter/
                # and http://en.wikipedia.org/wiki/YAML#Lists
                f.write('# %s\n\n' % post.title)
                f.write('* Categories: %s\n' % ', '.join(categories))
                f.write('* Tags: %s\n' % ', '.join(tags))
                f.write('* Creation Date: %s\n' % post.date.isoformat())
                f.write('* Modification Date: %s\n' % post.date_modified.isoformat())
                f.write('* Link: <%s>\n' % post.link)
                f.write('\n### Content\n\n')
            f.write(post.content)
        # Stamp the file's access and modification times with the post date
        # (time.mktime interprets the time tuple as local time).
        cr_time = time.mktime(post.date.timetuple())
        os.utime(fname, (cr_time, cr_time))
    print("Successfully backed up %d blog posts." % len(blog_posts))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment