Skip to content

Instantly share code, notes, and snippets.

@sedrubal
Created February 10, 2017 20:32
Show Gist options
  • Save sedrubal/f384e84ff210a1dddf8313e215044115 to your computer and use it in GitHub Desktop.
Save sedrubal/f384e84ff210a1dddf8313e215044115 to your computer and use it in GitHub Desktop.
Import posts from (Drupal) RSS feed to WordPress
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Import posts from an RSS feed and convert them to the WordPress export format.
This works for our Drupal 6.
It prints the content of the WordPress export xml (wpx) to stdout and
all images that have to be imported manually to stderr.
You need to install the WordPress Importer plugin in WordPress and import the file.
Note that an RSS feed may not contain all posts, but only e.g. the last 30 entries.
BTW: This is dirty. Use it at your own risk TM
"""
# "THE BEER-WARE LICENSE" (Revision 42):
# <basti.endres@fablab.fau.de> wrote this file. As long as you retain this
# notice you can do whatever you want with this stuff. If we meet some day,
# and you think this stuff is worth it, you can buy me a beer in return.
import sys
from datetime import datetime
from email.utils import format_datetime
from xml.sax.saxutils import escape

from bs4 import BeautifulSoup
from dateutil.parser import parse as parse_date
from feedparser import parse as parse_feed
# URL of the RSS feed that main() downloads and converts.
FEED_URL = 'https://fablab.fau.de/rss.xml'
# Base URL of the old site. main() uses it to recognise and absolutize image
# sources, and as the fallback for missing link/guid values.
OLD_URL = 'https://fablab.fau.de/'
# NOTE(review): WP_URL is not referenced anywhere in the visible code —
# presumably the base URL of the target WordPress site; confirm before removing.
WP_URL = 'https://marvin.fablab.fau.de/'
# Values written into the <wp:author> element of the export:
# AUTHOR fills <wp:author_login>, AUTHOR_EMAIL fills <wp:author_email>.
AUTHOR = 'Sebastian Endres'
AUTHOR_EMAIL = 'basti.endres@fablab.fau.de'
# Skeleton of the whole export document, filled in by main() via str.format().
# Placeholders: {now} (used in the generator comment and in <pubDate>),
# {title}, {page_url} (used three times), {page_description}, {lang},
# {author}, {author_email}, {author_display_name}, {author_first_name},
# {author_last_name} and {items} (the concatenated TEMPLATE_ITEM fragments).
# The literal starts with a newline, so the formatted result has to be
# stripped before output for the XML declaration to come first; main() does
# that when printing.
TEMPLATE = """
<?xml version="1.0" encoding="UTF-8" ?>
<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
<!-- You may use this file to transfer that content from one site to another. -->
<!-- This file is not intended to serve as a complete backup of your site. -->
<!-- To import this information into a WordPress site follow these steps: -->
<!-- 1. Log in to that site as an administrator. -->
<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
<!-- 3. Install the "WordPress" importer from the list. -->
<!-- 4. Activate & Run Importer. -->
<!-- 5. Upload this file using the form provided on that page. -->
<!-- 6. You will first be asked to map the authors in this export file to users -->
<!-- on the site. For each author, you may choose to map to an -->
<!-- existing user on the site or to create a new user. -->
<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
<!-- contained in this file into your site. -->
<!-- generator="wp_feed_import.py" created="{now}" -->
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.2/">
<channel>
<title>{title}</title>
<link>{page_url}</link>
<description>{page_description}</description>
<pubDate>{now}</pubDate>
<language>{lang}</language>
<wp:wxr_version>1.2</wp:wxr_version>
<wp:base_site_url>{page_url}</wp:base_site_url>
<wp:base_blog_url>{page_url}</wp:base_blog_url>
<wp:author>
<wp:author_id>1</wp:author_id><wp:author_login><![CDATA[{author}]]></wp:author_login>
<wp:author_email><![CDATA[{author_email}]]></wp:author_email>
<wp:author_display_name><![CDATA[{author_display_name}]]></wp:author_display_name>
<wp:author_first_name><![CDATA[{author_first_name}]]></wp:author_first_name>
<wp:author_last_name><![CDATA[{author_last_name}]]></wp:author_last_name>
</wp:author>
<generator>wp_feed_import.py</generator>
<wp:category>
<wp:term_id>1</wp:term_id>
<wp:category_nicename><![CDATA[blog]]></wp:category_nicename>
<wp:category_parent><![CDATA[]]></wp:category_parent>
<wp:cat_name><![CDATA[Blog]]></wp:cat_name>
</wp:category>
{items}
</channel>
</rss>
"""
# Fragment for a single post (<item>), filled in by main() via str.format()
# once per feed entry and inserted into TEMPLATE's {items} slot.
# Placeholders: {title}, {link}, {date_rfc822}, {creator}, {guid_is_link},
# {guid}, {content}, {excerpt}, {post_id}, {slug}, plus two that are used
# twice each:
#   {description} - once as plain element text in <description> and once
#                   inside a CDATA section as the "portal_description" meta value
#   {date}        - for both <wp:post_date> and <wp:post_date_gmt>
# Comment/ping status, post status, post type, parent, menu order, password
# and stickiness are hard-coded in the literal.
TEMPLATE_ITEM = """
<item>
<title>{title}</title>
<link>{link}</link>
<pubDate>{date_rfc822}</pubDate>
<dc:creator><![CDATA[{creator}]]></dc:creator>
<guid isPermaLink="{guid_is_link}">{guid}</guid>
<description>{description}</description>
<content:encoded><![CDATA[{content}]]></content:encoded>
<excerpt:encoded><![CDATA[{excerpt}]]></excerpt:encoded>
<wp:post_id>{post_id}</wp:post_id>
<wp:post_date><![CDATA[{date}]]></wp:post_date>
<wp:post_date_gmt><![CDATA[{date}]]></wp:post_date_gmt>
<wp:comment_status><![CDATA[closed]]></wp:comment_status>
<wp:ping_status><![CDATA[closed]]></wp:ping_status>
<wp:post_name><![CDATA[{slug}]]></wp:post_name>
<wp:status><![CDATA[publish]]></wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type><![CDATA[post]]></wp:post_type>
<wp:post_password><![CDATA[]]></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<wp:postmeta>
<wp:meta_key><![CDATA[portal_description]]></wp:meta_key>
<wp:meta_value><![CDATA[{description}]]></wp:meta_value>
</wp:postmeta>
</item>
"""
def _cdata_safe(text):
    """Return *text* with every ``]]>`` split across two CDATA sections.

    A literal ``]]>`` would otherwise terminate the surrounding CDATA
    section early and make the generated XML malformed.
    """
    return text.replace(']]>', ']]]]><![CDATA[>')


def _old_site_image_url(src):
    """Return the absolute https URL of *src* if it lives on the old site.

    Returns ``None`` for images that are not hosted under ``OLD_URL``
    (other hosts, protocol-relative ``//host/...`` URLs, relative paths
    without a leading slash, empty values); those are left untouched.
    """
    # Treat http:// and https:// links to the old site alike.
    if src.startswith('http://'):
        src = 'https://' + src[len('http://'):]
    if src.startswith(OLD_URL):
        # Already absolute: must NOT be prefixed with OLD_URL again.
        return src
    if src.startswith('/') and not src.startswith('//'):
        return OLD_URL.rstrip('/') + src
    return None


def main():
    """Load the RSS feed, parse it, detect images, and print wpx content.

    The export XML is printed to stdout.  The absolute URL of every image
    hosted on the old site is printed to stderr (one per line) so it can
    be imported manually; the ``src`` of those images is rewritten to that
    absolute URL in the exported post content.
    """
    feed = parse_feed(FEED_URL)
    channel = feed['feed']
    # Timezone-aware "now", used wherever the feed provides no date.
    now = datetime.now().astimezone()
    now_rfc822 = format_datetime(now)

    items = []
    for post_id, entry in enumerate(feed['entries']):
        content_soup = BeautifulSoup(
            entry.get('summary', entry.get('description', '')), 'html.parser'
        )
        for img in content_soup.find_all('img'):
            img_full_src = _old_site_image_url(img.attrs.get('src', ''))
            if img_full_src is None:
                continue
            print(img_full_src, file=sys.stderr)
            img.attrs['src'] = img_full_src
        content = _cdata_safe(str(content_soup))

        published = entry.get('published')
        if published:
            date_rfc822 = published
            post_date = parse_date(published)
        else:
            date_rfc822 = now_rfc822
            post_date = now

        entry_link = entry.get('link')
        # Last path segment of the post URL; a trailing slash is ignored.
        # Empty when the entry has no link of its own.
        slug = entry_link.rstrip('/').rsplit('/', 1)[-1] if entry_link else ''

        description = BeautifulSoup(
            entry.get('description', entry.get('summary', '')), 'html.parser'
        ).text

        items.append(TEMPLATE_ITEM.format(
            title=escape(entry.get('title', '')),
            link=escape(entry_link or channel.get('link', OLD_URL)),
            date_rfc822=escape(date_rfc822),
            date=post_date.strftime('%Y-%m-%d %H:%M:%S'),
            creator=_cdata_safe(entry.get('author', '')),
            # The XML attribute wants lower-case "true"/"false", not the
            # "True"/"False" a Python bool formats to.
            guid_is_link=str(entry.get('guidislink', False)).lower(),
            guid=escape(
                entry.get('id', '{url} {pid}'.format(url=OLD_URL, pid=post_id))
            ),
            # TEMPLATE_ITEM uses {description} both as plain element text and
            # inside a CDATA section.  It is XML-escaped so the element text
            # is well-formed; the CDATA copy therefore carries the entities
            # literally.
            description=escape(description),
            content=content,
            excerpt=content,
            post_id=post_id,
            slug=slug,
        ))

    wpx = TEMPLATE.format(
        now=now_rfc822,
        title=escape(channel.get('title', '')),
        page_url=escape(channel.get('link', OLD_URL)),
        page_description=escape(channel.get('subtitle', '')),
        lang=escape(channel.get('language', 'en_US')),
        author=AUTHOR,
        author_email=AUTHOR_EMAIL,
        author_display_name='',
        author_first_name='',
        author_last_name='',
        items=''.join(items),
    )
    # TEMPLATE starts with a newline; strip it so the XML declaration is the
    # very first thing in the output.
    print(wpx.strip())
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment