Created
February 10, 2017 20:32
-
-
Save sedrubal/f384e84ff210a1dddf8313e215044115 to your computer and use it in GitHub Desktop.
Import posts from (Drupal) RSS feed to WordPress
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Import posts from an RSS feed and convert them to the WordPress export format.
This works for our Drupal 6.
It prints the WordPress export XML (wpx) to stdout and the URLs of
all images that have to be imported manually to stderr.
You need to install the WordPress Importer plugin in WordPress and import the file.
Note that an RSS feed may not contain all posts but only, e.g., the last 30 entries.
BTW: This is dirty. Use it at your own risk TM
"""
# "THE BEER-WARE LICENSE" (Revision 42): | |
# <basti.endres@fablab.fau.de> wrote this file. As long as you retain this | |
# notice you can do whatever you want with this stuff. If we meet some day, | |
# and you think this stuff is worth it, you can buy me a beer in return. | |
import sys
from datetime import datetime
from email.utils import format_datetime
from xml.sax.saxutils import escape

from bs4 import BeautifulSoup
from dateutil.parser import parse as parse_date
from feedparser import parse as parse_feed
# URL of the RSS feed that main() downloads and converts.
FEED_URL = 'https://fablab.fau.de/rss.xml'
# Base URL of the old site; main() uses it to recognise and absolutize image
# sources and as the fallback for missing link/guid values.
OLD_URL = 'https://fablab.fau.de/'
# Base URL of the target WordPress site (not referenced elsewhere in this script).
WP_URL = 'https://marvin.fablab.fau.de/'
# Author login and e-mail written into the <wp:author> section of the export.
AUTHOR = 'Sebastian Endres'
AUTHOR_EMAIL = 'basti.endres@fablab.fau.de'
# Skeleton of the WordPress eXtended RSS (WXR 1.2) document.
# It is filled in with str.format(); the placeholders are:
#   now, title, page_url, page_description, lang,
#   author, author_email, author_display_name,
#   author_first_name, author_last_name, items
# Placeholders inside CDATA sections take raw text, all others must already be
# XML-escaped by the caller. The literal starts with a newline, so the caller
# has to strip the result before printing (the XML declaration must come first).
TEMPLATE = """
<?xml version="1.0" encoding="UTF-8" ?>
<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
<!-- You may use this file to transfer that content from one site to another. -->
<!-- This file is not intended to serve as a complete backup of your site. -->
<!-- To import this information into a WordPress site follow these steps: -->
<!-- 1. Log in to that site as an administrator. -->
<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
<!-- 3. Install the "WordPress" importer from the list. -->
<!-- 4. Activate & Run Importer. -->
<!-- 5. Upload this file using the form provided on that page. -->
<!-- 6. You will first be asked to map the authors in this export file to users -->
<!-- on the site. For each author, you may choose to map to an -->
<!-- existing user on the site or to create a new user. -->
<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
<!-- contained in this file into your site. -->
<!-- generator="wp_feed_import.py" created="{now}" -->
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.2/">
<channel>
<title>{title}</title>
<link>{page_url}</link>
<description>{page_description}</description>
<pubDate>{now}</pubDate>
<language>{lang}</language>
<wp:wxr_version>1.2</wp:wxr_version>
<wp:base_site_url>{page_url}</wp:base_site_url>
<wp:base_blog_url>{page_url}</wp:base_blog_url>
<wp:author>
<wp:author_id>1</wp:author_id><wp:author_login><![CDATA[{author}]]></wp:author_login>
<wp:author_email><![CDATA[{author_email}]]></wp:author_email>
<wp:author_display_name><![CDATA[{author_display_name}]]></wp:author_display_name>
<wp:author_first_name><![CDATA[{author_first_name}]]></wp:author_first_name>
<wp:author_last_name><![CDATA[{author_last_name}]]></wp:author_last_name>
</wp:author>
<generator>wp_feed_import.py</generator>
<wp:category>
<wp:term_id>1</wp:term_id>
<wp:category_nicename><![CDATA[blog]]></wp:category_nicename>
<wp:category_parent><![CDATA[]]></wp:category_parent>
<wp:cat_name><![CDATA[Blog]]></wp:cat_name>
</wp:category>
{items}
</channel>
</rss>
"""
# Skeleton of one <item> (one post) of the WordPress export.
# It is filled in with str.format(); the placeholders are:
#   title, link, date_rfc822, creator, guid_is_link, guid, description,
#   content, excerpt, post_id, date, slug
# {description} is used twice: once as plain element text and once inside a
# CDATA section, so the caller must pass a value that is valid in both places.
# {date} is used for both wp:post_date and wp:post_date_gmt.
TEMPLATE_ITEM = """
<item>
<title>{title}</title>
<link>{link}</link>
<pubDate>{date_rfc822}</pubDate>
<dc:creator><![CDATA[{creator}]]></dc:creator>
<guid isPermaLink="{guid_is_link}">{guid}</guid>
<description>{description}</description>
<content:encoded><![CDATA[{content}]]></content:encoded>
<excerpt:encoded><![CDATA[{excerpt}]]></excerpt:encoded>
<wp:post_id>{post_id}</wp:post_id>
<wp:post_date><![CDATA[{date}]]></wp:post_date>
<wp:post_date_gmt><![CDATA[{date}]]></wp:post_date_gmt>
<wp:comment_status><![CDATA[closed]]></wp:comment_status>
<wp:ping_status><![CDATA[closed]]></wp:ping_status>
<wp:post_name><![CDATA[{slug}]]></wp:post_name>
<wp:status><![CDATA[publish]]></wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type><![CDATA[post]]></wp:post_type>
<wp:post_password><![CDATA[]]></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
<wp:postmeta>
<wp:meta_key><![CDATA[portal_description]]></wp:meta_key>
<wp:meta_value><![CDATA[{description}]]></wp:meta_value>
</wp:postmeta>
</item>
"""
def _cdata_safe(text):
    """Return ``str(text)`` made safe for use inside a ``<![CDATA[...]]>`` section.

    A literal ``]]>`` would end the section early, so it is split across two
    adjacent CDATA sections.
    """
    return str(text).replace(']]>', ']]]]><![CDATA[>')


def _absolutize_images(soup):
    """Turn image sources that live on the old site into absolute https URLs.

    The ``src`` attribute of every affected ``<img>`` in *soup* is rewritten in
    place and the resulting URL is printed to stderr (one per line), because
    those images have to be imported manually.
    """
    site_root = OLD_URL.rstrip('/')
    for img in soup.find_all('img'):
        src = img.attrs.get('src', '')
        # Normalise the scheme only (not "http://" occurring later in the URL).
        if src.startswith('http://'):
            src = 'https://' + src[len('http://'):]
        elif src.startswith('//'):
            # Protocol-relative URL: it names a host, it is not a site path.
            src = 'https:' + src

        if src.startswith(OLD_URL):
            # Already absolute; prepending the site root again would break it.
            full_src = src
        elif src.startswith('/'):
            full_src = site_root + src
        else:
            continue
        print(full_src, file=sys.stderr)
        img.attrs['src'] = full_src


def _render_item(post_id, entry, fallback_link):
    """Render one feed *entry* as a WordPress export ``<item>`` string.

    *post_id* is the number written to ``wp:post_id`` (and used for the guid
    fallback); *fallback_link* is used when the entry has no link of its own.
    """
    body_soup = BeautifulSoup(
        entry.get('summary', entry.get('description', '')), 'html.parser'
    )
    _absolutize_images(body_soup)
    content = _cdata_safe(body_soup)

    description = BeautifulSoup(
        entry.get('description', entry.get('summary', '')), 'html.parser'
    ).text

    published = entry.get('published')
    if published:
        when = parse_date(published)
        date_rfc822 = escape(published)
    else:
        # No date in the feed: fall back to the current local time and make
        # sure <pubDate> still is an RFC 822 date.
        when = datetime.now().astimezone()
        date_rfc822 = format_datetime(when)

    # The last path segment of the entry link becomes the slug; a trailing
    # slash is ignored and a missing link yields an empty slug.
    slug = entry.get('link', '').rstrip('/').rsplit('/', 1)[-1]

    # feedparser reports guidislink as a bool; the attribute needs true/false.
    guid_is_link = str(entry.get('guidislink', 'false')).lower() == 'true'

    return TEMPLATE_ITEM.format(
        title=escape(entry.get('title', '')),
        link=escape(entry.get('link', fallback_link)),
        date_rfc822=date_rfc822,
        date=when.strftime('%Y-%m-%d %H:%M:%S'),
        creator=_cdata_safe(entry.get('author', '')),
        guid_is_link='true' if guid_is_link else 'false',
        guid=escape(
            entry.get('id', '{url} {pid}'.format(url=OLD_URL, pid=post_id))
        ),
        # The template uses the description both as element text and inside a
        # CDATA section. Escaped text is valid in both places (it cannot
        # contain "]]>"); the meta value then carries XML entities.
        description=escape(description),
        content=content,
        excerpt=content,
        post_id=post_id,
        slug=_cdata_safe(slug),
    )


def main():
    """Load the RSS feed, parse it, detect images, and print wpx content.

    The WordPress export XML goes to stdout; the URLs of images hosted on the
    old site go to stderr.
    """
    feed = parse_feed(FEED_URL)
    channel = feed['feed']
    site_link = channel.get('link', OLD_URL)

    items = ''.join(
        _render_item(post_id, entry, site_link)
        for post_id, entry in enumerate(feed['entries'])
    )

    wpx = TEMPLATE.format(
        # <pubDate> requires an RFC 822 date, not str(datetime).
        now=format_datetime(datetime.now().astimezone()),
        title=escape(channel.get('title', '')),
        page_url=escape(site_link),
        page_description=escape(channel.get('subtitle', '')),
        lang=escape(channel.get('language', 'en_US')),
        author=_cdata_safe(AUTHOR),
        author_email=_cdata_safe(AUTHOR_EMAIL),
        author_display_name='',
        author_first_name='',
        author_last_name='',
        items=items,
    )
    # The template starts with a newline; the XML declaration must come first.
    print(wpx.strip())
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment