sedrubal/wp_feed_import.py

## wp_feed_import.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Import posts from an RSS feed and convert them to the WordPress export format.

This works for our Drupal 6.
It prints the content of the WordPress export xml (wpx) to stdout and
all images that have to be imported manually to stderr.

You need to install WordPress-importer plugin in WordPress and import the file.

Note that an RSS feed may not contain all posts, but only e.g. the last 30 entries.

BTW: This is dirty. Use it at your own risk TM
"""

# "THE BEER-WARE LICENSE" (Revision 42):
# <basti.endres@fablab.fau.de> wrote this file. As long as you retain this
# notice you can do whatever you want with this stuff. If we meet some day,
# and you think this stuff is worth it, you can buy me a beer in return.

import sys
from datetime import datetime

from bs4 import BeautifulSoup
from dateutil.parser import parse as parse_date

from feedparser import parse as parse_feed

# RSS feed that main() downloads and converts.
FEED_URL = 'https://fablab.fau.de/rss.xml'
# Base URL of the old site. main() uses it to recognise images hosted there
# and as a fallback for missing link / guid values.
OLD_URL = 'https://fablab.fau.de/'
# NOTE(review): not referenced anywhere in the visible part of this file.
WP_URL = 'https://marvin.fablab.fau.de/'
# Author login and e-mail written into the <wp:author> element of the export.
AUTHOR = 'Sebastian Endres'
AUTHOR_EMAIL = 'basti.endres@fablab.fau.de'


# Skeleton of the WordPress eXtended RSS (WXR) document printed by main().
# It is rendered with str.format(); the placeholders are {now}, {title},
# {page_url}, {page_description}, {lang}, {author}, {author_email},
# {author_display_name}, {author_first_name}, {author_last_name} and {items}
# (the concatenated TEMPLATE_ITEM fragments).  main() strips the surrounding
# whitespace of the rendered result before printing it.
TEMPLATE = """
<?xml version="1.0" encoding="UTF-8" ?>
<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
<!-- You may use this file to transfer that content from one site to another. -->
<!-- This file is not intended to serve as a complete backup of your site. -->

<!-- To import this information into a WordPress site follow these steps: -->
<!-- 1. Log in to that site as an administrator. -->
<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
<!-- 3. Install the "WordPress" importer from the list. -->
<!-- 4. Activate & Run Importer. -->
<!-- 5. Upload this file using the form provided on that page. -->
<!-- 6. You will first be asked to map the authors in this export file to users -->
<!--    on the site. For each author, you may choose to map to an -->
<!--    existing user on the site or to create a new user. -->
<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
<!--    contained in this file into your site. -->

<!-- generator="wp_feed_import.py" created="{now}" -->
<rss version="2.0"
  xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
  xmlns:content="http://purl.org/rss/1.0/modules/content/"
  xmlns:wfw="http://wellformedweb.org/CommentAPI/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
  xmlns:wp="http://wordpress.org/export/1.2/">
  <channel>
    <title>{title}</title>
    <link>{page_url}</link>
    <description>{page_description}</description>
    <pubDate>{now}</pubDate>
    <language>{lang}</language>
    <wp:wxr_version>1.2</wp:wxr_version>
    <wp:base_site_url>{page_url}</wp:base_site_url>
    <wp:base_blog_url>{page_url}</wp:base_blog_url>
    <wp:author>
      <wp:author_id>1</wp:author_id><wp:author_login><![CDATA[{author}]]></wp:author_login>
      <wp:author_email><![CDATA[{author_email}]]></wp:author_email>
      <wp:author_display_name><![CDATA[{author_display_name}]]></wp:author_display_name>
      <wp:author_first_name><![CDATA[{author_first_name}]]></wp:author_first_name>
      <wp:author_last_name><![CDATA[{author_last_name}]]></wp:author_last_name>
    </wp:author>

    <generator>wp_feed_import.py</generator>

    <wp:category>
      <wp:term_id>1</wp:term_id>
      <wp:category_nicename><![CDATA[blog]]></wp:category_nicename>
      <wp:category_parent><![CDATA[]]></wp:category_parent>
      <wp:cat_name><![CDATA[Blog]]></wp:cat_name>
    </wp:category>

{items}
  </channel>
</rss>
"""

# Per-post fragment of the export; main() renders one per feed entry with
# str.format().  {description} is used twice (item description and the
# portal_description post meta); both occurrences are CDATA sections so that
# a single, CDATA-safe value is correct in both places.
TEMPLATE_ITEM = """
    <item>
      <title>{title}</title>
      <link>{link}</link>
      <pubDate>{date_rfc822}</pubDate>
      <dc:creator><![CDATA[{creator}]]></dc:creator>
      <guid isPermaLink="{guid_is_link}">{guid}</guid>
      <description><![CDATA[{description}]]></description>
      <content:encoded><![CDATA[{content}]]></content:encoded>
      <excerpt:encoded><![CDATA[{excerpt}]]></excerpt:encoded>
      <wp:post_id>{post_id}</wp:post_id>
      <wp:post_date><![CDATA[{date}]]></wp:post_date>
      <wp:post_date_gmt><![CDATA[{date}]]></wp:post_date_gmt>
      <wp:comment_status><![CDATA[closed]]></wp:comment_status>
      <wp:ping_status><![CDATA[closed]]></wp:ping_status>
      <wp:post_name><![CDATA[{slug}]]></wp:post_name>
      <wp:status><![CDATA[publish]]></wp:status>
      <wp:post_parent>0</wp:post_parent>
      <wp:menu_order>0</wp:menu_order>
      <wp:post_type><![CDATA[post]]></wp:post_type>
      <wp:post_password><![CDATA[]]></wp:post_password>
      <wp:is_sticky>0</wp:is_sticky>
      <wp:postmeta>
        <wp:meta_key><![CDATA[portal_description]]></wp:meta_key>
        <wp:meta_value><![CDATA[{description}]]></wp:meta_value>
      </wp:postmeta>
    </item>
"""


def _cdata_safe(text):
    """Return ``str(text)`` so it cannot terminate an enclosing CDATA section.

    A literal ``]]>`` is split across two adjacent CDATA sections, which an
    XML parser joins back into the original text.
    """
    return str(text).replace(']]>', ']]]]><![CDATA[>')


def _local_image_url(src, site_url):
    """Return the absolute https URL of *src* if it lives on *site_url*.

    Handles site-relative (``/path``), protocol-relative (``//host/path``)
    and ``http://`` sources.  Returns ``None`` for images hosted elsewhere
    and for an empty *src*.
    """
    if src.startswith('//'):
        src = 'https:' + src
    elif src.startswith('http://'):
        src = 'https://' + src[len('http://'):]
    if src.startswith(site_url):
        # Already absolute: must not get the site root prepended again.
        return src
    if src.startswith('/'):
        return site_url.rstrip('/') + src
    return None


def main():
    """Load the RSS feed, parse it, detect images, and print wpx content.

    Downloads FEED_URL, renders every entry through TEMPLATE_ITEM, embeds the
    result in TEMPLATE and prints the document to stdout.  For every image
    hosted on OLD_URL the absolute URL is printed to stderr (one per line)
    and written back into the post content.

    Values placed in XML element text are entity-escaped; values placed in
    CDATA sections are made CDATA-safe instead.
    """
    # Function-scope imports: only this function needs them.
    from email.utils import format_datetime
    from xml.sax.saxutils import escape

    feed = parse_feed(FEED_URL)
    channel = feed['feed']
    items = []
    for post_id, entry in enumerate(feed['entries']):
        content_soup = BeautifulSoup(
            entry.get('summary', entry.get('description', '')), 'html.parser'
        )
        for img in content_soup.find_all('img'):
            img_full_src = _local_image_url(img.attrs.get('src', ''), OLD_URL)
            if img_full_src is None:
                continue
            print(img_full_src, file=sys.stderr)
            img.attrs['src'] = img_full_src

        # Take the publication date once so <pubDate> and <wp:post_date>
        # agree; without one, fall back to "now" in RFC 822 form.
        published = entry.get('published')
        if published:
            post_date = parse_date(published)
        else:
            post_date = datetime.now()
            published = format_datetime(post_date)

        # The entry link is optional: an entry without one gets an empty
        # slug instead of raising KeyError.  A trailing slash is ignored.
        entry_link = entry.get('link', '')
        slug = entry_link.rstrip('/').rsplit('/', 1)[-1]
        link = entry_link or channel.get('link', OLD_URL)

        content = _cdata_safe(content_soup)
        description = BeautifulSoup(
            entry.get('description', entry.get('summary', '')), 'html.parser'
        ).text
        guid = entry.get('id', '{url} {pid}'.format(url=OLD_URL, pid=post_id))

        items.append(TEMPLATE_ITEM.format(
            title=escape(entry.get('title', '')),
            link=escape(link),
            date_rfc822=escape(published),
            date=post_date.strftime('%Y-%m-%d %H:%M:%S'),
            creator=_cdata_safe(entry.get('author', '')),
            # isPermaLink takes lowercase 'true'/'false', not a Python bool.
            guid_is_link='true' if entry.get('guidislink', False) else 'false',
            guid=escape(guid),
            description=_cdata_safe(description),
            content=content,
            excerpt=content,
            post_id=post_id,
            slug=_cdata_safe(slug),
        ))

    wpx = TEMPLATE.format(
        now=datetime.now(),
        title=escape(channel.get('title', '')),
        page_url=escape(channel.get('link', OLD_URL)),
        page_description=escape(channel.get('subtitle', '')),
        lang=escape(channel.get('language', 'en_US')),
        author=AUTHOR,
        author_email=AUTHOR_EMAIL,
        author_display_name='',
        author_first_name='',
        author_last_name='',
        items=''.join(items),
    )
    print(wpx.strip())

# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python3
	# -- coding: utf-8 --

	"""
	Import posts from an RSS feed and convert them to the WordPress export format.

	This works for our Drupal 6.
	It prints the content of the WordPress export xml (wpx) to stdout and
	all images that have to be imported manually to stderr.

	You need to install WordPress-importer plugin in WordPress and import the file.

	Note that an RSS feed may not contain all posts, but only e.g. the last 30 entries.

	BTW: This is dirty. Use it at your own risk TM
	"""

	# "THE BEER-WARE LICENSE" (Revision 42):
	# <basti.endres@fablab.fau.de> wrote this file. As long as you retain this
	# notice you can do whatever you want with this stuff. If we meet some day,
	# and you think this stuff is worth it, you can buy me a beer in return.

	import sys
	from datetime import datetime

	from bs4 import BeautifulSoup
	from dateutil.parser import parse as parse_date

	from feedparser import parse as parse_feed

	# RSS feed that main() downloads and converts.
	FEED_URL = 'https://fablab.fau.de/rss.xml'
	# Base URL of the old site. main() uses it to recognise images hosted there
	# and as a fallback for missing link / guid values.
	OLD_URL = 'https://fablab.fau.de/'
	# NOTE(review): not referenced anywhere in the visible part of this file.
	WP_URL = 'https://marvin.fablab.fau.de/'
	# Author login and e-mail written into the <wp:author> element of the export.
	AUTHOR = 'Sebastian Endres'
	AUTHOR_EMAIL = 'basti.endres@fablab.fau.de'


	# Skeleton of the WordPress eXtended RSS (WXR) document printed by main().
	# It is rendered with str.format(); the placeholders are {now}, {title},
	# {page_url}, {page_description}, {lang}, {author}, {author_email},
	# {author_display_name}, {author_first_name}, {author_last_name} and {items}
	# (the concatenated TEMPLATE_ITEM fragments).  main() strips the surrounding
	# whitespace of the rendered result before printing it.
	TEMPLATE = """
	<?xml version="1.0" encoding="UTF-8" ?>
	<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
	<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
	<!-- You may use this file to transfer that content from one site to another. -->
	<!-- This file is not intended to serve as a complete backup of your site. -->

	<!-- To import this information into a WordPress site follow these steps: -->
	<!-- 1. Log in to that site as an administrator. -->
	<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
	<!-- 3. Install the "WordPress" importer from the list. -->
	<!-- 4. Activate & Run Importer. -->
	<!-- 5. Upload this file using the form provided on that page. -->
	<!-- 6. You will first be asked to map the authors in this export file to users -->
	<!-- on the site. For each author, you may choose to map to an -->
	<!-- existing user on the site or to create a new user. -->
	<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
	<!-- contained in this file into your site. -->

	<!-- generator="wp_feed_import.py" created="{now}" -->
	<rss version="2.0"
	xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:wp="http://wordpress.org/export/1.2/">
	<channel>
	<title>{title}</title>
	<link>{page_url}</link>
	<description>{page_description}</description>
	<pubDate>{now}</pubDate>
	<language>{lang}</language>
	<wp:wxr_version>1.2</wp:wxr_version>
	<wp:base_site_url>{page_url}</wp:base_site_url>
	<wp:base_blog_url>{page_url}</wp:base_blog_url>
	<wp:author>
	<wp:author_id>1</wp:author_id><wp:author_login><![CDATA[{author}]]></wp:author_login>
	<wp:author_email><![CDATA[{author_email}]]></wp:author_email>
	<wp:author_display_name><![CDATA[{author_display_name}]]></wp:author_display_name>
	<wp:author_first_name><![CDATA[{author_first_name}]]></wp:author_first_name>
	<wp:author_last_name><![CDATA[{author_last_name}]]></wp:author_last_name>
	</wp:author>

	<generator>wp_feed_import.py</generator>

	<wp:category>
	<wp:term_id>1</wp:term_id>
	<wp:category_nicename><![CDATA[blog]]></wp:category_nicename>
	<wp:category_parent><![CDATA[]]></wp:category_parent>
	<wp:cat_name><![CDATA[Blog]]></wp:cat_name>
	</wp:category>

	{items}
	</channel>
	</rss>
	"""

	# Per-post fragment of the export; main() renders one per feed entry with
	# str.format().  {description} is used twice (item description and the
	# portal_description post meta); both occurrences are CDATA sections so that
	# a single, CDATA-safe value is correct in both places.
	TEMPLATE_ITEM = """
	<item>
	<title>{title}</title>
	<link>{link}</link>
	<pubDate>{date_rfc822}</pubDate>
	<dc:creator><![CDATA[{creator}]]></dc:creator>
	<guid isPermaLink="{guid_is_link}">{guid}</guid>
	<description><![CDATA[{description}]]></description>
	<content:encoded><![CDATA[{content}]]></content:encoded>
	<excerpt:encoded><![CDATA[{excerpt}]]></excerpt:encoded>
	<wp:post_id>{post_id}</wp:post_id>
	<wp:post_date><![CDATA[{date}]]></wp:post_date>
	<wp:post_date_gmt><![CDATA[{date}]]></wp:post_date_gmt>
	<wp:comment_status><![CDATA[closed]]></wp:comment_status>
	<wp:ping_status><![CDATA[closed]]></wp:ping_status>
	<wp:post_name><![CDATA[{slug}]]></wp:post_name>
	<wp:status><![CDATA[publish]]></wp:status>
	<wp:post_parent>0</wp:post_parent>
	<wp:menu_order>0</wp:menu_order>
	<wp:post_type><![CDATA[post]]></wp:post_type>
	<wp:post_password><![CDATA[]]></wp:post_password>
	<wp:is_sticky>0</wp:is_sticky>
	<wp:postmeta>
	<wp:meta_key><![CDATA[portal_description]]></wp:meta_key>
	<wp:meta_value><![CDATA[{description}]]></wp:meta_value>
	</wp:postmeta>
	</item>
	"""


	def _cdata_safe(text):
		"""Return ``str(text)`` so it cannot terminate an enclosing CDATA section.

		A literal ``]]>`` is split across two adjacent CDATA sections, which an
		XML parser joins back into the original text.
		"""
		return str(text).replace(']]>', ']]]]><![CDATA[>')


	def _local_image_url(src, site_url):
		"""Return the absolute https URL of *src* if it lives on *site_url*.

		Handles site-relative (``/path``), protocol-relative (``//host/path``)
		and ``http://`` sources.  Returns ``None`` for images hosted elsewhere
		and for an empty *src*.
		"""
		if src.startswith('//'):
			src = 'https:' + src
		elif src.startswith('http://'):
			src = 'https://' + src[len('http://'):]
		if src.startswith(site_url):
			# Already absolute: must not get the site root prepended again.
			return src
		if src.startswith('/'):
			return site_url.rstrip('/') + src
		return None


	def main():
		"""Load the RSS feed, parse it, detect images, and print wpx content.

		Downloads FEED_URL, renders every entry through TEMPLATE_ITEM, embeds the
		result in TEMPLATE and prints the document to stdout.  For every image
		hosted on OLD_URL the absolute URL is printed to stderr (one per line)
		and written back into the post content.

		Values placed in XML element text are entity-escaped; values placed in
		CDATA sections are made CDATA-safe instead.
		"""
		# Function-scope imports: only this function needs them.
		from email.utils import format_datetime
		from xml.sax.saxutils import escape

		feed = parse_feed(FEED_URL)
		channel = feed['feed']
		items = []
		for post_id, entry in enumerate(feed['entries']):
			content_soup = BeautifulSoup(
				entry.get('summary', entry.get('description', '')), 'html.parser'
			)
			for img in content_soup.find_all('img'):
				img_full_src = _local_image_url(img.attrs.get('src', ''), OLD_URL)
				if img_full_src is None:
					continue
				print(img_full_src, file=sys.stderr)
				img.attrs['src'] = img_full_src

			# Take the publication date once so <pubDate> and <wp:post_date>
			# agree; without one, fall back to "now" in RFC 822 form.
			published = entry.get('published')
			if published:
				post_date = parse_date(published)
			else:
				post_date = datetime.now()
				published = format_datetime(post_date)

			# The entry link is optional: an entry without one gets an empty
			# slug instead of raising KeyError.  A trailing slash is ignored.
			entry_link = entry.get('link', '')
			slug = entry_link.rstrip('/').rsplit('/', 1)[-1]
			link = entry_link or channel.get('link', OLD_URL)

			content = _cdata_safe(content_soup)
			description = BeautifulSoup(
				entry.get('description', entry.get('summary', '')), 'html.parser'
			).text
			guid = entry.get('id', '{url} {pid}'.format(url=OLD_URL, pid=post_id))

			items.append(TEMPLATE_ITEM.format(
				title=escape(entry.get('title', '')),
				link=escape(link),
				date_rfc822=escape(published),
				date=post_date.strftime('%Y-%m-%d %H:%M:%S'),
				creator=_cdata_safe(entry.get('author', '')),
				# isPermaLink takes lowercase 'true'/'false', not a Python bool.
				guid_is_link='true' if entry.get('guidislink', False) else 'false',
				guid=escape(guid),
				description=_cdata_safe(description),
				content=content,
				excerpt=content,
				post_id=post_id,
				slug=_cdata_safe(slug),
			))

		wpx = TEMPLATE.format(
			now=datetime.now(),
			title=escape(channel.get('title', '')),
			page_url=escape(channel.get('link', OLD_URL)),
			page_description=escape(channel.get('subtitle', '')),
			lang=escape(channel.get('language', 'en_US')),
			author=AUTHOR,
			author_email=AUTHOR_EMAIL,
			author_display_name='',
			author_first_name='',
			author_last_name='',
			items=''.join(items),
		)
		print(wpx.strip())

	# Run the conversion only when this file is executed as a script.
	if __name__ == '__main__':
		main()