battis/google_reader_to_wordpress.php

## google_reader_to_wordpress.php
/**********************************************************************
 * Google Reader to Wordpress
 *
 * 2011-11-27
 * Seth Battis (seth@battis.net)
 *
 * This script takes the output of Google Reader's JSON export of
 * shared items and converts it into an XML file that can be imported
 * into a Wordpress blog as posts. All of the data in the original JSON
 * file is preserved in the XML file, either by transferring it to
 * an appropriate format (e.g. Google Reader categories are converted
 * to WordPress post tags) or simply as an additional XML tag (e.g. the
 * Google Reader commentInfo metadata for recent shared items). In
 * situations where actual data has to be converted to make it readable
 * for WordPress, the original data is included as the JSON attribute
 * of that tag (e.g. timestamps and categories).
 *
 * As currently written, the script looks in its local directory for
 * a Google Reader "shared-items.json" file and generates a matching
 * "shared-items.xml" file, also in its local directory.
 *
 * There are a number of potentially configurable (i.e. arbitrary)
 * values marked as TODO.
 *
 * Caveat emptor: this has been tested against my ~1000 item Google
 * Reader shared items feed and on my WordPress 3.2.1 site. I would
 * presume that it should work fairly well for others, but make no
 * guarantees!
 *********************************************************************/

/* returns a Wordpress slug-version of the given text (only
   alphanumeric characters and dashes).

   The text is lower-cased, every run of characters other than a-z and
   0-9 is collapsed into a single dash, and dashes left at either end
   (from leading/trailing punctuation or whitespace) are stripped, so
   "Hello, World!" becomes "hello-world" rather than "hello-world-".

   $text   -- the text to convert (e.g. a post title or a tag name)
   returns -- the slug; an empty string if $text contains no ASCII
              letters or digits */
function sluggify($text)
{
	$slug = preg_replace("|[^a-z0-9]+|", "-", strtolower((string) $text));

	/* preg_replace() returns null on failure, hence the cast */
	return trim((string) $slug, "-");
}

/* SimpleXML doesn't really support namespaces unless you force it, so
   the root element is parsed from a literal that declares every
   namespace prefix used further down */
$xml = '<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0"
	xmlns:excerpt="http://wordpress.org/export/1.1/excerpt/"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:wp="http://wordpress.org/export/1.1/"
></rss>';
$rss = new SimpleXMLElement($xml);
$namespaces = $rss->getDocNamespaces(true);

/* load the Google Reader JSON file -- stop with a clear message if it
   is missing or is not valid JSON, rather than carrying on and writing
   out an empty feed */
$file = file_get_contents("shared-items.json");
if ($file === false)
{
	throw new RuntimeException("Unable to read shared-items.json");
}
$json = json_decode($file, true);
if (!is_array($json))
{
	throw new RuntimeException("shared-items.json does not contain a valid JSON object");
}

/* Wordpress will choke if our post names aren't unique, so we track
   them separately */
$post_names = array();

/* header information describing the file itself */
$channel = $rss->addChild("channel");
$channel->addAttribute("direction", $json["direction"]);
$channel->addAttribute("id", $json["id"]);
$channel->addAttribute("self", $json["self"][0]["href"]);
$channel->addAttribute("author", $json["author"]);

/* addChild() reads its value as XML text in which "&" starts an entity
   reference, so the value is escaped first (addAttribute() does not
   need this) */
$channel->addChild("title", htmlspecialchars($json["title"], ENT_COMPAT, "UTF-8"));
$channel->addChild("link");
$channel->addChild("description");

/* RFC 822 date with a two-digit hour ("H"), always expressed in GMT */
$pubDate = $channel->addChild("pubDate", gmdate("D, d M Y H:i:s O", $json["updated"]));
$pubDate->addAttribute("json", $json["updated"]);
$channel->addChild("language");
$channel->addChild("wxr_version", "1.1", $namespaces["wp"]);

/* run through the list of items and add them to the XML (an export
   without an "items" list simply produces a feed with no posts).

   A note on escaping, which applies to every addChild() call with a
   text value below: addChild() reads its value as XML text in which
   "&" starts an entity reference, so a bare "&" breaks the node and
   entities already present in the value get decoded. Each value is
   therefore passed through htmlspecialchars() so that the text stored
   in the XML is exactly the string from the JSON file. addAttribute()
   does not need this. */
$items = (isset($json["items"]) && is_array($json["items"])) ? $json["items"] : array();
foreach ($items as $item)
{
	$rssItem = $channel->addChild("item");

	/* a bunch of Google Reader-specific metadata */
	if (isset($item["isReadStateLocked"]))
	{
		$rssItem->addAttribute("isReadStateLocked", $item["isReadStateLocked"]);
	}
	$rssItem->addAttribute("crawlTimeMsec", $item["crawlTimeMsec"]);
	$rssItem->addAttribute("timestampUsec", $item["timestampUsec"]);
	$rssItem->addAttribute("id", $item["id"]);
	if (isset($item["commentInfo"]))
	{
		/* plain foreach: each() was removed in PHP 8 */
		foreach ($item["commentInfo"] as $commentInfoKey => $commentInfoValue)
		{
			$commentInfo = $rssItem->addChild("commentInfo", htmlspecialchars((string) $commentInfoKey, ENT_COMPAT, "UTF-8"));
			$commentInfo->addAttribute("permalinkUrl", $commentInfoValue["permalinkUrl"]);
			$commentInfo->addAttribute("commentState", $commentInfoValue["commentState"]);
		}
	}

	/* annoyingly, not every item has its content in the content
	   element -- sometimes it's in the summary element (and once in a
	   while, it's just not there). I think this is an artifact of how
	   the original RSS feeds were constructed. I think.

	   $content is kept as raw HTML from here on and is escaped exactly
	   once, when it is finally added to the XML. */
	if (isset($item["content"]))
	{
		$content = $item["content"]["content"];
	}
	else if (isset($item["summary"]["content"]))
	{
		$content = $item["summary"]["content"];
	}
	else
	{
		$content = "";
	}

	/* sometimes items don't even have titles */
	if (isset($item["title"]))
	{
		$rssItem->addChild("title", htmlspecialchars($item["title"], ENT_COMPAT, "UTF-8"));
	}

	/* most items store the original linkback information in the
	   alternate element -- Wordpress won't honor this link tag when
	   it's imported (i.e. it won't treat it like the Daring Fireball
	   feed), so I have embedded a more descriptive linkback after the
	   annotations at the start of the content, using some information
	   from the origin element. The linkback is in the
	   "google-reader-alternate-href" div (for easy CSS-wrangling!) */
	if (isset($item["alternate"]))
	{
		$rssItem->addChild("link", htmlspecialchars($item["alternate"][0]["href"], ENT_COMPAT, "UTF-8"));
		$content = "<div class=\"google-reader-alternate-href\"><p>Originally posted at <a href=\"{$item["alternate"][0]["href"]}\">{$item["origin"]["title"]}</a></p></div>" . $content;
	}

	/* I haven't bothered to figure out if the dates are really GMT or
	   localized -- GMT was an easier assumption to make. The format is
	   an RFC 822 date with a two-digit hour ("H"). */
	$pubDate = $rssItem->addChild("pubDate", gmdate("D, d M Y H:i:s O", $item["published"]));
	$pubDate->addAttribute("json", $item["published"]);

	/* not every item has an author, either -- again, an artifact of
	   the original RSS feeds */
	if (isset($item["author"]))
	{
		$rssItem->addChild("creator", htmlspecialchars($item["author"], ENT_COMPAT, "UTF-8"), $namespaces["dc"]);
	}

	/* annotations were tricky -- I have added them as their own XML
	   tags _and_ inserted them within a "google-reader-annotation" div
	   at the top of the post content (to match the original format on-
	   screen). All of the original data is preserved in the XML tag,
	   with an ID that matches the embedded div ID. */
	$annotations = (isset($item["annotations"]) && is_array($item["annotations"])) ? $item["annotations"] : array();
	foreach ($annotations as $annotation)
	{
		$annotationId = md5($annotation["content"] . $annotation["author"]);
		$content = "<div id=\"{$annotationId}\" class=\"google-reader-annotation\"><blockquote><p>{$annotation["content"]}</p><p class=\"author\">{$annotation["author"]}</p></blockquote></div>" . $content;
		$rssAnnotation = $rssItem->addChild("annotation", htmlspecialchars($annotation["content"], ENT_COMPAT, "UTF-8"));
		$rssAnnotation->addAttribute("id", $annotationId);
		$rssAnnotation->addAttribute("author", $annotation["author"]);
		$rssAnnotation->addAttribute("userId", $annotation["userId"]);
		$rssAnnotation->addAttribute("profileId", $annotation["profileId"]);
		$rssAnnotation->addAttribute("profileCardParams", $annotation["profileCardParams"]);
	}

	/* again, sometimes content is in content, sometimes it's in the
	   summary element */
	$rssContent = $rssItem->addChild("encoded", htmlspecialchars($content, ENT_COMPAT, "UTF-8"), $namespaces["content"]);
	if (isset($item["content"]))
	{
		$rssContent->addAttribute("direction", $item["content"]["direction"]);
	}
	if (isset($item["summary"]))
	{
		$excerpt = $rssItem->addChild("encoded", htmlspecialchars($item["summary"]["content"], ENT_COMPAT, "UTF-8"), $namespaces["excerpt"]);
		$excerpt->addAttribute("direction", $item["summary"]["direction"]);
	}

	/* more Google reader metadata, this time about the original feed
	   that the item came from -- which is used above to format the
	   linkback that is embedded at the start of the content */
	$origin = $rssItem->addChild("origin");
	$origin->addAttribute("streamId", $item["origin"]["streamId"]);
	$origin->addAttribute("title", $item["origin"]["title"]);
	$origin->addAttribute("htmlUrl", $item["origin"]["htmlUrl"]);

	/* it's not clear to me whether the published or modified date is
	   when the original post was published or when the item was
	   shared -- I think that published refers to when it was shared.
	   Two-digit hour ("H") so the value is a regular datetime string. */
	$postDate = $rssItem->addChild("post_date", date("Y-m-d H:i:s", $item["published"]), $namespaces["wp"]);
	$postDate->addAttribute("json", $item["published"]);
	$rssItem->addChild("comment_status", "open", $namespaces["wp"]);	// TODO make configurable
	$rssItem->addChild("ping_status", "open", $namespaces["wp"]);		// TODO make configurable

	/* make a Wordpress friendly title slug for the post */
	if (isset($item["title"]))
	{
		$slug = sluggify($item["title"]);
	}
	else
	{
		/* if no title, generate the slug from the timestamp */
		$slug = date("Y-m-d-G-i-s", $item["published"]);
	}

	/* make sure that our slug is unique -- add a counter to the end
	   if it is not, and track those counter values in $post_names[].
	   The slug that is finally chosen is registered as well, so that a
	   suffixed slug ("foo-1") can never clash with a later item whose
	   own title produces that same text. */
	$baseSlug = $slug;
	while (isset($post_names[$slug]))
	{
		$post_names[$baseSlug]++;
		$slug = $baseSlug . "-" . $post_names[$baseSlug];
	}
	$post_names[$slug] = 0;
	$rssItem->addChild("post_name", $slug, $namespaces["wp"]);

	/* more Wordpress metadata -- all of which could be tweaked */
	$rssItem->addChild("status", "publish", $namespaces["wp"]);	// TODO make configurable
	$rssItem->addChild("post_parent", 0, $namespaces["wp"]);	// TODO make configurable
	$rssItem->addChild("menu_order", 0, $namespaces["wp"]);		// TODO make configurable
	$rssItem->addChild("post_type", "post", $namespaces["wp"]);	// TODO make configurable
	$rssItem->addChild("post_password", "", $namespaces["wp"]);	// TODO make configurable
	$rssItem->addChild("is_sticky", 0, $namespaces["wp"]);		// TODO make configurable

	/* convert categories to post tags -- nota bene that Google Reader
	   has conflated the reader's categories with the original post's
	   tags, creating a... mish-mash. Reader's own com.google state
	   categories are skipped. */
	$categories = (isset($item["categories"]) && is_array($item["categories"])) ? $item["categories"] : array();
	foreach ($categories as $category)
	{
		if (!preg_match("|.*/com\.google/.*|", $category))
		{
			/* strip the "user/<digits>/label/" prefix from the reader's own labels */
			$cleanCategory = preg_replace("|user/\d+/label/(.*)|", "$1", $category);
			$rssCategory = $rssItem->addChild("category", htmlspecialchars($cleanCategory, ENT_COMPAT, "UTF-8"));
			$rssCategory->addAttribute("domain", "post_tag");
			$rssCategory->addAttribute("nicename", sluggify($cleanCategory));
			$rssCategory->addAttribute("json", $category);
		}
	}

	/* add comments -- note that for privacy reasons, while the
	   commenter's metadata is added as an XML tag, it is not embedded
	   in the Wordpress-readable wp:comment tags */
	$comments = (isset($item["comments"]) && is_array($item["comments"])) ? $item["comments"] : array();
	foreach ($comments as $comment)
	{
		$rssComment = $rssItem->addChild("comment", "", $namespaces["wp"]);
		$rssComment->addAttribute("id", $comment["id"]);
		$commentContent = $rssComment->addChild("comment_content", htmlspecialchars($comment["htmlContent"], ENT_COMPAT, "UTF-8"), $namespaces["wp"]);
		$commentContent->addAttribute("plainContent", $comment["plainContent"]);
		$author = $rssComment->addChild("comment_author", htmlspecialchars($comment["author"], ENT_COMPAT, "UTF-8"), $namespaces["wp"]);
		$author->addAttribute("userId", $comment["userId"]);
		$author->addAttribute("profileId", $comment["profileId"]);
		$author->addAttribute("profileCardParams", $comment["profileCardParams"]);
		/* attribute name spelled like the JSON key it mirrors */
		$author->addAttribute("venueStreamId", $comment["venueStreamId"]);
		$commentDate = $rssComment->addChild("comment_date", $comment["createdTime"], $namespaces["wp"]);
		$commentDate->addAttribute("modifiedTime", $comment["modifiedTime"]);
		$rssComment->addAttribute("isSpam", $comment["isSpam"]);
	}
}

/* dump the converted XML out as a file -- the document is serialized
   once and the same string is used for both the response and
   shared-items.xml. asXML() returns false on failure, in which case
   nothing is sent and the existing file is left untouched. */
$xmlOutput = $rss->asXML();
if ($xmlOutput === false)
{
	throw new RuntimeException("Unable to serialize the converted feed as XML");
}
header("Content-type: text/xml");
echo $xmlOutput;
file_put_contents("shared-items.xml", $xmlOutput);
	/**********************************************************************
	* Google Reader to Wordpress
	*
	* 2011-11-27
	* Seth Battis (seth@battis.net)
	*
	* This script takes the output of Google Reader's JSON export of
	* shared items and converts it into an XML file that can be imported
	* into a Wordpress blog as posts. All of the data in the original JSON
	* file is preserved in the XML file, either by transferring it to
	* an appropriate format (e.g. Google Reader categories are converted
	* to WordPress post tags) or simply as an additional XML tag (e.g. the
	* Google Reader commentInfo metadata for recent shared items). In
	* situations where actual data has to be converted to make it readable
	* for WordPress, the original data is included as the JSON attribute
	* of that tag (e.g. timestamps and categories).
	*
	* As currently written, the script looks in its local directory for
	* a Google Reader "shared-items.json" file and generates a matching
	* "shared-items.xml" file, also in its local directory.
	*
	* There are a number of potentially configurable (i.e. arbitrary)
	* values marked as TODO.
	*
	* Caveat emptor: this has been tested against my ~1000 item Google
	* Reader shared items feed and on my WordPress 3.2.1 site. I would
	* presume that it should work fairly well for others, but make no
	* guarantees!
	*********************************************************************/

	/* returns a Wordpress slug-version of the given text (only
	alphanumeric characters and dashes).

	The text is lower-cased, every run of characters other than a-z and
	0-9 is collapsed into a single dash, and dashes left at either end
	are stripped ("Hello, World!" becomes "hello-world").

	The pattern uses plain "|" delimiters: a pattern that starts with a
	backslash is rejected by the preg functions. The declaration is
	guarded because a function of this name is also declared earlier in
	this file, and declaring it twice unconditionally is a fatal error.

	$text   -- the text to convert (e.g. a post title or a tag name)
	returns -- the slug; an empty string if $text contains no ASCII
	           letters or digits */
	if (!function_exists("sluggify"))
	{
		function sluggify($text)
		{
			$slug = preg_replace("|[^a-z0-9]+|", "-", strtolower((string) $text));

			/* preg_replace() returns null on failure, hence the cast */
			return trim((string) $slug, "-");
		}
	}

	/* SimpleXML doesn't really support namespaces unless you force it, so
	the root element is parsed from a literal that declares every
	namespace prefix used further down */
	$xml = '<?xml version="1.0" encoding="UTF-8" ?>
	<rss version="2.0"
	xmlns:excerpt="http://wordpress.org/export/1.1/excerpt/"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:wp="http://wordpress.org/export/1.1/"
	></rss>';
	$rss = new SimpleXMLElement($xml);
	$namespaces = $rss->getDocNamespaces(true);

	/* load the Google Reader JSON file -- stop with a clear message if it
	is missing or is not valid JSON, rather than carrying on and writing
	out an empty feed */
	$file = file_get_contents("shared-items.json");
	if ($file === false)
	{
		throw new RuntimeException("Unable to read shared-items.json");
	}
	$json = json_decode($file, true);
	if (!is_array($json))
	{
		throw new RuntimeException("shared-items.json does not contain a valid JSON object");
	}

	/* Wordpress will choke if our post names aren't unique, so we track
	them separately */
	$post_names = array();

	/* header information describing the file itself */
	$channel = $rss->addChild("channel");
	$channel->addAttribute("direction", $json["direction"]);
	$channel->addAttribute("id", $json["id"]);
	$channel->addAttribute("self", $json["self"][0]["href"]);
	$channel->addAttribute("author", $json["author"]);

	/* addChild() reads its value as XML text in which "&" starts an entity
	reference, so the value is escaped first (addAttribute() does not
	need this) */
	$channel->addChild("title", htmlspecialchars($json["title"], ENT_COMPAT, "UTF-8"));
	$channel->addChild("link");
	$channel->addChild("description");

	/* RFC 822 date with a two-digit hour ("H"), always expressed in GMT */
	$pubDate = $channel->addChild("pubDate", gmdate("D, d M Y H:i:s O", $json["updated"]));
	$pubDate->addAttribute("json", $json["updated"]);
	$channel->addChild("language");
	$channel->addChild("wxr_version", "1.1", $namespaces["wp"]);

	/* run through the list of items and add them to the XML (an export
	without an "items" list simply produces a feed with no posts).

	A note on escaping, which applies to every addChild() call with a
	text value below: addChild() reads its value as XML text in which
	"&" starts an entity reference, so a bare "&" breaks the node and
	entities already present in the value get decoded. Each value is
	therefore passed through htmlspecialchars() so that the text stored
	in the XML is exactly the string from the JSON file. addAttribute()
	does not need this. */
	$items = (isset($json["items"]) && is_array($json["items"])) ? $json["items"] : array();
	foreach ($items as $item)
	{
		$rssItem = $channel->addChild("item");

		/* a bunch of Google Reader-specific metadata */
		if (isset($item["isReadStateLocked"]))
		{
			$rssItem->addAttribute("isReadStateLocked", $item["isReadStateLocked"]);
		}
		$rssItem->addAttribute("crawlTimeMsec", $item["crawlTimeMsec"]);
		$rssItem->addAttribute("timestampUsec", $item["timestampUsec"]);
		$rssItem->addAttribute("id", $item["id"]);
		if (isset($item["commentInfo"]))
		{
			/* plain foreach: each() was removed in PHP 8 */
			foreach ($item["commentInfo"] as $commentInfoKey => $commentInfoValue)
			{
				$commentInfo = $rssItem->addChild("commentInfo", htmlspecialchars((string) $commentInfoKey, ENT_COMPAT, "UTF-8"));
				$commentInfo->addAttribute("permalinkUrl", $commentInfoValue["permalinkUrl"]);
				$commentInfo->addAttribute("commentState", $commentInfoValue["commentState"]);
			}
		}

		/* annoyingly, not every item has its content in the content
		element -- sometimes it's in the summary element (and once in a
		while, it's just not there). I think this is an artifact of how
		the original RSS feeds were constructed. I think.

		$content is kept as raw HTML from here on and is escaped exactly
		once, when it is finally added to the XML. */
		if (isset($item["content"]))
		{
			$content = $item["content"]["content"];
		}
		else if (isset($item["summary"]["content"]))
		{
			$content = $item["summary"]["content"];
		}
		else
		{
			$content = "";
		}

		/* sometimes items don't even have titles */
		if (isset($item["title"]))
		{
			$rssItem->addChild("title", htmlspecialchars($item["title"], ENT_COMPAT, "UTF-8"));
		}

		/* most items store the original linkback information in the
		alternate element -- Wordpress won't honor this link tag when
		it's imported (i.e. it won't treat it like the Daring Fireball
		feed), so I have embedded a more descriptive linkback after the
		annotations at the start of the content, using some information
		from the origin element. The linkback is in the
		"google-reader-alternate-href" div (for easy CSS-wrangling!) */
		if (isset($item["alternate"]))
		{
			$rssItem->addChild("link", htmlspecialchars($item["alternate"][0]["href"], ENT_COMPAT, "UTF-8"));
			$content = "<div class=\"google-reader-alternate-href\"><p>Originally posted at <a href=\"{$item["alternate"][0]["href"]}\">{$item["origin"]["title"]}</a></p></div>" . $content;
		}

		/* I haven't bothered to figure out if the dates are really GMT or
		localized -- GMT was an easier assumption to make. The format is
		an RFC 822 date with a two-digit hour ("H"). */
		$pubDate = $rssItem->addChild("pubDate", gmdate("D, d M Y H:i:s O", $item["published"]));
		$pubDate->addAttribute("json", $item["published"]);

		/* not every item has an author, either -- again, an artifact of
		the original RSS feeds */
		if (isset($item["author"]))
		{
			$rssItem->addChild("creator", htmlspecialchars($item["author"], ENT_COMPAT, "UTF-8"), $namespaces["dc"]);
		}

		/* annotations were tricky -- I have added them as their own XML
		tags _and_ inserted them within a "google-reader-annotation" div
		at the top of the post content (to match the original format on-
		screen). All of the original data is preserved in the XML tag,
		with an ID that matches the embedded div ID. */
		$annotations = (isset($item["annotations"]) && is_array($item["annotations"])) ? $item["annotations"] : array();
		foreach ($annotations as $annotation)
		{
			$annotationId = md5($annotation["content"] . $annotation["author"]);
			$content = "<div id=\"{$annotationId}\" class=\"google-reader-annotation\"><blockquote><p>{$annotation["content"]}</p><p class=\"author\">{$annotation["author"]}</p></blockquote></div>" . $content;
			$rssAnnotation = $rssItem->addChild("annotation", htmlspecialchars($annotation["content"], ENT_COMPAT, "UTF-8"));
			$rssAnnotation->addAttribute("id", $annotationId);
			$rssAnnotation->addAttribute("author", $annotation["author"]);
			$rssAnnotation->addAttribute("userId", $annotation["userId"]);
			$rssAnnotation->addAttribute("profileId", $annotation["profileId"]);
			$rssAnnotation->addAttribute("profileCardParams", $annotation["profileCardParams"]);
		}

		/* again, sometimes content is in content, sometimes it's in the
		summary element */
		$rssContent = $rssItem->addChild("encoded", htmlspecialchars($content, ENT_COMPAT, "UTF-8"), $namespaces["content"]);
		if (isset($item["content"]))
		{
			$rssContent->addAttribute("direction", $item["content"]["direction"]);
		}
		if (isset($item["summary"]))
		{
			$excerpt = $rssItem->addChild("encoded", htmlspecialchars($item["summary"]["content"], ENT_COMPAT, "UTF-8"), $namespaces["excerpt"]);
			$excerpt->addAttribute("direction", $item["summary"]["direction"]);
		}

		/* more Google reader metadata, this time about the original feed
		that the item came from -- which is used above to format the
		linkback that is embedded at the start of the content */
		$origin = $rssItem->addChild("origin");
		$origin->addAttribute("streamId", $item["origin"]["streamId"]);
		$origin->addAttribute("title", $item["origin"]["title"]);
		$origin->addAttribute("htmlUrl", $item["origin"]["htmlUrl"]);

		/* it's not clear to me whether the published or modified date is
		when the original post was published or when the item was
		shared -- I think that published refers to when it was shared.
		Two-digit hour ("H") so the value is a regular datetime string. */
		$postDate = $rssItem->addChild("post_date", date("Y-m-d H:i:s", $item["published"]), $namespaces["wp"]);
		$postDate->addAttribute("json", $item["published"]);
		$rssItem->addChild("comment_status", "open", $namespaces["wp"]); // TODO make configurable
		$rssItem->addChild("ping_status", "open", $namespaces["wp"]); // TODO make configurable

		/* make a Wordpress friendly title slug for the post */
		if (isset($item["title"]))
		{
			$slug = sluggify($item["title"]);
		}
		else
		{
			/* if no title, generate the slug from the timestamp */
			$slug = date("Y-m-d-G-i-s", $item["published"]);
		}

		/* make sure that our slug is unique -- add a counter to the end
		if it is not, and track those counter values in $post_names[].
		The slug that is finally chosen is registered as well, so that a
		suffixed slug ("foo-1") can never clash with a later item whose
		own title produces that same text. */
		$baseSlug = $slug;
		while (isset($post_names[$slug]))
		{
			$post_names[$baseSlug]++;
			$slug = $baseSlug . "-" . $post_names[$baseSlug];
		}
		$post_names[$slug] = 0;
		$rssItem->addChild("post_name", $slug, $namespaces["wp"]);

		/* more Wordpress metadata -- all of which could be tweaked */
		$rssItem->addChild("status", "publish", $namespaces["wp"]); // TODO make configurable
		$rssItem->addChild("post_parent", 0, $namespaces["wp"]); // TODO make configurable
		$rssItem->addChild("menu_order", 0, $namespaces["wp"]); // TODO make configurable
		$rssItem->addChild("post_type", "post", $namespaces["wp"]); // TODO make configurable
		$rssItem->addChild("post_password", "", $namespaces["wp"]); // TODO make configurable
		$rssItem->addChild("is_sticky", 0, $namespaces["wp"]); // TODO make configurable

		/* convert categories to post tags -- nota bene that Google Reader
		has conflated the reader's categories with the original post's
		tags, creating a... mish-mash. Reader's own com.google state
		categories are skipped. Both patterns use plain "|" delimiters:
		a pattern that starts with a backslash is rejected by the preg
		functions. */
		$categories = (isset($item["categories"]) && is_array($item["categories"])) ? $item["categories"] : array();
		foreach ($categories as $category)
		{
			if (!preg_match("|.*/com\.google/.*|", $category))
			{
				/* strip the "user/<digits>/label/" prefix from the reader's own labels */
				$cleanCategory = preg_replace("|user/\d+/label/(.*)|", "$1", $category);
				$rssCategory = $rssItem->addChild("category", htmlspecialchars($cleanCategory, ENT_COMPAT, "UTF-8"));
				$rssCategory->addAttribute("domain", "post_tag");
				$rssCategory->addAttribute("nicename", sluggify($cleanCategory));
				$rssCategory->addAttribute("json", $category);
			}
		}

		/* add comments -- note that for privacy reasons, while the
		commenter's metadata is added as an XML tag, it is not embedded
		in the Wordpress-readable wp:comment tags */
		$comments = (isset($item["comments"]) && is_array($item["comments"])) ? $item["comments"] : array();
		foreach ($comments as $comment)
		{
			$rssComment = $rssItem->addChild("comment", "", $namespaces["wp"]);
			$rssComment->addAttribute("id", $comment["id"]);
			$commentContent = $rssComment->addChild("comment_content", htmlspecialchars($comment["htmlContent"], ENT_COMPAT, "UTF-8"), $namespaces["wp"]);
			$commentContent->addAttribute("plainContent", $comment["plainContent"]);
			$author = $rssComment->addChild("comment_author", htmlspecialchars($comment["author"], ENT_COMPAT, "UTF-8"), $namespaces["wp"]);
			$author->addAttribute("userId", $comment["userId"]);
			$author->addAttribute("profileId", $comment["profileId"]);
			$author->addAttribute("profileCardParams", $comment["profileCardParams"]);
			/* attribute name spelled like the JSON key it mirrors */
			$author->addAttribute("venueStreamId", $comment["venueStreamId"]);
			$commentDate = $rssComment->addChild("comment_date", $comment["createdTime"], $namespaces["wp"]);
			$commentDate->addAttribute("modifiedTime", $comment["modifiedTime"]);
			$rssComment->addAttribute("isSpam", $comment["isSpam"]);
		}
	}

	/* dump the converted XML out as a file -- the document is serialized
	once and the same string is used for both the response and
	shared-items.xml. asXML() returns false on failure, in which case
	nothing is sent and the existing file is left untouched. */
	$xmlOutput = $rss->asXML();
	if ($xmlOutput === false)
	{
		throw new RuntimeException("Unable to serialize the converted feed as XML");
	}
	header("Content-type: text/xml");
	echo $xmlOutput;
	file_put_contents("shared-items.xml", $xmlOutput);