Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Transmogrifying those Google Reader JSON dumps into something useful
/**********************************************************************
* Google Reader to Wordpress
*
* 2011-11-27
* Seth Battis (seth@battis.net)
*
* This script takes the output of Google Reader's JSON export of
* shared items and converts it into an XML file that can be imported
* into a Wordpress blog as posts. All of the data in the original JSON
* file is preserved in the XML file, either by transferring it to
* an appropriate format (e.g. Google Reader categories are converted
* to WordPress post tags) or simply as an additional XML tag (e.g. the
* Google Reader commentInfo metadata for recent shared items). In
* situations where actual data has to be converted to make it readable
* for WordPress, the original data is included as the JSON attribute
* of that tag (e.g. timestamps and categories).
*
* As currently written, the script looks in its local directory for
* a Google Reader "shared-items.json" file and generates a matching
* "shared-items.xml" file, also in its local directory.
*
* There are a number of potentially configurable (i.e. arbitrary)
* values marked as TODO.
*
* Caveat emptor: this has been tested against my ~1000 item Google
* Reader shared items feed and on my WordPress 3.2.1 site. I would
* presume that it should work fairly well for others, but make no
* guarantees!
*********************************************************************/
/* Returns a WordPress-style slug for the given text: lower-cased, with
every run of characters other than a-z and 0-9 collapsed into a single
dash, and with no dash left dangling at either end (so "Hello, World!"
becomes "hello-world" rather than "hello-world-"). Text that contains no
alphanumeric characters at all yields an empty string. */
function sluggify($text)
{
    $slug = preg_replace("|[^a-z0-9]+|", "-", strtolower($text));
    /* punctuation or whitespace at the edges would otherwise leave a
    leading or trailing dash behind */
    return trim($slug, "-");
}
/* Start from a literal, empty <rss> root that already declares every
namespace prefix used further down -- SimpleXML doesn't really support
namespaces unless they are forced on it like this. */
$xml = <<<'XML'
<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.1/excerpt/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.1/"
></rss>
XML;
$rss = new SimpleXMLElement($xml);
/* prefix => URI map of the namespaces declared on the root element,
used later as the namespace argument when adding children */
$namespaces = $rss->getDocNamespaces(true);
/* Load the Google Reader JSON file. Stop immediately if it cannot be
read or decoded: carrying on with an empty $json would only produce a
stream of warnings and an empty export. */
$file = file_get_contents("shared-items.json");
if ($file === false)
{
    throw new RuntimeException("Unable to read shared-items.json");
}
/* decode into nested associative arrays rather than objects */
$json = json_decode($file, true);
if (!is_array($json))
{
    throw new RuntimeException("shared-items.json does not contain a valid JSON document (json_last_error() = " . json_last_error() . ")");
}
/* Wordpress will choke if our post names aren't unique, so we track
them separately */
$post_names = array();
/* header information describing the file itself: the feed-level Google
Reader metadata is kept as attributes of <channel>, followed by the
child elements of a WordPress export header */
$channel = $rss->addChild("channel");
$channel->addAttribute("direction", $json["direction"]);
$channel->addAttribute("id", $json["id"]);
$channel->addAttribute("self", $json["self"][0]["href"]);
$channel->addAttribute("author", $json["author"]);
/* addChild() reads its value as XML character data that may contain
entity references, so a bare "&" in the title would be rejected; escape
the markup characters first (addAttribute() does not need this) */
$channel->addChild("title", htmlspecialchars((string) $json["title"], ENT_NOQUOTES, "UTF-8"));
$channel->addChild("link");
$channel->addChild("description");
/* RFC 822 dates need a two-digit hour, hence "H" rather than "G"; the
untouched timestamp is preserved in the json attribute */
$pubDate = $channel->addChild("pubDate", gmdate("D, j M Y H:i:s O", $json["updated"]));
$pubDate->addAttribute("json", $json["updated"]);
$channel->addChild("language");
$channel->addChild("wxr_version", "1.1", $namespaces["wp"]);
/* Adds a child element to $parent whose text content is exactly $text and
returns the new element. SimpleXMLElement::addChild() reads its value as
XML character data that may contain entity references: a bare "&" is
rejected, and HTML entities such as "&nbsp;" end up in the document as
entities XML does not define. Escaping the markup characters first makes
the value round-trip unchanged. $namespace is passed straight through to
addChild(). */
function addTextChild($parent, $name, $text, $namespace = null)
{
    return $parent->addChild($name, htmlspecialchars((string) $text, ENT_NOQUOTES, "UTF-8"), $namespace);
}

/* run through the list of items and add them to the XML */
foreach ($json["items"] as $item)
{
    $rssItem = $channel->addChild("item");

    /* a bunch of Google Reader-specific metadata */
    if (isset($item["isReadStateLocked"]))
    {
        $rssItem->addAttribute("isReadStateLocked", $item["isReadStateLocked"]);
    }
    $rssItem->addAttribute("crawlTimeMsec", $item["crawlTimeMsec"]);
    $rssItem->addAttribute("timestampUsec", $item["timestampUsec"]);
    $rssItem->addAttribute("id", $item["id"]);
    if (isset($item["commentInfo"]))
    {
        /* plain foreach: each() no longer exists as of PHP 8 */
        foreach ($item["commentInfo"] as $commentInfoKey => $commentInfoValue)
        {
            $commentInfo = addTextChild($rssItem, "commentInfo", $commentInfoKey);
            $commentInfo->addAttribute("permalinkUrl", $commentInfoValue["permalinkUrl"]);
            $commentInfo->addAttribute("commentState", $commentInfoValue["commentState"]);
        }
    }

    /* annoyingly, not every item has its content in the content
    element -- sometimes it's in the summary element (and once in a
    while, it's just not there). I think this is an artifact of how
    the original RSS feeds were constructed. I think.
    $content is assembled as plain HTML and escaped for XML exactly
    once, when it is finally added to the item below. */
    if (isset($item["content"]))
    {
        $content = $item["content"]["content"];
    }
    else if (isset($item["summary"]["content"]))
    {
        $content = $item["summary"]["content"];
    }
    else
    {
        $content = "";
    }

    /* sometimes items don't even have titles */
    if (isset($item["title"]))
    {
        addTextChild($rssItem, "title", $item["title"]);
    }

    /* most items store the original linkback information in the
    alternate element -- Wordpress won't honor this link tag when
    it's imported (i.e. it won't treat it like the Daring Fireball
    feed), so I have embedded a more descriptive linkback after the
    annotations at the start of the content, using some information
    from the origin element. The linkback is in the
    "google-reader-alternate-href" div (for easy CSS-wrangling!) */
    if (isset($item["alternate"]))
    {
        $alternateHref = $item["alternate"][0]["href"];
        addTextChild($rssItem, "link", $alternateHref);
        $content = "<div class=\"google-reader-alternate-href\"><p>Originally posted at <a href=\"{$alternateHref}\">{$item["origin"]["title"]}</a></p></div>" . $content;
    }

    /* I haven't bothered to figure out if the dates are really GMT or
    localized -- GMT was an easier assumption to make. RFC 822 dates
    need a two-digit hour, hence "H" rather than "G". */
    $pubDate = $rssItem->addChild("pubDate", gmdate("D, j M Y H:i:s O", $item["published"]));
    $pubDate->addAttribute("json", $item["published"]);

    /* not every item has an author, either -- again, an artifact of
    the original RSS feeds */
    if (isset($item["author"]))
    {
        addTextChild($rssItem, "creator", $item["author"], $namespaces["dc"]);
    }

    /* annotations were tricky -- I have added them as their own XML
    tags _and_ inserted them within a "google-reader-annotation" div
    at the top of the post content (to match the original format on-
    screen). All of the original data is preserved in the XML tag,
    with an ID that matches the embedded div ID. Items without an
    annotations list are simply skipped here. */
    $annotations = isset($item["annotations"]) ? $item["annotations"] : array();
    foreach ($annotations as $annotation)
    {
        /* the same hash identifies both the embedded div and the XML tag */
        $annotationId = md5($annotation["content"] . $annotation["author"]);
        $annotationHTML = "<div id=\"" . $annotationId . "\" class=\"google-reader-annotation\"><blockquote><p>{$annotation["content"]}</p><p class=\"author\">{$annotation["author"]}</p></blockquote></div>";
        $content = $annotationHTML . $content;
        $rssAnnotation = addTextChild($rssItem, "annotation", $annotation["content"]);
        $rssAnnotation->addAttribute("id", $annotationId);
        $rssAnnotation->addAttribute("author", $annotation["author"]);
        $rssAnnotation->addAttribute("userId", $annotation["userId"]);
        $rssAnnotation->addAttribute("profileId", $annotation["profileId"]);
        $rssAnnotation->addAttribute("profileCardParams", $annotation["profileCardParams"]);
    }

    /* again, sometimes content is in content, sometimes it's in the
    summary element */
    $rssContent = addTextChild($rssItem, "encoded", $content, $namespaces["content"]);
    if (isset($item["content"]))
    {
        $rssContent->addAttribute("direction", $item["content"]["direction"]);
    }
    if (isset($item["summary"]))
    {
        $excerpt = addTextChild($rssItem, "encoded", $item["summary"]["content"], $namespaces["excerpt"]);
        $excerpt->addAttribute("direction", $item["summary"]["direction"]);
    }

    /* more Google reader metadata, this time about the original feed
    that the item came from -- which is used above to format the
    linkback that is embedded at the start of the content */
    $origin = $rssItem->addChild("origin");
    $origin->addAttribute("streamId", $item["origin"]["streamId"]);
    $origin->addAttribute("title", $item["origin"]["title"]);
    $origin->addAttribute("htmlUrl", $item["origin"]["htmlUrl"]);

    /* it's not clear to me whether the published or modified date is
    when the original post was published or when the item was
    shared -- I think published refers to when it was shared. */
    $postDate = $rssItem->addChild("post_date", date("Y-m-d H:i:s", $item["published"]), $namespaces["wp"]);
    $postDate->addAttribute("json", $item["published"]);
    $rssItem->addChild("comment_status", "open", $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("ping_status", "open", $namespaces["wp"]); // TODO make configurable

    /* make a Wordpress friendly title slug for the post; if there is
    no title, or the title contains nothing a slug can keep, generate
    the slug from the timestamp instead */
    $slug = isset($item["title"]) ? sluggify($item["title"]) : "";
    if (trim($slug, "-") === "")
    {
        $slug = date("Y-m-d-G-i-s", $item["published"]);
    }

    /* make sure that our slug is unique -- add a counter to the end
    if it is not, and track those counter values in $post_names[].
    The suffixed slug is registered as well, so that a later item whose
    own title happens to produce e.g. "foo-1" cannot collide with it. */
    $baseSlug = $slug;
    while (isset($post_names[$slug]))
    {
        $post_names[$baseSlug]++;
        $slug = $baseSlug . "-" . $post_names[$baseSlug];
    }
    $post_names[$slug] = 0;
    $rssItem->addChild("post_name", $slug, $namespaces["wp"]);

    /* more Wordpress metadata -- all of which could be tweaked */
    $rssItem->addChild("status", "publish", $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("post_parent", "0", $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("menu_order", "0", $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("post_type", "post", $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("post_password", "", $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("is_sticky", "0", $namespaces["wp"]); // TODO make configurable

    /* convert categories to post tags -- nota bene that Google Reader
    has conflated the reader's categories with the original post's
    tags, creating a... mish-mash. Anything under a "/com.google/"
    path is left out; "user/<digits>/label/" prefixes are stripped. */
    $categories = isset($item["categories"]) ? $item["categories"] : array();
    foreach ($categories as $category)
    {
        if (!preg_match('|.*/com\.google/.*|', $category))
        {
            $cleanCategory = preg_replace('|user/\d+/label/(.*)|', '$1', $category);
            $rssCategory = addTextChild($rssItem, "category", $cleanCategory);
            $rssCategory->addAttribute("domain", "post_tag");
            $rssCategory->addAttribute("nicename", sluggify($cleanCategory));
            $rssCategory->addAttribute("json", $category);
        }
    }

    /* add comments -- note that for privacy reasons, while the
    commenter's metadata is added as an XML tag, it is not embedded
    in the Wordpress-readable wp:comment tags */
    $comments = isset($item["comments"]) ? $item["comments"] : array();
    foreach ($comments as $comment)
    {
        $rssComment = $rssItem->addChild("comment", "", $namespaces["wp"]);
        $rssComment->addAttribute("id", $comment["id"]);
        $commentContent = addTextChild($rssComment, "comment_content", $comment["htmlContent"], $namespaces["wp"]);
        $commentContent->addAttribute("plainContent", $comment["plainContent"]);
        $author = addTextChild($rssComment, "comment_author", $comment["author"], $namespaces["wp"]);
        $author->addAttribute("userId", $comment["userId"]);
        $author->addAttribute("profileId", $comment["profileId"]);
        $author->addAttribute("profileCardParams", $comment["profileCardParams"]);
        /* NOTE(review): the attribute name is spelled "venueStreamid"
        (lower-case "id"), unlike the JSON key; kept as it has always
        been emitted */
        $author->addAttribute("venueStreamid", $comment["venueStreamId"]);
        $commentDate = addTextChild($rssComment, "comment_date", $comment["createdTime"], $namespaces["wp"]);
        $commentDate->addAttribute("modifiedTime", $comment["modifiedTime"]);
        $rssComment->addAttribute("isSpam", $comment["isSpam"]);
    }
}
/* dump the converted XML out, both as the response body and as a
"shared-items.xml" file; the document is serialized only once */
$xmlOutput = $rss->asXML();
header ("Content-type: text/xml");
echo $xmlOutput;
if (file_put_contents("shared-items.xml", $xmlOutput) === false)
{
    /* the XML has already been echoed, so report the failed write to
    the error log rather than into the output */
    error_log("Unable to write shared-items.xml");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment