Created
July 30, 2017 18:16
Star
You must be signed in to star a gist
Transmogrifying those Google Reader JSON dumps into something useful
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**********************************************************************
 * Google Reader to Wordpress
 *
 * 2011-11-27
 * Seth Battis (seth@battis.net)
 *
 * This script takes the output of Google Reader's JSON export of
 * shared items and converts it into an XML file that can be imported
 * into a Wordpress blog as posts. All of the data in the original JSON
 * file is preserved in the XML file, either by transferring it to
 * an appropriate format (e.g. Google Reader categories are converted
 * to WordPress post tags) or simply as an additional XML tag (e.g. the
 * Google Reader commentInfo metadata for recent shared items). In
 * situations where actual data has to be converted to make it readable
 * for WordPress, the original data is included as the JSON attribute
 * of that tag (e.g. timestamps and categories).
 *
 * As currently written, the script looks in its local directory for
 * a Google Reader "shared-items.json" file and generates a matching
 * "shared-items.xml" file, also in its local directory.
 *
 * There are a number of potentially configurable (i.e. arbitrary)
 * values marked as TODO.
 *
 * Caveat emptor: this has been tested against my ~1000 item Google
 * Reader shared items feed and on my WordPress 3.2.1 site. I would
 * presume that it should work fairly well for others, but make no
 * guarantees!
 *********************************************************************/
/* returns a Wordpress slug-version of the given text: the text is
   lowercased, every run of characters other than a-z and 0-9 is
   collapsed into a single dash, and any dash left at the start or end
   is removed (so "Hello, World!" becomes "hello-world" rather than
   "hello-world-"). Text with no ASCII alphanumerics yields "". */
function sluggify($text)
{
    $slug = preg_replace("|[^a-z0-9]+|", "-", strtolower($text));

    /* punctuation at either end of the text would otherwise leave a
       dangling dash on the slug */
    return trim($slug, "-");
}
/* SimpleXML doesn't really support namespaces unless you force it, so
   the root element is parsed from a literal that declares every
   namespace prefix used later in the export */
$xml = '<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0"
    xmlns:excerpt="http://wordpress.org/export/1.1/excerpt/"
    xmlns:content="http://purl.org/rss/1.0/modules/content/"
    xmlns:wfw="http://wellformedweb.org/CommentAPI/"
    xmlns:dc="http://purl.org/dc/elements/1.1/"
    xmlns:wp="http://wordpress.org/export/1.1/"
></rss>';
$rss = new SimpleXMLElement($xml);

/* prefix => namespace URI map, used as the third argument of addChild()
   whenever a namespaced element (wp:, dc:, content:, excerpt:) is added */
$namespaces = $rss->getDocNamespaces(true);
/* load the Google Reader JSON file from the script's working directory
   and decode it into nested associative arrays */
$file = file_get_contents("shared-items.json");
if ($file === false)
{
    /* without this check the script would carry on and write an empty
       export */
    throw new RuntimeException("Unable to read shared-items.json");
}
$json = json_decode($file, true);
if (!is_array($json) || !isset($json["items"]))
{
    /* json_decode() returns null on malformed input; the item loop
       further down needs the "items" list */
    throw new RuntimeException("shared-items.json is not valid JSON or has no \"items\" list");
}

/* Wordpress will choke if our post names aren't unique, so we track
   them separately (slug => number of duplicates seen so far) */
$post_names = array();
/* header information describing the file itself: the feed-level Google
   Reader metadata is kept as attributes on <channel>, followed by the
   child elements of a WordPress export header */
$channel = $rss->addChild("channel");
$channel->addAttribute("direction", $json["direction"]);
$channel->addAttribute("id", $json["id"]);
$channel->addAttribute("self", $json["self"][0]["href"]);
$channel->addAttribute("author", $json["author"]);
$channel->addChild("title", $json["title"]);
$channel->addChild("link");
$channel->addChild("description");

/* "H" (zero-padded hour) rather than "G": RFC 822 dates, which RSS
   uses for pubDate, require a two-digit hour. The raw timestamp is
   preserved in the json attribute. */
$pubDate = $channel->addChild("pubDate", gmdate("D, j M Y H:i:s O", $json["updated"]));
$pubDate->addAttribute("json", $json["updated"]);
$channel->addChild("language");
$channel->addChild("wxr_version", "1.1", $namespaces["wp"]);
/* run through the list of items and add each one to the channel as an
   <item> element. Reads $json, $channel and $namespaces, calls
   sluggify(), and updates $post_names as slugs are handed out.

   NOTE(review): values are passed to addChild() as-is in several places
   (title, content, author, comment text); a bare "&" in such a value is
   believed to be treated as the start of an entity reference rather
   than escaped -- confirm against items whose text contains "&". */
foreach ($json["items"] as $item)
{
    $rssItem = $channel->addChild("item");

    /* a bunch of Google Reader-specific metadata */
    if (isset($item["isReadStateLocked"]))
    {
        $rssItem->addAttribute("isReadStateLocked", $item["isReadStateLocked"]);
    }
    $rssItem->addAttribute("crawlTimeMsec", $item["crawlTimeMsec"]);
    $rssItem->addAttribute("timestampUsec", $item["timestampUsec"]);
    $rssItem->addAttribute("id", $item["id"]);
    if (isset($item["commentInfo"]))
    {
        /* plain foreach instead of while/list/each(): each() was
           removed in PHP 8 */
        foreach ($item["commentInfo"] as $commentInfoKey => $commentInfoValue)
        {
            $commentInfo = $rssItem->addChild("commentInfo", $commentInfoKey);
            $commentInfo->addAttribute("permalinkUrl", $commentInfoValue["permalinkUrl"]);
            $commentInfo->addAttribute("commentState", $commentInfoValue["commentState"]);
        }
    }

    /* annoyingly, not every item has its content in the content
       element -- sometimes it's in the summary element (and once in a
       while, it's just not there). I think this is an artifact of how
       the original RSS feeds were constructed. I think. */
    if (isset($item["content"]))
    {
        $content = $item["content"]["content"];
    }
    else if (isset($item["summary"]["content"]))
    {
        $content = $item["summary"]["content"];
    }
    else
    {
        $content = "";
    }

    /* sometimes items don't even have titles */
    if (isset($item["title"]))
    {
        $rssItem->addChild("title", $item["title"]);
    }

    /* most items store the original linkback information in the
       alternate element -- Wordpress won't honor this link tag when
       it's imported (i.e. it won't treat it like the Daring Fireball
       feed), so I have embedded a more descriptive linkback after the
       annotations at the start of the content, using some information
       from the origin element. The linkback is in the
       "google-reader-alternate-href" div (for easy CSS-wrangling!) */
    if (isset($item["alternate"]))
    {
        /* htmlspecialchars() rather than htmlentities(): the document
           is UTF-8, and htmlentities() would turn non-ASCII characters
           into HTML named entities (e.g. &eacute;) that XML does not
           define */
        $rssItem->addChild("link", htmlspecialchars($item["alternate"][0]["href"], ENT_COMPAT, "UTF-8"));
        $content = htmlspecialchars("<div class=\"google-reader-alternate-href\"><p>Originally posted at <a href=\"{$item["alternate"][0]["href"]}\">{$item["origin"]["title"]}</a></p></div>", ENT_COMPAT, "UTF-8") . $content;
    }

    /* I haven't bothered to figure out if the dates are really GMT or
       localized -- GMT was an easier assumption to make. "H" gives the
       two-digit hour that RFC 822 dates require. */
    $pubDate = $rssItem->addChild("pubDate", gmdate("D, j M Y H:i:s O", $item["published"]));
    $pubDate->addAttribute("json", $item["published"]);

    /* not every item has an author, either -- again, an artifact of
       the original RSS feeds */
    if (isset($item["author"]))
    {
        $rssItem->addChild("creator", $item["author"], $namespaces["dc"]);
    }

    /* annotations were tricky -- I have added them as their own XML
       tags _and_ inserted them within a "google-reader-annotation" div
       at the top of the post content (to match the original format on-
       screen). All of the original data is preserved in the XML tag,
       with an ID that matches the embedded div ID. An item without an
       annotations list is treated as having none. */
    $annotations = isset($item["annotations"]) ? $item["annotations"] : array();
    foreach ($annotations as $annotation)
    {
        /* the same hash identifies both the embedded div and the tag */
        $annotationId = md5($annotation["content"] . $annotation["author"]);
        $annotationHTML = htmlspecialchars("<div id=\"" . $annotationId . "\" class=\"google-reader-annotation\"><blockquote><p>{$annotation["content"]}</p><p class=\"author\">{$annotation["author"]}</p></blockquote></div>", ENT_COMPAT, "UTF-8");
        $content = $annotationHTML . $content;
        $rssAnnotation = $rssItem->addChild("annotation", $annotation["content"]);
        $rssAnnotation->addAttribute("id", $annotationId);
        $rssAnnotation->addAttribute("author", $annotation["author"]);
        $rssAnnotation->addAttribute("userId", $annotation["userId"]);
        $rssAnnotation->addAttribute("profileId", $annotation["profileId"]);
        $rssAnnotation->addAttribute("profileCardParams", $annotation["profileCardParams"]);
    }

    /* again, sometimes content is in content, sometimes it's in the
       summary element */
    $rssContent = $rssItem->addChild("encoded", $content, $namespaces["content"]);
    if (isset($item["content"]))
    {
        $rssContent->addAttribute("direction", $item["content"]["direction"]);
    }
    if (isset($item["summary"]))
    {
        $excerpt = $rssItem->addChild("encoded", $item["summary"]["content"], $namespaces["excerpt"]);
        $excerpt->addAttribute("direction", $item["summary"]["direction"]);
    }

    /* more Google reader metadata, this time about the original feed
       that the item came from -- which is used above to format the
       linkback that is embedded at the start of the content */
    $origin = $rssItem->addChild("origin");
    $origin->addAttribute("streamId", $item["origin"]["streamId"]);
    $origin->addAttribute("title", $item["origin"]["title"]);
    $origin->addAttribute("htmlUrl", $item["origin"]["htmlUrl"]);

    /* it's not clear to me whether the published or modified date is
       when the original post was published or when the item was
       shared -- I think published refers to when it was shared.
       Zero-padded hour ("H") so the value has the fixed-width
       "Y-m-d H:i:s" shape. */
    $postDate = $rssItem->addChild("post_date", date("Y-m-d H:i:s", $item["published"]), $namespaces["wp"]);
    $postDate->addAttribute("json", $item["published"]);
    $rssItem->addChild("comment_status", "open", $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("ping_status", "open", $namespaces["wp"]); // TODO make configurable

    /* make a Wordpress friendly title slug for the post */
    if (isset($item["title"]))
    {
        $slug = sluggify($item["title"]);
    }
    else
    {
        /* if no title, generate the slug from the timestamp */
        $slug = date("Y-m-d-G-i-s", $item["published"]);
    }

    /* make sure that our slug is unique -- add a counter to the end
       if it is not, and track those counter values in $post_names[].
       The suffixed slug is registered as well, so a later post whose
       own title happens to slug to e.g. "foo-1" cannot collide with a
       generated "foo-1". */
    $baseSlug = $slug;
    while (isset($post_names[$slug]))
    {
        $post_names[$baseSlug]++;
        $slug = $baseSlug . "-" . $post_names[$baseSlug];
    }
    $post_names[$slug] = 0;
    $rssItem->addChild("post_name", $slug, $namespaces["wp"]);

    /* more Wordpress metadata -- all of which could be tweaked */
    $rssItem->addChild("status", "publish", $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("post_parent", 0, $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("menu_order", 0, $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("post_type", "post", $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("post_password", "", $namespaces["wp"]); // TODO make configurable
    $rssItem->addChild("is_sticky", 0, $namespaces["wp"]); // TODO make configurable

    /* convert categories to post tags -- nota bene that Google Reader
       has conflated the reader's categories with the original post's
       tags, creating a... mish-mash. Categories containing
       "/com.google/" are skipped; the "user/<digits>/label/" prefix is
       stripped from the rest, and the untouched value is kept in the
       json attribute. */
    $categories = isset($item["categories"]) ? $item["categories"] : array();
    foreach ($categories as $category)
    {
        if (!preg_match("|.*/com\.google/.*|", $category))
        {
            $cleanCategory = preg_replace("|user/\d+/label/(.*)|", "$1", $category);
            $rssCategory = $rssItem->addChild("category", htmlspecialchars($cleanCategory, ENT_COMPAT, "UTF-8"));
            $rssCategory->addAttribute("domain", "post_tag");
            $rssCategory->addAttribute("nicename", sluggify($cleanCategory));
            $rssCategory->addAttribute("json", $category);
        }
    }

    /* add comments -- the commenter's Google Reader metadata is stored
       as attributes on the wp:comment_author element rather than as
       Wordpress-readable wp:comment child elements (the original author
       did this for privacy reasons) */
    $comments = isset($item["comments"]) ? $item["comments"] : array();
    foreach ($comments as $comment)
    {
        $rssComment = $rssItem->addChild("comment", "", $namespaces["wp"]);
        $rssComment->addAttribute("id", $comment["id"]);
        $commentContent = $rssComment->addChild("comment_content", $comment["htmlContent"], $namespaces["wp"]);
        $commentContent->addAttribute("plainContent", $comment["plainContent"]);
        $author = $rssComment->addChild("comment_author", $comment["author"], $namespaces["wp"]);
        $author->addAttribute("userId", $comment["userId"]);
        $author->addAttribute("profileId", $comment["profileId"]);
        $author->addAttribute("profileCardParams", $comment["profileCardParams"]);
        /* NOTE(review): attribute is spelled "venueStreamid" while the
           JSON key is "venueStreamId" -- left unchanged, confirm
           whether anything depends on the lowercase spelling */
        $author->addAttribute("venueStreamid", $comment["venueStreamId"]);
        $commentDate = $rssComment->addChild("comment_date", $comment["createdTime"], $namespaces["wp"]);
        $commentDate->addAttribute("modifiedTime", $comment["modifiedTime"]);
        $rssComment->addAttribute("isSpam", $comment["isSpam"]);
    }
}
/* dump the converted XML out: once to the client as text/xml, and once
   to "shared-items.xml" in the working directory. The document is
   serialized a single time and the same string is used for both. */
$xmlOutput = $rss->asXML();
header ("Content-type: text/xml");
echo $xmlOutput;
file_put_contents("shared-items.xml", $xmlOutput);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment