Created
June 18, 2012 04:15
-
-
Save tshedor/2946802 to your computer and use it in GitHub Desktop.
A very rough way to convert Ellington data into a Wordpress XML file. Tested with WP 3.4. Use at your own risk.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* THIS IS ALPHA. PLEASE EXAMINE SCRIPT CAREFULLY TO REPLACE NECESSARY VARS WITH YOUR ORGANIZATION */ | |
/* | |
The MIT License | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in | |
all copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
THE SOFTWARE. | |
*/ | |
/* | |
Author: Tim Shedor | |
Email: tshedor@kansan.com | |
Github: http://github.com/tshedor/ | |
So Ellington is going out of business and we're looking to still maintain our website somehow, so we're going to Wordpress. If your organization is in the same boat, please fork or contact me. | |
This script does not support photo galleries, inlines, iframe ads or comments yet. Just raw articles. I plan to continue refining it soon. | |
Based off the JSON dump from Ellington between 2005 and 2012. | |
Before executing, you must | |
1) add | |
{"files":[ | |
to the beginning of the JSON file. | |
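2) Find and replace every closing } bracket with }, (the comma separates the story objects), then delete the comma after the very last } because json_decode() rejects a trailing comma before the closing ]. | |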
3) Add | |
]} | |
to the end of the JSON file. | |
4) Run no more than 800 items at a time, although this depends on your PHP memory limits | |
*/ | |
?> | |
<!-- BEGIN SCRIPT. EXECUTE BY LOADING IN BROWSER OR VIA TERMINAL COMMAND (Terminal ex. php ellington_convert.php) --> | |
<meta charset="utf-8" /> | |
<?php
// Opening portion of the WXR (WordPress eXtended RSS) document. The <wp:author>
// and <wp:category> lists are appended to this string after all stories have
// been processed, and the <item> elements follow after that.
$wordpress_start = "<?xml version='1.0' encoding='UTF-8' ?>
<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
<!-- You may use this file to transfer that content from one site to another. -->
<!-- This file is not intended to serve as a complete backup of your site. -->
<!-- To import this information into a WordPress site follow these steps: -->
<!-- 1. Log in to that site as an administrator. -->
<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
<!-- 3. Install the 'WordPress' importer from the list. -->
<!-- 4. Activate & Run Importer. -->
<!-- 5. Upload this file using the form provided on that page. -->
<!-- 6. You will first be asked to map the authors in this export file to users -->
<!-- on the site. For each author, you may choose to map to an -->
<!-- existing user on the site or to create a new user. -->
<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
<!-- contained in this file into your site. -->
<!-- generator='WordPress/3.4' created='2012-06-17 03:50' -->
<rss version='2.0'
xmlns:excerpt='http://wordpress.org/export/1.2/excerpt/'
xmlns:content='http://purl.org/rss/1.0/modules/content/'
xmlns:wfw='http://wellformedweb.org/CommentAPI/'
xmlns:dc='http://purl.org/dc/elements/1.1/'
xmlns:wp='http://wordpress.org/export/1.2/'
>
<channel>
<title>The University Daily Kansan</title>
<link>http://kansan.com</link>
<description>KU's Student Newspaper</description>
<pubDate>Sun, 17 Jun 2012 03:50:29 +0000</pubDate>
<language>en-US</language>
<wp:wxr_version>1.2</wp:wxr_version>
<wp:base_site_url>http://kansan.com</wp:base_site_url>
<wp:base_blog_url>http://kansan.com</wp:base_blog_url>";

// Accumulates one <item> element per story.
$wordpress_file = "";

// Load the prepared Ellington dump. Stop immediately if it cannot be read or
// parsed instead of iterating over null and writing an empty export.
$raw = file_get_contents('json/story.json');
if ($raw === false) {
    die("can't read json/story.json");
}
$json = json_decode($raw, false);
if ($json === null || !isset($json->files)) {
    die("json/story.json is not valid JSON or has no \"files\" list (json_last_error code " . json_last_error() . ")");
}

// Site-wide lists of every category and author name seen; both start with the
// placeholder entries the original script seeded them with.
$master_cats = array("News","0");
$master_authors = array("0");
// Convert every Ellington story record into a WXR <item> appended to
// $wordpress_file, collecting author and category names in $master_authors /
// $master_cats for the channel header that is built afterwards.
// The guard keeps a failed json_decode() (null $json) from raising a foreach warning.
$stories = (isset($json->files) && is_array($json->files)) ? $json->files : array();
foreach ($stories as $f) {
    $story_id = isset($f->id) ? $f->id : '';

    // Headline: ampersands are spelled out (as before); the rest is XML-escaped on output.
    $headline = isset($f->headline) ? (string) $f->headline : '';
    $headline = str_replace('&', 'and', $headline);

    // Author: prefer the free-form byline, otherwise "First Last".
    if (isset($f->one_off_byline)) {
        $author = str_replace(' (Contact)', '', (string) $f->one_off_byline);
    } else {
        $first_name = isset($f->bylines_1_first_name) ? (string) $f->bylines_1_first_name : '';
        $last_name = isset($f->bylines_1_last_name) ? (string) $f->bylines_1_last_name : '';
        $author = $first_name . ' ' . $last_name;
    }
    // Strip only a leading "By " prefix. Removing every "by" substring, as the
    // script used to, corrupted names such as "Abby" or "Bobby".
    $author = trim(preg_replace('/^\s*by\s+/i', '', $author));
    if ($author !== '') {
        array_push($master_authors, $author);
    }

    // Excerpt: first non-empty of tease, subhead, pre-story blurb.
    $excerpt = '';
    foreach (array('tease', 'subhead', 'pre_story_blurb') as $excerpt_field) {
        if (isset($f->$excerpt_field) && (string) $f->$excerpt_field !== '') {
            $excerpt = (string) $f->$excerpt_field;
            break;
        }
    }

    // Reset per story so one story's metadata cannot leak into the next.
    $keywords = isset($f->metadata_1_content) ? (string) $f->metadata_1_content : '';
    $meta_desc = isset($f->metadata_2_content) ? (string) $f->metadata_2_content : '';

    // Categories: slot N (categories_N_path) contributes its path segments 1..N,
    // as in the original script. Index 0 is skipped, as before (presumably the
    // paths start with "/" - confirm against the dump). Missing or empty
    // segments are ignored rather than pushed as null/"" categories.
    $categories = array("News", "0");
    for ($slot = 1; $slot <= 6; $slot++) {
        $path_key = 'categories_' . $slot . '_path';
        if (!isset($f->$path_key)) {
            continue;
        }
        $segments = explode('/', (string) $f->$path_key);
        for ($depth = 1; $depth <= $slot; $depth++) {
            if (isset($segments[$depth]) && $segments[$depth] !== '') {
                array_push($categories, $segments[$depth]);
            }
        }
    }
    $categories = array_values(array_unique($categories));
    // array_merge() returns the merged list; the result must be assigned.
    $master_cats = array_merge($master_cats, $categories);

    // Dates. strtotime() reads strings without an explicit offset in PHP's
    // default timezone, so that should be set to the site's timezone.
    // gmdate() then yields the real GMT value, including date rollover and DST,
    // instead of a fixed +6h applied to the time-of-day only.
    $updated_ts = isset($f->updated) ? strtotime($f->updated) : false;
    $published_ts = isset($f->pub_date) ? strtotime($f->pub_date) : false;
    if ($published_ts === false) {
        $published_ts = ($updated_ts !== false) ? $updated_ts : 0;
    }
    if ($updated_ts === false) {
        $updated_ts = $published_ts;
    }
    $wp_date = date('Y-m-d H:i:s', $published_ts);
    $wp_date_gmt = gmdate('Y-m-d H:i:s', $published_ts);

    // Story body: drop Ellington wrapper tags and inline styles, map known
    // inline styles to classes, then whitelist the remaining HTML tags.
    $story = isset($f->story) ? (string) $f->story : '';
    $story_replace = array('<story>', '</story>', 'style="width: 180px; padding-bottom:.3em;"', 'style="width: 160px; background: #fff;"');
    $story = str_replace($story_replace, '', $story);
    $story = str_replace('style="color:#666; font:11px verdana,sans-serif; margin-top:0;"', 'class="caption"', $story);
    $story = str_replace('style="border: 1px solid #000;"', 'class="old_image"', $story);
    $story = str_replace('style="color:#999; font-size:.9em; text-align:center; margin:0 0 2px 0;"', 'class="old_ad_disclaim"', $story);
    $pre_blurb = isset($f->pre_story_blurb) ? (string) $f->pre_story_blurb : '';
    $post_blurb = isset($f->post_story_blurb) ? (string) $f->post_story_blurb : '';
    $story = $pre_blurb . $story . $post_blurb;
    $story = strip_tags($story, '<div><p><a><span><img><blockquote><cite><section><article><b><br><center><code><table><tr><td><th><em><dd><dt><dl><i><form><input><label><option><select><pre><small><sub><sup><tfoot><thead><ul><li><ol><big><caption><h1><h2><h3><h4><h5><h6><fieldset><strong>');
    // Remove stray "//" markers, but not the "//" of a URL scheme ("http://"),
    // which the previous blanket replacement destroyed in every link and image.
    $without_slashes = preg_replace('#(?<!:)//#', '', $story);
    if ($without_slashes !== null) {
        $story = $without_slashes;
    }
    // "]]>" would terminate the CDATA section the story is written into.
    $story = str_replace(']]>', '', $story);

    // Fall back to the start of the story; cut on characters, not bytes, so a
    // multi-byte UTF-8 character is never split.
    if ($excerpt === '') {
        $excerpt = function_exists('mb_substr') ? mb_substr($story, 0, 300, 'UTF-8') : substr($story, 0, 300);
    }

    // Make the remaining values safe for the XML context they are written to.
    $cdata_end_split = ']]]]><![CDATA[>'; // splits "]]>" across two CDATA sections
    $excerpt_cdata = str_replace(']]>', $cdata_end_split, $excerpt);
    $keywords_cdata = str_replace(']]>', $cdata_end_split, $keywords);
    $meta_desc_cdata = str_replace(']]>', $cdata_end_split, $meta_desc);
    $headline_xml = htmlspecialchars($headline, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    $author_xml = htmlspecialchars($author, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    $wordpress_file .= "
<item>
<title>" . $headline_xml . "</title>
<link>http://kansan.com/?p=" . $story_id . "</link>
<pubDate>" . date('r', $updated_ts) . "</pubDate>
<dc:creator>" . $author_xml . "</dc:creator>
<guid isPermalink='false'>http://kansan.com/?p=" . $story_id . "</guid>
<description></description>
<content:encoded><![CDATA[" . $story . "]]></content:encoded>
<excerpt:encoded><![CDATA[" . $excerpt_cdata . "]]></excerpt:encoded>
<wp:post_id>" . $story_id . "</wp:post_id>
<wp:post_date>" . $wp_date . "</wp:post_date>
<wp:post_date_gmt>" . $wp_date_gmt . "</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
";

    // One <category> element per category attached to this story.
    $category_xml = "";
    foreach ($categories as $category_name) {
        $nicename = htmlspecialchars(strtolower($category_name), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $category_xml .= "<category domain='category' nicename='" . $nicename . "'><![CDATA[" . str_replace(']]>', $cdata_end_split, $category_name) . "]]></category>\n";
    }

    $wordpress_file .= $category_xml . "<wp:postmeta>
<wp:meta_key>_edit_last</wp:meta_key>
<wp:meta_value><![CDATA[1]]></wp:meta_value>
</wp:postmeta>
<wp:postmeta>
<wp:meta_key>udk_meta_keywords</wp:meta_key>
<wp:meta_value><![CDATA[" . $keywords_cdata . "]]></wp:meta_value>
</wp:postmeta>
<wp:postmeta>
<wp:meta_key>udk_meta_desc</wp:meta_key>
<wp:meta_value><![CDATA[" . $meta_desc_cdata . "]]></wp:meta_value>
</wp:postmeta>
</item>";
}
// Close the document that holds the <item> elements.
$wordpress_file .= "
</channel>
</rss>";

// Finish the channel header: one <wp:author> per distinct author and one
// <wp:category> per distinct category collected while processing the stories.
$cdata_close_split = ']]]]><![CDATA[>'; // splits "]]>" so it cannot end a CDATA section early

$auth_count = 0;
foreach (array_unique($master_authors) as $auths) {
    $auths = (string) $auths;
    if ($auths === '') {
        // An author entry with an empty login is useless to the importer.
        continue;
    }
    $auth_count++;
    // The login is plain element text and must be XML-escaped; the display name sits in CDATA.
    $login_xml = htmlspecialchars($auths, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    $display_cdata = str_replace(']]>', $cdata_close_split, $auths);
    $wordpress_start .= "<wp:author><wp:author_id>" . $auth_count . "</wp:author_id><wp:author_login>" . $login_xml . "</wp:author_login><wp:author_email></wp:author_email><wp:author_display_name><![CDATA[" . $display_cdata . "]]></wp:author_display_name><wp:author_first_name><![CDATA[]]></wp:author_first_name><wp:author_last_name><![CDATA[]]></wp:author_last_name></wp:author>
";
}

$wordpress_start .= "
";

$cat_count = 0;
foreach (array_unique($master_cats) as $fresh_cats) {
    $fresh_cats = (string) $fresh_cats;
    if ($fresh_cats === '') {
        continue;
    }
    $cat_count++;
    $nicename_xml = htmlspecialchars(strtolower($fresh_cats), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    $cat_cdata = str_replace(']]>', $cdata_close_split, $fresh_cats);
    $wordpress_start .= "<wp:category><wp:term_id>" . $cat_count . "</wp:term_id><wp:category_nicename>" . $nicename_xml . "</wp:category_nicename><wp:category_parent></wp:category_parent><wp:cat_name><![CDATA[" . $cat_cdata . "]]></wp:cat_name></wp:category>
";
}

$wordpress_start .= "<generator>http://wordpress.org/?v=3.4</generator>
";

// Header (channel info, authors, categories) followed by the items and closing tags.
$wordpress_final = $wordpress_start . $wordpress_file;

// Write the export and report a failed or short write instead of ignoring it.
$fh = fopen('wp-export-1.xml', 'w') or die("can't open file");
if (fwrite($fh, $wordpress_final) === false) {
    fclose($fh);
    die("can't write file");
}
fclose($fh);
?>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
nice script bro