Skip to content

Instantly share code, notes, and snippets.

@zachbrowne
Created July 11, 2011 03:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save zachbrowne/1075289 to your computer and use it in GitHub Desktop.
Save zachbrowne/1075289 to your computer and use it in GitHub Desktop.
Google News Parser for WordPress
<?php
/*
*
* Plugin Name: Google News Parser
* Plugin URI: http://zachbrowne.com
* Description: Imports news from Google and creates posts for them. For information regarding this
* plugin, please contact the developer: <strong>zachbrowne@gmail.com</strong>
* Version: 1.0
* Author: Zach Browne
* Author URI: http://zachbrowne.com
*
*/
if (!get_cfg_var('safe_mode')) {
ini_set('max_execution_time', 0);
set_time_limit(0);
}
add_action('admin_menu', 'googlenews_admin_actions');
add_action('googlenews_hook_job', 'googlenews_cron_hook');
function googlenews_cron_hook() {
import_news();
die();
}
function googlenews_admin_actions() {
add_menu_page('Google News Poster', 'Google News Poster', 'manage_options', 'news-poster', 'googlenews_admin');
}
function googlenews_admin() {
if (!current_user_can('manage_options')) {
wp_die( __('You do not have sufficient permissions to access this page.'));
}
googlenews_main();
}
function create_category($category) {
$category = mysql_real_escape_string($category);
$id = get_cat_ID($category);
if(empty($id)) {
$id = wp_insert_term($category, 'category');
return $id['term_id'];
} else {
return $id;
}
}
function download_image($image_url, $keyword) {
$upload_dir = wp_upload_dir();
$upload_dir = $upload_dir['basedir'];
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_TIMEOUT, 5);
curl_setopt($ch, CURLOPT_BINARYTRANSFER,1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
curl_setopt($ch, CURLOPT_URL, $image_url);
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6");
curl_setopt($ch, CURLOPT_ENCODING, "gzip,deflate");
$image = curl_exec($ch);
$results = curl_getinfo($ch);
$uid = uniqid();
$fp = fopen("$upload_dir/".$keyword."-".$uid.".jpg",'w');
fwrite($fp, $image);
fclose($fp);
return "$upload_dir/".$keyword."-".$uid.".jpg";
}
function import_news() {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://www.google.com/trends/hottrends/atom/hourly');
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6");
curl_setopt($ch, CURLOPT_TIMEOUT, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$page = curl_exec($ch);
preg_match_all('/<a href="(.+)">(.*)<\/a>/siU', $page, $arr, PREG_SET_ORDER);
for($i=0; $i < 2; $i++) {
unset($article);
unset($page);
$trend = $arr[rand(0, count($arr)-1)];
$article['keyword'] = $trend[2];
echo "Importing news for <b>".$article['keyword']."</b>\n<br>";
flush();
// Related keywords
curl_setopt($ch, CURLOPT_URL, $trend[1]);
$page = curl_exec($ch);
unset($karr);
preg_match('/<b>Related searches\:<\/b><br>(.*)<br><br>/i', $page, $karr);
if(!empty($karr)) {
$karr[1] = trim(strip_tags($karr[1]));
unset($split);
$split = explode(',', $karr[1]);
foreach($split as $value) {
$article['keywords'][] = $value;
}
}
// News
unset($page);
curl_setopt($ch, CURLOPT_URL, "http://ajax.googleapis.com/ajax/services/search/news?v=1.0&rsz=8&ned=us&q=".urlencode($trend[2]));
$page = curl_exec($ch);
$results = json_decode($page);
$article['title'] = $results->responseData->results[0]->titleNoFormatting;
$article['publisher'] = $results->responseData->results[0]->publisher;
foreach($results->responseData->results as $result) {
$result->content = str_replace("...", "", $result->content);
$article['content'][] = trim(strip_tags($result->content));
if(!empty($result->image)) {
$article['images'][] = $result->image->tbUrl."/6.jpg";
}
}
// Blog Search
unset($images);
curl_setopt($ch, CURLOPT_URL, "http://ajax.googleapis.com/ajax/services/search/blogs?v=1.0&rsz=8&ned=us&q=".urlencode($trend[2]));
$page = curl_exec($ch);
$results = json_decode($page);
foreach($results->responseData->results as $result) {
$result->content = str_replace("...", "", $result->content);
$article['content'][] = trim(strip_tags($result->content));
}
$article['content'] = array_unique($article['content']);
// Images
if(count($article['images']) == 0) {
unset($page);
curl_setopt($ch, CURLOPT_URL, "http://www.google.com/news?pz=1&cf=i&ned=us&hl=en&source=uds&cf=i&output=rss&q=".urlencode($trend[2]));
$page = curl_exec($ch);
unset($arr2);
preg_match_all('/img src=&quot;(.*)&quot;/siU', $page, $arr2, PREG_SET_ORDER);
foreach($arr2 as $image) {
$article['images'][] = $image[1];
}
}
$article['keywords'][] = $article['keyword'];
for($b=0; $b < 5; $b++) {
$random = rand(0, count($article['content'])-1);
if($b == 0) {
$first = $random;
}
if($b == 4) {
$article['body'] .= "<p>".$article['content'][$random]."</p>\n";
} else {
$article['body'] .= "<p>".$article['content'][$random]."</p>";
}
}
$article['category'][] = create_category(ucwords($article['keyword']));
$new_post = array(
'post_title' => $article['title'],
'post_content' => $article['body'],
'post_status' => 'publish',
'post_date' => date('Y-m-d H:i:s'),
'post_type' => 'post',
'tags_input' => $article['keywords'],
'post_category' => $article['category']
);
$post_id = wp_insert_post($new_post);
foreach($article['images'] as $value) {
unset($image);
$image = download_image($value, sanitize_title($article['keyword']));
$article['attachments'][] = $image;
$atype = wp_check_filetype(basename($image), null);
$attachment = array(
'post_mime_type' => $atype['type'],
'post_status' => 'inherit'
);
wp_insert_attachment($attachment, $image, $post_id);
}
add_post_meta($post_id, 'article_thumbnail', basename($article['attachments'][0]));
add_post_meta($post_id, 'keyword', $article['keyword']);
add_post_meta($post_id, 'publisher', $article['publisher']);
add_post_meta($post_id, 'content', $article['content'][0]);
add_post_meta($post_id, 'content2', $article['content'][1]);
add_post_meta($post_id, 'content3', $article['content'][2]);
add_post_meta($post_id, 'content4', $article['content'][3]);
add_post_meta($post_id, 'content5', $article['content'][4]);
add_post_meta($post_id, '_aioseop_description', substr($article['content'][$first], 0, 160));
}
update_option('googlenews_first_time', "true");
update_option('googlenews_last_check', date("F j, Y, G:i"));
update_option('googlenews_scraped', get_option('googlenews_scraped') + 2);
echo "<br>\n<b>Finished importing articles</b>";
}
function googlenews_main() {
$first_time = get_option('googlenews_first_time');
$last_time = get_option('googlenews_last_check');
$scraped = get_option('googlenews_scraped');
?>
<div class="wrap">
<?php
if($_GET['function'] == "scrape") {
echo "<h2>" . __('Manual Scrape', 'googlenews_trdom') . "</h2>"; ?>
<p>The plugin is currently manually importing articles from Google. Please wait until the page finishes loading!</p>
<?php import_news();
} elseif(!empty($first_time)) {
if($_POST['googlenews_settings'] == "true") {
if($_POST['googlenews_job'] == "hourly") {
wp_clear_scheduled_hook('googlenews_hook_job');
if (!wp_next_scheduled('googlenews_hook_job')) {
wp_schedule_event(time(), 'hourly', 'googlenews_hook_job');
}
update_option('googlenews_job_option', 'hourly');
echo "<h2>" . __('Google News Poster', 'googlenews_trdom') . "</h2>"; ?>
<p>The plugin willl now begin to automatically import articles <b>hourly</b>.</p>
<p><input class='button-secondary' type='button' name='Go Back' value='<?php _e('Go Back'); ?>' id='submitbutton' onClick="location.href='<?php echo str_replace('%7E', '~', $_SERVER['REQUEST_URI']); ?>'" /></p>
<?php } else {
wp_clear_scheduled_hook('googlenews_hook_job');
update_option('googlenews_job_option', 'manual');
echo "<h2>" . __('Google News Poster', 'googlenews_trdom') . "</h2>"; ?>
<p>Automatic cron jobs for this plugin have been <b>removed</b>.</p>
<p><input class='button-secondary' type='button' name='Go Back' value='<?php _e('Go Back'); ?>' id='submitbutton' onClick="location.href='<?php echo str_replace('%7E', '~', $_SERVER['REQUEST_URI']); ?>'" /></p>
<?php } } else { ?>
<?php echo "<h2>" . __('Plugin Statistics', 'googlenews_trdom') . "</h2>"; ?>
<form name="googlenews_form" method="post" action="<?php echo str_replace('%7E', '~', $_SERVER['REQUEST_URI']);
?>">
<input type="hidden" name="googlenews_settings" value="true">
<p>Google News Poster is currently active.</p>
<p><strong>Last Run:</strong> <?php echo $last_time ?></p>
<p><strong>Total Articles Imported:</strong> <?php echo $scraped ?></p>
<select name="googlenews_job">
<?php
if (get_option('googlenews_job_option') == 'hourly') {
?>
<option value="hourly">Hourly (Cron Job)</option>
<option value="manually">Manually</option>
<?php
} else {
?>
<option value="manually">Manually</option>
<option value="hourly">Hourly (Cron Job)</option>
<?php }
?>
</select> (Currently <?php echo get_option('googlenews_job_option');
?>)
<p class="submit">
<input class='button-primary' type='submit' name='Save' value='<?php _e('Save Options'); ?>' id='submitbutton' /> <input class='button-secondary' type='button' name='Save' value='<?php _e('Run Scraper (manual)'); ?>' id='submitbutton' onClick="location.href='<?php echo str_replace('%7E', '~', $_SERVER['REQUEST_URI']); ?>&function=scrape'" />
</p>
<?php } } else {
if($_POST['googlenews_first'] == "true") {
import_news();
} else {
echo "<h2>" . __('First Time', 'googlenews_trdom') . "</h2>"; ?>
<p>Since you are using Google News Autoposter for the first time, you will need to click the <strong>Import News</strong> button to import the initial news articles available on Google.</p>
<form name="googlenews_form" method="post" action="<?php echo str_replace('%7E', '~', $_SERVER['REQUEST_URI']);
?>">
<input type="hidden" name="googlenews_first" value="true">
<p class="submit">
<input class="button-primary" type="submit" name="Submit" value="<?php _e('Import News', 'googlenews_trdom') ?>" />
</p>
<?php } } ?>
</div>
<?php } ?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment