Skip to content

Instantly share code, notes, and snippets.

@lordmatt
Last active December 15, 2015 14:49
Show Gist options
  • Save lordmatt/5276823 to your computer and use it in GitHub Desktop.
Save lordmatt/5276823 to your computer and use it in GitHub Desktop.
Feed Fetch: Fetches feeds and caches them for a few hours and returns them in handy dandy simpleXML objects.
<?php
/**
* Feed Fetch: Fetches feeds and caches them for a few hours.
*
* Copyright (C) 2007, 2013 Matthew Brown aka Lord Matt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*********
* ABOUT *
*********
*
* This class provides the methods and functionality used to fetch one or more
* feeds for use by other systems. It provides cache control to minimise impact
* on other people's servers.
*
* This is based on a similar class that I wrote many years ago but has better
* cache control. Most of the code has been re-written for the 2013 edition.
*
* I am going to pick one fetch method that works for me but ideally I'd like
* this class to detect and try any and all available fetch methods so it "just
* works" in over 90% of cases.
*
* To make life simpler the feeds come back as SimpleXML objects.
*
* @PHP 5+
* @author lordmatt
* @version 2.0
*/
class feedfetch {

    /**
     * Registered feed sources, indexed by the key returned from addSource().
     * Each entry is array('url' => string, 'format' => string).
     * @var array
     */
    protected $sources = array();

    /**
     * Results of the most recent fetchFeeds() run, indexed by source key.
     * Each entry is array('format' => string, 'feed' => SimpleXMLElement|false, 'key' => int).
     * @var array
     */
    protected $feeds = array();

    /**
     * User-Agent header sent with every curl request.
     * @var string
     */
    protected $userAgent = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';

    /**
     * Prefix prepended to every cache file name.
     * @var string
     */
    protected $prefix = 'cachedfeed';

    /**
     * Path to the cache directory (concatenated directly with the file name).
     * @var string
     */
    protected $cache = './cache/';

    /**
     * Length of time (in hours) to keep using a cached feed before trying for
     * a live copy. 0 means always get live.
     * @var int|float
     */
    protected $minCacheAge = 3;

    /**
     * When true, errorReport() echoes errors; when false it stays silent.
     * @var bool
     */
    protected $showerrors = true;

    // public methods

    /**
     * Sets the cache location. The path is concatenated directly with the
     * file name, so it should end in a slash unless you want to force an
     * extra file name prefix.
     * @param string $path
     */
    public function setCachePath($path){
        $this->cache = $path;
    }

    /**
     * Sets how old (in hours) a cached copy may be before a live fetch is
     * attempted. 0 means always try for a live copy.
     * @param int|float $hours
     */
    public function setMinCacheAge($hours){
        $this->minCacheAge = $hours;
    }

    /**
     * Flips error output on or off.
     */
    public function toggleErrors(){
        $this->showerrors = !$this->showerrors;
    }

    /**
     * Replaces the User-Agent header sent with fetch requests.
     * @param string $newAgent
     */
    public function setUserAgent($newAgent){
        $this->userAgent = $newAgent;
    }

    /**
     * Registers a feed URL together with a format hint and returns the array
     * key for that entry.
     * @param string $url
     * @param string $format free-form hint that is passed through to the result
     * @return int array key for the feed entry
     */
    public function addSource($url,$format='unknown'){
        // Store both values: fetchFeeds() reads 'url' and 'format' from each entry.
        $this->sources[] = array(
            'url'=>$url,
            'format'=>$format
        );
        return (count($this->sources)-1);
    }

    /**
     * Sets the prefix used for cache file names.
     * @param string $text
     */
    public function setFileNamePrefix($text){
        $this->prefix = $text;
    }

    /**
     * Takes the $key for a feed (as returned by addSource) and returns the
     * cache age in hours.
     * returns false if key not found
     * returns false if cache not found
     * @param int $key
     * @return float|false
     */
    public function getCacheAge($key){
        if( isset($this->sources[$key]) ){
            return $this->cacheAge($this->sources[$key]['url']);
        }
        return false;
    }

    /**
     * Fetches every registered source (from cache when fresh enough, live
     * otherwise) and stores the results indexed by source key, so calling
     * this more than once does not create duplicate entries.
     * @return array the fetched feeds, same structure as getFeeds()
     */
    public function fetchFeeds(){
        foreach($this->sources as $key=>$source){
            $format = isset($source['format']) ? $source['format'] : 'unknown';
            $this->feeds[$key] = array(
                'format'=>$format,
                'feed'=>$this->getThisFeed($source['url']),
                'key'=>$key
            );
        }
        return $this->feeds;
    }

    /**
     * Returns the results collected by fetchFeeds(), indexed by source key.
     * Each entry holds 'format', 'feed' (SimpleXMLElement or false) and 'key'.
     * @return array
     */
    public function getFeeds(){
        return $this->feeds;
    }

    // internals

    /**
     * Gives us a filename that shouldn't clash with any other: the prefix
     * followed by the md5 of the URL and an .xml extension.
     * @param string $url
     * @return string filename
     */
    protected function cacheName($url){
        return $this->prefix . md5($url) . '.xml';
    }

    /**
     * Full path of the cache file for a URL (cache path + cache file name).
     * @param string $url
     * @return string
     */
    protected function cachePath($url){
        return $this->cache . $this->cacheName($url);
    }

    /**
     * Loads the cached copy of a feed. Invalid XML is reported through
     * errorReport() instead of surfacing as raw libxml warnings.
     * @param string $url
     * @return SimpleXMLElement|false false if the file is missing or unparsable
     */
    protected function attemptCacheGet($url){
        $filename = $this->cachePath($url);
        if (!file_exists($filename)) {
            return false;
        }
        // Collect libxml errors internally for the duration of the parse,
        // then restore whatever setting the caller had.
        $previous = libxml_use_internal_errors(true);
        $xml = simplexml_load_file($filename);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if ($xml === false) {
            $this->errorReport('Cached feed is not valid XML');
        }
        return $xml;
    }

    /**
     * Takes a URL and returns the cache age in hours (rounded to two decimal
     * places) or returns false if file not found. Rounding is used as we are
     * only interested in whole numbers of hours so 2dp is sufficient for now.
     * @param string $url
     * @return float|false hours or false
     */
    protected function cacheAge($url){
        $filename = $this->cachePath($url);
        // Drop cached stat results so a file written earlier in this request is seen.
        clearstatcache();
        if (!file_exists($filename)) {
            return false;
        }
        $age = time() - filemtime($filename);
        return round( ($age / (60*60)), 2, PHP_ROUND_HALF_DOWN); #(seconds and minutes)
    }

    /**
     * Echoes an error wrapped in a div when error output is enabled.
     * The message is HTML-escaped because it can contain text that comes
     * from curl or from the feed URL.
     * @param string $error
     */
    protected function errorReport($error){
        if($this->showerrors){
            $safe = htmlspecialchars($error, ENT_QUOTES, 'UTF-8');
            echo "<div class='error'>$safe</div>";
        }
    }

    /**
     * This method attempts to get the feed and if successful writes it to
     * cache; otherwise it reports failure and leaves any existing cache file
     * untouched so it can still be used as a fallback.
     * Extend this method with your own fetch or fail if curl does not suit.
     * I'd like to have this method detect available method and use it one day.
     * @param string $url
     * @return boolean true when a fresh copy was written to the cache
     */
    protected function getFileToCache($url){
        if(!function_exists('curl_init')){
            $this->errorReport('Curl error: the curl extension is not available');
            return false;
        }
        // The transfer itself is bounded by CURLOPT_TIMEOUT below.
        set_time_limit(0);
        $ch = curl_init();
        if($ch === false){
            $this->errorReport('Curl error: could not initialise a curl handle');
            return false;
        }
        curl_setopt_array($ch, array(
            CURLOPT_URL => $url,
            // Fetch into memory first; the cache file is only written on success.
            CURLOPT_RETURNTRANSFER => true,
            // Treat HTTP status codes >= 400 as failures so error pages are not cached.
            CURLOPT_FAILONERROR => true,
            CURLOPT_TIMEOUT => 50,
            CURLOPT_USERAGENT => $this->userAgent
        ));
        $body = curl_exec($ch);
        if($body === false){
            $this->errorReport('Curl error: ' . curl_error($ch));
            return false;
        }
        if($body === ''){
            $this->errorReport('Curl error: empty response');
            return false;
        }
        if(file_put_contents($this->cachePath($url), $body, LOCK_EX) === false){
            $this->errorReport('Could not write feed to cache');
            return false;
        }
        return true;
    }

    /**
     * Returns the feed for a URL: the cached copy when it is fresh enough,
     * otherwise a freshly fetched copy, falling back to a stale cached copy
     * when the live fetch fails.
     * @param string $url
     * @return SimpleXMLElement|false false or SimpleXML object
     */
    protected function getThisFeed($url){
        $age = $this->cacheAge($url);
        // A minimum age of 0 (or less) means "always get live".
        $needsFetch = ($age === false)
            || ($this->minCacheAge <= 0)
            || ($age > $this->minCacheAge);
        if(!$needsFetch){
            // cache is fresh
            return $this->attemptCacheGet($url);
        }
        if($this->getFileToCache($url)){
            // file should now be in cache
            return $this->attemptCacheGet($url);
        }
        if($age !== false){
            // not fetched but we have an older cached copy
            return $this->attemptCacheGet($url);
        }
        // failed
        $this->errorReport('Could not load feed or find it in cache');
        return false;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment