Last active
December 15, 2015 14:49
-
-
Save lordmatt/5276823 to your computer and use it in GitHub Desktop.
Feed Fetch: Fetches feeds and caches them for a few hours and returns them in handy dandy simpleXML objects.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/** | |
* Feed Fetch: Fetches feeds and caches them for a few hours. | |
* | |
* Copyright (C) 2007, 2013 Matthew Brown aka Lord Matt | |
* | |
* This program is free software: you can redistribute it and/or modify | |
* it under the terms of the GNU General Public License as published by | |
* the Free Software Foundation, either version 3 of the License, or | |
* (at your option) any later version. | |
* | |
* This program is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU General Public License for more details. | |
* | |
* You should have received a copy of the GNU General Public License | |
* along with this program. If not, see <http://www.gnu.org/licenses/>. | |
* | |
********* | |
* ABOUT * | |
********* | |
* | |
* This class provides the methods and functionality used to fetch one or more | |
* feeds for use by other systems. It provides cache control to minimise impact | |
* on other people's servers.
* | |
* This is based on a similar class that I wrote many years ago but has better | |
* cache control. Most of the code has been re-written for the 2013 edition. | |
* | |
* I am going to pick one fetch method that works for me but ideally I'd like | |
* this class to detect and try any and all available fetch methods so it "just | |
* works" in over 90% of cases. | |
* | |
* To make life simpler the feeds come back as SimpleXML objects.
* | |
* @PHP 5+ | |
* @author lordmatt | |
* @version 2.0 | |
*/
class feedfetch {

    /**
     * Registered feed sources, indexed by the key returned from addSource().
     * Each entry is array('url' => string, 'format' => string).
     * @var array
     */
    protected $sources = array();

    /**
     * Results of the most recent fetchFeeds() run, indexed by source key.
     * Each entry is array('format' => string, 'feed' => SimpleXMLElement|false, 'key' => int).
     * @var array
     */
    protected $feeds = array();

    /**
     * User-Agent header sent with every feed request.
     * @var string
     */
    protected $userAgent = 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)';

    /**
     * Text prepended to every cache file name.
     * @var string
     */
    protected $prefix = 'cachedfeed';

    /**
     * Path the cache file name is appended to (see setCachePath()).
     * @var string
     */
    protected $cache = './cache/';

    /**
     * Length of time, in hours, to keep using a cached feed before trying for
     * a live copy. 0 means always get live.
     * @var int|float
     */
    protected $minCacheAge = 3;

    /**
     * When true, errorReport() echoes error messages as HTML.
     * @var bool
     */
    protected $showerrors = true;

    // public methods

    /**
     * Sets where cache files are written. The cache file name is appended
     * directly to this value, so the path should end in a slash unless you
     * want to force an extra file name prefix.
     * @param string $path
     */
    public function setCachePath($path){
        $this->cache = $path;
    }

    /**
     * Sets how old (in hours) a cached feed may get before a live copy is
     * attempted. 0 means always get live.
     * @param int|float $hours
     */
    public function setMinCacheAge($hours){
        $this->minCacheAge = $hours;
    }

    /**
     * Flips error display: on becomes off, anything else becomes on.
     */
    public function toggleErrors(){
        $this->showerrors = ($this->showerrors !== true);
    }

    /**
     * Sets the User-Agent header sent with feed requests.
     * @param string $newAgent
     */
    public function setUserAgent($newAgent){
        $this->userAgent = $newAgent;
    }

    /**
     * Registers a feed URL together with a format hint and returns the array
     * key for that entry. The same key indexes the results of fetchFeeds()
     * and is accepted by getCacheAge().
     * @param string $url
     * @param string $format free-form hint, stored and passed through untouched
     * @return int array key for feed entry
     */
    public function addSource($url,$format='unknown'){
        // Store URL and format together: fetchFeeds() reads both per source.
        $this->sources[] = array(
            'url'    => $url,
            'format' => $format,
        );
        return (count($this->sources)-1);
    }

    /**
     * Sets the text prepended to every cache file name.
     * @param string $text
     */
    public function setFileNamePrefix($text){
        $this->prefix = $text;
    }

    /**
     * Takes the $key for a feed (as returned by addSource()) and returns the
     * cache age in hours.
     * returns false if key not found
     * returns false if cache not found
     * @param int $key
     * @return float|false
     */
    public function getCacheAge($key){
        if( !isset($this->sources[$key]) ){
            return false;
        }
        // The cache file is named after the source URL, so look the URL up
        // from the registered sources.
        $source = $this->normaliseSource($this->sources[$key]);
        return $this->cacheAge($source['url']);
    }

    /**
     * Fetches every registered source (from cache when fresh enough, live
     * otherwise) and records the result under the source's key.
     * @return array the collected feeds, see getFeeds()
     */
    public function fetchFeeds(){
        foreach($this->sources as $key=>$source){
            $source = $this->normaliseSource($source);
            // Index by source key so a repeated call replaces results rather
            // than appending duplicates.
            $this->feeds[$key] = array(
                'format' => $source['format'],
                'feed'   => $this->getThisFeed($source['url']),
                'key'    => $key,
            );
        }
        return $this->feeds;
    }

    /**
     * Returns the results collected by fetchFeeds(), indexed by source key.
     * Each entry is array('format' => string, 'feed' => SimpleXMLElement|false, 'key' => int).
     * @return array
     */
    public function getFeeds(){
        return $this->feeds;
    }

    // internals

    /**
     * Returns a source entry as array('url' => ..., 'format' => ...).
     * Accepts a bare URL string too, in case a subclass fills $sources with
     * plain URLs.
     * @param array|string $source
     * @return array
     */
    protected function normaliseSource($source){
        if(is_array($source)){
            return array(
                'url'    => isset($source['url']) ? $source['url'] : '',
                'format' => isset($source['format']) ? $source['format'] : 'unknown',
            );
        }
        return array(
            'url'    => (string) $source,
            'format' => 'unknown',
        );
    }

    /**
     * Builds the cache file name for a URL: prefix + md5 of the URL + ".xml".
     * @param string $url
     * @return string filename
     */
    protected function cacheName($url){
        return $this->prefix . md5($url) . '.xml';
    }

    /**
     * Builds the full cache path for a URL (cache path + cache file name).
     * @param string $url
     * @return string
     */
    protected function cachePath($url){
        return $this->cache . $this->cacheName($url);
    }

    /**
     * Loads the cached copy of a feed as a SimpleXML object.
     * Returns false when there is no cache file or when the file cannot be
     * parsed as XML; a parse failure is sent to errorReport().
     * @param string $url
     * @return SimpleXMLElement|false
     */
    protected function attemptCacheGet($url){
        $filename = $this->cachePath($url);
        if (!file_exists($filename)) {
            return false;
        }
        // Collect libxml errors internally so a malformed feed does not spray
        // PHP warnings; restore the caller's setting afterwards.
        $previous = libxml_use_internal_errors(true);
        // LIBXML_NONET: the file holds remote content, so the parser is not
        // allowed network access while reading it.
        $xml = simplexml_load_file($filename, 'SimpleXMLElement', LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        if ($xml === false) {
            $this->errorReport('Cached feed could not be parsed as XML');
            return false;
        }
        return $xml;
    }

    /**
     * Takes a URL and returns the age of its cache file in hours, rounded to
     * two decimal places, or false if the file is not found or its
     * modification time cannot be read.
     * @param string $url
     * @return float|false
     */
    protected function cacheAge($url){
        $filename = $this->cachePath($url);
        clearstatcache();
        if (!file_exists($filename)) {
            return false;
        }
        $modified = filemtime($filename);
        if ($modified === false) {
            return false;
        }
        $age = time() - $modified;
        return round( ($age / (60*60)), 2, PHP_ROUND_HALF_DOWN); #(seconds and minutes)
    }

    /**
     * Echoes an error message as an HTML div when error display is on.
     * The message is HTML-escaped because it can contain text that came from
     * outside (e.g. curl error strings).
     * @param string $error
     */
    protected function errorReport($error){
        if($this->showerrors){
            echo "<div class='error'>" . htmlspecialchars($error, ENT_QUOTES, 'UTF-8') . "</div>";
        }
    }

    /**
     * This method should attempt to get the feed and if successful write it to
     * cache; otherwise report failure.
     * The cache file is only written after a successful download, so a failed
     * fetch leaves any existing cached copy intact for getThisFeed() to fall
     * back on.
     * Extend this method with your own fetch or fail if curl does not suit.
     * @param string $url
     * @return boolean true when a fresh copy was written to the cache
     */
    protected function getFileToCache($url){
        if (!function_exists('curl_init')) {
            $this->errorReport('Curl error: the curl extension is not available');
            return false;
        }
        set_time_limit(0);
        $ch = curl_init();
        if ($ch === false) {
            $this->errorReport('Curl error: could not initialise curl');
            return false;
        }
        curl_setopt_array($ch, array(
            CURLOPT_URL            => $url,
            // Take the body as a string; it is written to disk only on success.
            CURLOPT_RETURNTRANSFER => true,
            // Treat HTTP status >= 400 as a failure instead of caching an error page.
            CURLOPT_FAILONERROR    => true,
            CURLOPT_TIMEOUT        => 50,
            CURLOPT_USERAGENT      => $this->userAgent,
        ));
        // Execute exactly once; the handle is released when $ch goes out of scope.
        $body = curl_exec($ch);
        if ($body === false) {
            $this->errorReport('Curl error: ' . curl_error($ch));
            return false;
        }
        if ($body === '') {
            $this->errorReport('Curl error: empty response');
            return false;
        }
        $target = $this->cachePath($url);
        $dir = dirname($target);
        if (!is_dir($dir) || !is_writable($dir)) {
            $this->errorReport('Cache directory is missing or not writable');
            return false;
        }
        if (file_put_contents($target, $body, LOCK_EX) === false) {
            $this->errorReport('Could not write feed to cache');
            return false;
        }
        return true;
    }

    /**
     * Returns the feed for a URL: the cached copy while it is younger than
     * the minimum cache age, otherwise a freshly fetched copy, falling back
     * to a stale cached copy when the fetch fails.
     * @param string $url
     * @return SimpleXMLElement|false
     */
    protected function getThisFeed($url){
        $age = $this->cacheAge($url);
        // ">=" so that a minimum cache age of 0 really means "always get live".
        $needsFetch = ($age === false || $age >= $this->minCacheAge);
        if (!$needsFetch) {
            // cache is fresh
            return $this->attemptCacheGet($url);
        }
        if ($this->getFileToCache($url)) {
            // file should now be in cache
            return $this->attemptCacheGet($url);
        }
        if ($age !== false) {
            // not fetched but we have (stale) cache
            return $this->attemptCacheGet($url);
        }
        // failed
        $this->errorReport('Could not load feed or find it in cache');
        return false;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment