Web scraping with ReactPHP, cURL, and proxies: a curl_multi example that scrapes news.ycombinator.com. The script pulls a pool of proxies from GimmeProxy, fetches the Hacker News front page and every topic page it links to through random proxies, and collects each topic's title plus per-user comment counts. Explanation here: http://256cats.com/fast-scraping-with-reactphp-curl-proxies/
composer.json:

{
    "require": {
        "khr/react-curl": "~2.0",
        "sunra/php-simple-html-dom-parser": "~1.5"
    }
}
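Run composer install to fetch both packages, then execute the script below with the PHP CLI (say, php scraper.php; the filename is arbitrary). PHP 5.4+ is required for the short array syntax and for $this binding inside closures.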
<?php
require_once __DIR__ . '/vendor/autoload.php';

use React\EventLoop\Factory;
use KHR\React\Curl\Curl;
use KHR\React\Curl\Result;
use KHR\React\Curl\Exception;
use Sunra\PhpSimple\HtmlDomParser;

date_default_timezone_set('Asia/Jakarta');
mb_internal_encoding('UTF-8');

class Crawler {

    private $proxies = [], $topics = [], $curl = null;

    /** Load proxies from gimmeproxy.com **/
    private function loadProxies($num) {
        echo "Loading {$num} proxies\n";
        for ($i = 0; $i < $num; $i++) {
            $data = json_decode(file_get_contents('http://gimmeproxy.com/api/get/test/?timeout=0'), true);
            if (!empty($data['curl'])) { // skip failed fetches and responses without a proxy string
                $this->proxies[] = $data['curl'];
            }
        }
    }
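    // For reference, GimmeProxy answers with JSON and only its 'curl' field is
    // used above; an illustrative (not real) response fragment:
    //   {"curl": "http://123.45.67.89:8080", "ip": "123.45.67.89", "port": "8080", ...}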
    /** Pick a random proxy and return it as a per-request cURL option **/
    private function getProxyOption() {
        $key = array_rand($this->proxies);
        return [CURLOPT_PROXY => $this->proxies[$key]];
        // return []; // uncomment (and remove the line above) to bypass proxies
    }
    /** Get the request URL from a Result **/
    private function resultGetUrl(Result $result) {
        return $result->getOptions()[CURLOPT_URL];
    }
    /** Parse the main page and queue every topic page it links to **/
    public function parseMainPage($result) {
        echo "Loaded " . $this->resultGetUrl($result) . "\n";
        $links = [];
        $dom = HtmlDomParser::str_get_html($result);
        foreach ($dom->find('a[href^=item]') as $a) { // topic links start with "item"
            $href = $a->href;
            if (!isset($links[$href])) { // queue each topic only once
                echo "Get {$href}\n";
                $links[$href] = 1;
                // Fetch the topic page through a random proxy
                $this->curl->get('https://news.ycombinator.com/' . $href, $this->getProxyOption())->then(
                    array($this, 'parseTopicPage'), // promise resolved: parse the topic page
                    function ($exception) { // promise rejected, i.e. some error occurred
                        echo "Error loading url " . $this->resultGetUrl($exception->result) . ": " . $exception->getMessage() . "\n";
                    }
                );
            }
        }
        $dom->clear();
    }
    /** Parse a topic page: collect its title and per-user comment counts **/
    public function parseTopicPage($result) {
        echo "Successfully loaded " . $this->resultGetUrl($result) . "\n";
        $topic = ['title' => '', 'users' => []];
        $dom = HtmlDomParser::str_get_html($result);
        $title = $dom->find('.title a', 0);
        if ($title) { // guard against error pages with no title element
            $topic['title'] = $title->innertext;
        }
        foreach ($dom->find('a[href^=user]') as $a) { // one user link per comment
            $username = trim($a->innertext);
            $topic['users'][$username] = isset($topic['users'][$username]) ? $topic['users'][$username] + 1 : 1;
        }
        $this->topics[] = $topic;
        $dom->clear();
    }
    /** Run the crawler **/
    public function run() {
        $this->loadProxies(10); // load 10 proxies from GimmeProxy.com

        $loop = Factory::create();
        $this->curl = new Curl($loop);
        $this->curl->client->setMaxRequest(5);        // number of parallel requests
        $this->curl->client->setSleep(2, 1.0, false); // at most 2 requests per second
        $this->curl->client->setCurlOption([
            CURLOPT_AUTOREFERER    => true,
            CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT        => 10,
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => 0,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS      => 9,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HEADER         => false,
        ]);

        // Sanity check: icanhazip.com echoes back the requesting IP,
        // confirming that the proxy actually works
        $this->curl->get('http://icanhazip.com/', $this->getProxyOption())->then(function ($result) {
            echo $result . "\n";
        });

        $this->curl->get('https://news.ycombinator.com/', $this->getProxyOption())->then(
            array($this, 'parseMainPage') // on success, parse the main page
        );

        $this->curl->run();
        $loop->run();
        print_r($this->topics);
    }
}
$crawler = new Crawler();
$crawler->run();
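One caveat in run(): the main-page request attaches only a success handler, so if the proxy chosen for that first request fails, the crawl ends silently. A minimal sketch of a drop-in replacement for that call, reusing the rejection-handler pattern the script already applies to topic pages (nothing assumed beyond what the script imports):

$this->curl->get('https://news.ycombinator.com/', $this->getProxyOption())->then(
    array($this, 'parseMainPage'),
    function ($exception) {
        // Report the failed main-page fetch instead of dying silently
        echo "Error loading main page: " . $exception->getMessage() . "\n";
    }
);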