Skip to content

Instantly share code, notes, and snippets.

@256cats
Last active May 28, 2024 19:18
Show Gist options
  • Save 256cats/7a704640f33965a7eb92 to your computer and use it in GitHub Desktop.
Save 256cats/7a704640f33965a7eb92 to your computer and use it in GitHub Desktop.
Web scraping with ReactPHP, curl multi, and proxies: an example that scrapes Hacker News (news.ycombinator.com). Explanation here: http://256cats.com/fast-scraping-with-reactphp-curl-proxies/
{
"require": {
"khr/react-curl": "~2.0",
"sunra/php-simple-html-dom-parser": "~1.5"
}
}
<?php
require_once __DIR__ . '/vendor/autoload.php';
use \React\EventLoop\Factory;
use KHR\React\Curl\Curl;
use KHR\React\Curl\Result;
use KHR\React\Curl\Exception;
use Sunra\PhpSimple\HtmlDomParser;
// Script-wide runtime defaults: mb_* functions operate on UTF-8 and the
// date/time functions use the Asia/Jakarta timezone.
mb_internal_encoding('UTF-8');
date_default_timezone_set('Asia/Jakarta');
/**
 * Crawls the Hacker News front page and every topic linked from it.
 *
 * All requests are issued asynchronously through khr/react-curl and each one
 * is routed through a randomly chosen proxy obtained from gimmeproxy.com.
 * For every topic page the title and a per-user occurrence count (number of
 * "user" links carrying that name) are collected and printed once the event
 * loop has drained.
 */
class Crawler {
    /** @var string[] Proxy addresses usable as CURLOPT_PROXY values. */
    private $proxies = [];

    /** @var array[] Collected topics: ['title' => string, 'users' => [name => count]]. */
    private $topics = [];

    /** @var Curl|null Async curl client; created in run(). */
    private $curl = null;

    /**
     * Load proxies from gimmeproxy.com.
     *
     * Makes $num blocking API calls. Responses that cannot be fetched or that
     * carry no usable "curl" field are skipped instead of being stored as
     * null entries in the pool.
     *
     * @param int $num Number of API calls to make.
     * @throws \RuntimeException When not a single usable proxy was obtained.
     */
    private function loadProxies($num) {
        echo "Load {$num} proxies\n";
        for ($i = 0; $i < $num; $i++) {
            $response = file_get_contents('http://gimmeproxy.com/api/get/test/?timeout=0');
            $proxy = ($response === false) ? null : $this->extractProxy($response);
            if ($proxy === null) {
                echo "Skipping unusable proxy response\n";
                continue;
            }
            $this->proxies[] = $proxy;
        }
        // An empty pool would make getProxyOption() fail on array_rand(), so
        // stop here with a clear message instead.
        if (count($this->proxies) === 0) {
            throw new \RuntimeException('Could not load any proxies from gimmeproxy.com');
        }
    }

    /**
     * Extract the proxy address from a raw gimmeproxy.com API response.
     *
     * @param string $json Raw response body.
     * @return string|null Value of the "curl" field, or null when the body is
     *                     not a JSON object or the field is missing or empty.
     */
    private function extractProxy($json) {
        $data = json_decode($json, true);
        if (!is_array($data) || !isset($data['curl'])) {
            return null;
        }
        $proxy = $data['curl'];
        if (!is_string($proxy) || $proxy === '') {
            return null;
        }
        return $proxy;
    }

    /**
     * Build the curl option that routes a request through a random proxy.
     *
     * @return array Single-element option array keyed by CURLOPT_PROXY.
     */
    private function getProxyOption() {
        $key = array_rand($this->proxies);
        return [CURLOPT_PROXY => $this->proxies[$key]];
    }

    /**
     * Get the requested url from a result.
     *
     * @param Result $result
     * @return string The CURLOPT_URL option stored on the result.
     */
    private function resultGetUrl(Result $result) {
        return $result->getOptions()[CURLOPT_URL];
    }

    /**
     * Print a rejection reason.
     *
     * The url is taken from the Result attached to the rejection when there
     * is one; otherwise the url the caller requested is used.
     *
     * @param mixed  $exception   Rejection reason passed by the promise.
     * @param string $fallbackUrl Url that was requested.
     */
    private function reportFailure($exception, $fallbackUrl) {
        $url = $fallbackUrl;
        if (isset($exception->result) && $exception->result instanceof Result) {
            $url = $this->resultGetUrl($exception->result);
        }
        if (is_object($exception) && method_exists($exception, 'getMessage')) {
            $message = $exception->getMessage();
        } else {
            $message = is_scalar($exception) ? (string) $exception : 'unknown error';
        }
        echo "Error loading url ".$url.": ".$message."\n";
    }

    /**
     * Parse the main page and queue a request for every distinct topic link.
     *
     * @param Result $result Response of the front page request.
     */
    public function parseMainPage($result) {
        echo "Loaded ".$this->resultGetUrl($result)."\n";
        // The result is converted to a string to obtain the markup, as the
        // original code did implicitly.
        // NOTE(review): assumes Result::__toString() yields the response body - confirm.
        $dom = HtmlDomParser::str_get_html((string) $result);
        if (!$dom) { // the parser returns false for empty or oversized input
            echo "Could not parse main page\n";
            return;
        }
        $links = [];
        foreach ($dom->find('a[href^=item]') as $a) { // look for links starting with "item"
            $href = $a->href;
            if (isset($links[$href])) { // already queued from this page
                continue;
            }
            echo "Get {$href}\n";
            $links[$href] = 1;
            $url = 'https://news.ycombinator.com/'.$href;
            $this->curl->get($url, $this->getProxyOption())->then(
                array($this, 'parseTopicPage'), // promise resolved, parse topic page
                function ($exception) use ($url) { // promise rejected, i.e. some error occurred
                    $this->reportFailure($exception, $url);
                }
            );
        }
        $dom->clear();
    }

    /**
     * Parse a topic page and record its title and per-user link counts.
     *
     * @param Result $result Response of a topic page request.
     */
    public function parseTopicPage($result) {
        echo "Successfully loaded ".$this->resultGetUrl($result)."\n";
        $dom = HtmlDomParser::str_get_html((string) $result);
        if (!$dom) { // the parser returns false for empty or oversized input
            echo "Could not parse topic page\n";
            return;
        }
        $topic = ['title' => '', 'users' => []];
        // The title link may be absent (e.g. an error page served instead of
        // the topic), in which case the title stays empty.
        $titleLink = $dom->find('.title a', 0);
        if ($titleLink !== null) {
            $topic['title'] = $titleLink->innertext;
        }
        foreach ($dom->find('a[href^=user]') as $a) {
            $username = trim($a->innertext);
            $topic['users'][$username] = isset($topic['users'][$username]) ? $topic['users'][$username] + 1 : 1;
        }
        $this->topics[] = $topic;
        $dom->clear();
    }

    /**
     * Run the crawler: load proxies, configure the client, queue the initial
     * requests, run the event loop and print the collected topics.
     *
     * @throws \RuntimeException When no usable proxy could be loaded.
     */
    public function run() {
        $this->loadProxies(10); // load 10 proxies from GimmeProxy.com
        $loop = Factory::create();
        $this->curl = new Curl($loop);
        $this->curl->client->setMaxRequest(5); // number of parallel requests
        $this->curl->client->setSleep(2, 1.0, false); // make maximum 2 requests in 1 second
        $this->curl->client->setCurlOption([
            CURLOPT_AUTOREFERER => true,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT => 10,
            // NOTE(review): TLS verification is switched off, so a proxy can
            // read or alter the https traffic unnoticed. Kept as in the
            // original; enable both options if the results must be trusted.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 9,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HEADER => 0,
        ]);

        // Check that a proxy server is working: the service echoes the
        // address the request arrived from.
        $ipCheckUrl = 'http://icanhazip.com/';
        $this->curl->get($ipCheckUrl, $this->getProxyOption())->then(
            function ($result) {
                echo $result."\n";
            },
            function ($exception) use ($ipCheckUrl) {
                $this->reportFailure($exception, $ipCheckUrl);
            }
        );

        $mainUrl = 'https://news.ycombinator.com/';
        $this->curl->get($mainUrl, $this->getProxyOption())->then(
            array($this, 'parseMainPage'), // call $this->parseMainPage
            function ($exception) use ($mainUrl) { // without this a failed front page fetch goes unreported
                $this->reportFailure($exception, $mainUrl);
            }
        );

        $this->curl->run();
        $loop->run();
        print_r($this->topics);
    }
}
// Script entry point: build the crawler and start scraping right away.
(new Crawler())->run();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment