Web scraping with ReactPHP, cURL (curl_multi) and proxies: a parallel-request example that scrapes Hacker News (news.ycombinator.com). Explanation here: http://256cats.com/fast-scraping-with-reactphp-curl-proxies/
{
    "require": {
        "khr/react-curl": "~2.0",
        "sunra/php-simple-html-dom-parser": "~1.5"
    }
}
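A minimal sketch of the khr/react-curl flow the crawler below relies on: create a ReactPHP event loop, wrap it in a Curl client, issue get() requests that return promises, then run the client and the loop. The snippet only uses calls that appear in the full script; icanhazip.com is the same connectivity check used there.

<?php
require_once __DIR__ . '/vendor/autoload.php';

use React\EventLoop\Factory;
use KHR\React\Curl\Curl;

$loop = Factory::create();
$curl = new Curl($loop);

// get() returns a promise; the Result passed to the callback prints its body when cast to string
$curl->get('http://icanhazip.com/', [])->then(function ($result) { // empty array = no extra cURL options (the full script passes a proxy option here)
    echo $result . "\n";
});

$curl->run(); // start the queued requests
$loop->run(); // run the event loop until everything finishes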
<?php
require_once __DIR__ . '/vendor/autoload.php';

use React\EventLoop\Factory;
use KHR\React\Curl\Curl;
use KHR\React\Curl\Result;
use Sunra\PhpSimple\HtmlDomParser;

date_default_timezone_set('Asia/Jakarta');
mb_internal_encoding("UTF-8");
class Crawler {

    private $proxies = [], $topics = [], $curl = null;

    /** Load proxies from gimmeproxy.com */
    private function loadProxies($num) {
        echo "Loading {$num} proxies\n";
        for ($i = 0; $i < $num; $i++) {
            $data = json_decode(file_get_contents('http://gimmeproxy.com/api/get/test/?timeout=0'), true);
            $this->proxies[] = $data['curl']; // the "curl" field is a value ready to be passed to CURLOPT_PROXY
        }
    }

    /** Pick a random proxy and return it as a cURL option */
    private function getProxyOption() {
        $key = array_rand($this->proxies);
        return [CURLOPT_PROXY => $this->proxies[$key]];
    }

    /** Get the requested URL from a Result */
    private function resultGetUrl(Result $result) {
        return $result->getOptions()[CURLOPT_URL];
    }
    /** Parse the main page and queue every topic page for crawling */
    public function parseMainPage($result) {
        echo "Loaded " . $this->resultGetUrl($result) . "\n";
        $links = [];
        $dom = HtmlDomParser::str_get_html($result);
        foreach ($dom->find('a[href^=item]') as $a) { // topic links start with "item"
            $href = $a->href;
            if (!isset($links[$href])) { // skip links already queued from this page
                echo "Get {$href}\n";
                $links[$href] = 1;
                // queue the topic page; get() returns a promise
                $this->curl->get('https://news.ycombinator.com/' . $href, $this->getProxyOption())->then(
                    array($this, 'parseTopicPage'), // promise resolved: parse the topic page
                    function ($exception) { // promise rejected, i.e. some error occurred
                        echo "Error loading url " . $this->resultGetUrl($exception->result) . ": " . $exception->getMessage() . "\n";
                    }
                );
            }
        }
        $dom->clear();
    }
    /** Parse a topic page: collect the title and count appearances of each user */
    public function parseTopicPage($result) {
        echo "Successfully loaded " . $this->resultGetUrl($result) . "\n";
        $topic = ['title' => '', 'users' => []];
        $dom = HtmlDomParser::str_get_html($result);
        $topic['title'] = $dom->find('.title a', 0)->innertext;
        foreach ($dom->find('a[href^=user]') as $a) { // user profile links start with "user"
            $username = trim($a->innertext);
            $topic['users'][$username] = isset($topic['users'][$username]) ? $topic['users'][$username] + 1 : 1;
        }
        $this->topics[] = $topic;
        $dom->clear();
    }
    /** Run the crawler */
    public function run() {
        $this->loadProxies(10); // load 10 proxies from GimmeProxy.com

        $loop = Factory::create();
        $this->curl = new Curl($loop);
        $this->curl->client->setMaxRequest(5); // number of parallel requests
        $this->curl->client->setSleep(2, 1.0, false); // make at most 2 requests per second
        $this->curl->client->setCurlOption([
            CURLOPT_AUTOREFERER => true,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            CURLOPT_CONNECTTIMEOUT => 10,
            CURLOPT_TIMEOUT => 10,
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => 0,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => 9,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HEADER => false,
        ]);

        // check that the proxy server is working
        $this->curl->get('http://icanhazip.com/', $this->getProxyOption())->then(function ($result) {
            echo $result . "\n";
        });

        // load the main page; parseMainPage() queues the topic pages
        $this->curl->get('https://news.ycombinator.com/', $this->getProxyOption())->then(
            array($this, 'parseMainPage')
        );

        $this->curl->run();
        $loop->run();

        print_r($this->topics);
    }
}

$crawler = new Crawler();
$crawler->run();