jaggedsoft/composer.json

## composer.json
{
    "require": {
        "khr/react-curl": "~2.0",
        "sunra/php-simple-html-dom-parser": "~1.5"
    }
}

## crawler.php
<?php
require_once __DIR__ . '/vendor/autoload.php';
use \React\EventLoop\Factory;
use KHR\React\Curl\Curl;
use KHR\React\Curl\Result;
use KHR\React\Curl\Exception;
use Sunra\PhpSimple\HtmlDomParser;

// Script-wide runtime defaults: multibyte string functions operate on UTF-8
// and date/time functions use the Asia/Jakarta timezone.
mb_internal_encoding('UTF-8');
date_default_timezone_set('Asia/Jakarta');
/**
 * Crawls the Hacker News front page through randomly chosen proxies and
 * records, for every linked topic page, its title and how many times each
 * user link appears on that page.
 */
class Crawler {
	private $proxies = [], $topics = [], $curl = null;

	/**
	 * Load proxies from gimmeproxy.com.
	 *
	 * Performs $num API calls. A call that fails, or whose JSON payload has
	 * no usable "curl" entry, is reported and skipped, so the pool may end
	 * up with fewer than $num proxies (possibly none).
	 *
	 * @param int $num Number of proxies to request
	 */
	private function loadProxies($num) {
		echo "Load {$num} proxies\n";
		for($i = 0; $i < $num; $i++) {
			$response = file_get_contents('http://gimmeproxy.com/api/get/test/?timeout=0');
			if($response === false) {
				echo "Failed to fetch proxy from gimmeproxy.com\n";
				continue;
			}
			$data = json_decode($response, true);
			if(!is_array($data) || empty($data['curl'])) {
				echo "Unexpected response from gimmeproxy.com\n";
				continue;
			}
			$this->proxies[] = $data['curl'];
		}
	}

	/**
	 * Build the per-request cURL options selecting a random proxy.
	 *
	 * @return array [CURLOPT_PROXY => proxy], or an empty array when the
	 *               pool is empty (array_rand() rejects empty arrays)
	 */
	private function getProxyOption() {
		if(empty($this->proxies)) {
			return [];
		}
		$key = array_rand($this->proxies);
		return [CURLOPT_PROXY => $this->proxies[$key]];
	}

	/**
	 * Get url from result.
	 *
	 * @param Result $result Finished request
	 * @return string Value of the CURLOPT_URL option, or '' when it is not set
	 */
	private function resultGetUrl(Result $result) {
		$options = $result->getOptions();
		return isset($options[CURLOPT_URL]) ? $options[CURLOPT_URL] : '';
	}

	/**
	 * Print a message for a rejected request.
	 *
	 * @param \Exception $exception Rejection reason; its "result" property is
	 *                              used for the url when it holds a Result
	 */
	private function reportFailure($exception) {
		$url = 'unknown url';
		if(isset($exception->result) && $exception->result instanceof Result) {
			$url = $this->resultGetUrl($exception->result);
		}
		echo "Error loading url ".$url.": ".$exception->getMessage()."\n";
	}

	/**
	 * Parse main page: queue a request for every distinct "item" link.
	 *
	 * @param Result $result Loaded front page
	 */
	public function parseMainPage($result) {
		echo "Loaded ".$this->resultGetUrl($result)."\n";
		// str_get_html() returns false when it cannot build a DOM
		$dom = HtmlDomParser::str_get_html((string) $result);
		if(!$dom) {
			echo "Unable to parse main page\n";
			return;
		}

		$onError = function($exception) { // promise rejected, i.e. some error occurred
			$this->reportFailure($exception);
		};
		$links = [];
		foreach($dom->find('a[href^=item]') as $a) { // look for links starting with "item"
			$href = $a->href;
			if(isset($links[$href])) { // link already queued, skip it
				continue;
			}
			echo "Get {$href}\n";
			$links[$href] = 1;

			//Get topic page
			$this->curl->get('https://news.ycombinator.com/'.$href, $this->getProxyOption())->then(
				array($this, 'parseTopicPage'), // promise resolved, parse topic page
				$onError
			);
		}
		$dom->clear();
	}

	/**
	 * Parse topic page: store its title and per-user link counts in $topics.
	 *
	 * @param Result $result Loaded topic page
	 */
	public function parseTopicPage($result) {
		echo "Successfully loaded ".$this->resultGetUrl($result)."\n";
		$dom = HtmlDomParser::str_get_html((string) $result);
		if(!$dom) {
			echo "Unable to parse topic page\n";
			return;
		}

		$topic = ['title' => '', 'users' => []];
		// find() with an index yields null when nothing matches
		$titleNode = $dom->find('.title a', 0);
		if($titleNode) {
			$topic['title'] = $titleNode->innertext;
		}
		foreach($dom->find('a[href^=user]') as $a) {
			$username = trim($a->innertext);
			if(!isset($topic['users'][$username])) {
				$topic['users'][$username] = 0;
			}
			$topic['users'][$username]++;
		}
		$this->topics[] = $topic;
		$dom->clear();
	}

	/**
	 * Run crawler: load proxies, configure the client, crawl, print topics.
	 */
	public function run() {
		$this->loadProxies(10); // load 10 proxies from GimmeProxy.com
		if(empty($this->proxies)) {
			echo "No proxies available, requests will be sent directly\n";
		}
		$loop = Factory::create();
		$this->curl = new Curl($loop);

		$this->curl->client->setMaxRequest(5); // number of parallel requests
		$this->curl->client->setSleep(2, 1.0, false); // make maximum 2 requests in 1 second
		// NOTE(review): TLS peer/host verification is switched off below;
		// confirm this is intended before using the crawler on sensitive data.
		$this->curl->client->setCurlOption([
			CURLOPT_AUTOREFERER => true,
			CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
			CURLOPT_CONNECTTIMEOUT => 10,
			CURLOPT_TIMEOUT => 10,
			CURLOPT_SSL_VERIFYPEER => false,
			CURLOPT_SSL_VERIFYHOST => false,
			CURLOPT_FOLLOWLOCATION => true,
			CURLOPT_MAXREDIRS => 9,
			CURLOPT_RETURNTRANSFER => TRUE,
			CURLOPT_HEADER => 0,
		]);

		// Report failed requests instead of dropping them silently
		$onError = function($exception) {
			$this->reportFailure($exception);
		};

		$this->curl->get('http://icanhazip.com/', $this->getProxyOption())->then(
			function($result) { //check that proxy server is working
				echo $result."\n";
			},
			$onError
		);

		$this->curl->get('https://news.ycombinator.com/', $this->getProxyOption())->then(
			array($this, 'parseMainPage'), // call $this->parseMainPage
			$onError
		);
		$this->curl->run();
		$loop->run();
		print_r($this->topics);
	}

}

// Script entry point: build the crawler and start crawling immediately.
(new Crawler())->run();
	{
		"require": {
			"khr/react-curl": "~2.0",
			"sunra/php-simple-html-dom-parser": "~1.5"
		}
	}
	<?php
	require_once __DIR__ . '/vendor/autoload.php';
	use \React\EventLoop\Factory;
	use KHR\React\Curl\Curl;
	use KHR\React\Curl\Result;
	use KHR\React\Curl\Exception;
	use Sunra\PhpSimple\HtmlDomParser;

	// Script-wide runtime defaults: multibyte string functions operate on UTF-8
	// and date/time functions use the Asia/Jakarta timezone.
	mb_internal_encoding('UTF-8');
	date_default_timezone_set('Asia/Jakarta');
	/**
	 * Crawls the Hacker News front page through randomly chosen proxies and
	 * records, for every linked topic page, its title and how many times each
	 * user link appears on that page.
	 */
	class Crawler {
		private $proxies = [], $topics = [], $curl = null;

		/**
		 * Load proxies from gimmeproxy.com.
		 *
		 * Performs $num API calls. A call that fails, or whose JSON payload has
		 * no usable "curl" entry, is reported and skipped, so the pool may end
		 * up with fewer than $num proxies (possibly none).
		 *
		 * @param int $num Number of proxies to request
		 */
		private function loadProxies($num) {
			echo "Load {$num} proxies\n";
			for($i = 0; $i < $num; $i++) {
				$response = file_get_contents('http://gimmeproxy.com/api/get/test/?timeout=0');
				if($response === false) {
					echo "Failed to fetch proxy from gimmeproxy.com\n";
					continue;
				}
				$data = json_decode($response, true);
				if(!is_array($data) || empty($data['curl'])) {
					echo "Unexpected response from gimmeproxy.com\n";
					continue;
				}
				$this->proxies[] = $data['curl'];
			}
		}

		/**
		 * Build the per-request cURL options selecting a random proxy.
		 *
		 * @return array [CURLOPT_PROXY => proxy], or an empty array when the
		 *               pool is empty (array_rand() rejects empty arrays)
		 */
		private function getProxyOption() {
			if(empty($this->proxies)) {
				return [];
			}
			$key = array_rand($this->proxies);
			return [CURLOPT_PROXY => $this->proxies[$key]];
		}

		/**
		 * Get url from result.
		 *
		 * @param Result $result Finished request
		 * @return string Value of the CURLOPT_URL option, or '' when it is not set
		 */
		private function resultGetUrl(Result $result) {
			$options = $result->getOptions();
			return isset($options[CURLOPT_URL]) ? $options[CURLOPT_URL] : '';
		}

		/**
		 * Print a message for a rejected request.
		 *
		 * @param \Exception $exception Rejection reason; its "result" property is
		 *                              used for the url when it holds a Result
		 */
		private function reportFailure($exception) {
			$url = 'unknown url';
			if(isset($exception->result) && $exception->result instanceof Result) {
				$url = $this->resultGetUrl($exception->result);
			}
			echo "Error loading url ".$url.": ".$exception->getMessage()."\n";
		}

		/**
		 * Parse main page: queue a request for every distinct "item" link.
		 *
		 * @param Result $result Loaded front page
		 */
		public function parseMainPage($result) {
			echo "Loaded ".$this->resultGetUrl($result)."\n";
			// str_get_html() returns false when it cannot build a DOM
			$dom = HtmlDomParser::str_get_html((string) $result);
			if(!$dom) {
				echo "Unable to parse main page\n";
				return;
			}

			$onError = function($exception) { // promise rejected, i.e. some error occurred
				$this->reportFailure($exception);
			};
			$links = [];
			foreach($dom->find('a[href^=item]') as $a) { // look for links starting with "item"
				$href = $a->href;
				if(isset($links[$href])) { // link already queued, skip it
					continue;
				}
				echo "Get {$href}\n";
				$links[$href] = 1;

				//Get topic page
				$this->curl->get('https://news.ycombinator.com/'.$href, $this->getProxyOption())->then(
					array($this, 'parseTopicPage'), // promise resolved, parse topic page
					$onError
				);
			}
			$dom->clear();
		}

		/**
		 * Parse topic page: store its title and per-user link counts in $topics.
		 *
		 * @param Result $result Loaded topic page
		 */
		public function parseTopicPage($result) {
			echo "Successfully loaded ".$this->resultGetUrl($result)."\n";
			$dom = HtmlDomParser::str_get_html((string) $result);
			if(!$dom) {
				echo "Unable to parse topic page\n";
				return;
			}

			$topic = ['title' => '', 'users' => []];
			// find() with an index yields null when nothing matches
			$titleNode = $dom->find('.title a', 0);
			if($titleNode) {
				$topic['title'] = $titleNode->innertext;
			}
			foreach($dom->find('a[href^=user]') as $a) {
				$username = trim($a->innertext);
				if(!isset($topic['users'][$username])) {
					$topic['users'][$username] = 0;
				}
				$topic['users'][$username]++;
			}
			$this->topics[] = $topic;
			$dom->clear();
		}

		/**
		 * Run crawler: load proxies, configure the client, crawl, print topics.
		 */
		public function run() {
			$this->loadProxies(10); // load 10 proxies from GimmeProxy.com
			if(empty($this->proxies)) {
				echo "No proxies available, requests will be sent directly\n";
			}
			$loop = Factory::create();
			$this->curl = new Curl($loop);

			$this->curl->client->setMaxRequest(5); // number of parallel requests
			$this->curl->client->setSleep(2, 1.0, false); // make maximum 2 requests in 1 second
			// NOTE(review): TLS peer/host verification is switched off below;
			// confirm this is intended before using the crawler on sensitive data.
			$this->curl->client->setCurlOption([
				CURLOPT_AUTOREFERER => true,
				CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
				CURLOPT_CONNECTTIMEOUT => 10,
				CURLOPT_TIMEOUT => 10,
				CURLOPT_SSL_VERIFYPEER => false,
				CURLOPT_SSL_VERIFYHOST => false,
				CURLOPT_FOLLOWLOCATION => true,
				CURLOPT_MAXREDIRS => 9,
				CURLOPT_RETURNTRANSFER => TRUE,
				CURLOPT_HEADER => 0,
			]);

			// Report failed requests instead of dropping them silently
			$onError = function($exception) {
				$this->reportFailure($exception);
			};

			$this->curl->get('http://icanhazip.com/', $this->getProxyOption())->then(
				function($result) { //check that proxy server is working
					echo $result."\n";
				},
				$onError
			);

			$this->curl->get('https://news.ycombinator.com/', $this->getProxyOption())->then(
				array($this, 'parseMainPage'), // call $this->parseMainPage
				$onError
			);
			$this->curl->run();
			$loop->run();
			print_r($this->topics);
		}

	}

	// Script entry point: build the crawler and start crawling immediately.
	(new Crawler())->run();