Skip to content

Instantly share code, notes, and snippets.

@kaja47
Created March 12, 2014 01:06
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaja47/9498572 to your computer and use it in GitHub Desktop.
Save kaja47/9498572 to your computer and use it in GitHub Desktop.
Example crawler using Matcher and AsyncCurl
<?php
use Atrox\Matcher;
use Atrox\Curl;
use Atrox\Async;
// Matcher for the paginated user overview: one record per <tr> found anywhere
// inside the table with class "ui-table-list". It is later invoked on the body
// of a fetched page.
$userListMatcher = Matcher::multi(
    '//table[@class="ui-table-list"]//tr',
    (object) [
        // The link in the row is site-relative, so the host is prepended.
        'url' => Matcher::single('td/a/@href')->map(function ($href) {
            return 'http://www.csfd.cz' . $href;
        }),
        // Counters are read from fixed column positions of the row.
        'points'   => Matcher::single('td[3]')->asInt(),
        'ratings'  => Matcher::single('td[4]')->asInt(),
        'comments' => Matcher::single('td[5]')->asInt(),
        'films'    => Matcher::single('td[10]')->asInt(),
    ]
)->fromHtml();
// Matcher for a single page of one user's ratings: one record per <tr> under
// the <tbody> of the table with class "ui-table-list".
$ratingMatcher = Matcher::multi(
    '//table[@class="ui-table-list"]/tbody//tr',
    (object) [
        // Site-relative link from the first cell, turned into an absolute URL.
        'url' => Matcher::single('td[1]/a/@href')->map(function ($href) {
            return 'http://www.csfd.cz' . $href;
        }),
        // XPath union: the rating is either the alt text of an image or the
        // content of a <strong> element in the second cell.
        'rating' => 'td[2]/img/@alt | td[2]/strong',
        'date'   => 'td[3]',
    ]
)->fromHtml();
// Shared promise-based HTTP client used by fetch() and driven by $curl->loop()
// at the end of the script.
$curl = Curl::promises()->configure([
    // Give up on a single request after 30 seconds.
    CURLOPT_TIMEOUT   => 30,
    // User-Agent header sent with every request.
    CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
]);
/**
 * Starts an asynchronous flow that downloads $url with the shared $curl
 * client, retrying whenever the request fails.
 *
 * The flow yields the response as its final value and then ends the
 * generator; callers in this script obtain it with `yield fetch(...)` and read
 * its `body` property.
 *
 * @param string        $url           Address to download.
 * @param callable|null $checkFunction Optional validator called with the
 *                                     response; a falsy result makes the
 *                                     attempt count as failed and the request
 *                                     is retried. With null (the default)
 *                                     every response is accepted.
 * @param int|null      $maxTries      Maximum number of attempts. Null (the
 *                                     default) retries without limit, which is
 *                                     the behaviour existing callers rely on.
 *
 * @return mixed Whatever Async::flow() returns for the wrapped generator.
 *
 * @throws Exception        From inside the flow: the last request error once
 *                          $maxTries attempts have failed.
 * @throws RuntimeException From inside the flow: when $checkFunction rejected
 *                          the response on the last allowed attempt.
 */
function fetch($url, $checkFunction = null, $maxTries = null) {
    global $curl;

    return Async::flow(function () use ($url, $checkFunction, $maxTries, $curl) {
        $try = 0;

        while (true) {
            $try++;
            $isLastTry = ($maxTries !== null && $try >= $maxTries);

            try {
                $resp = (yield $curl->get($url));
            } catch (Exception $e) {
                // Request failed: retry unless the attempt budget is used up.
                if ($isLastTry) {
                    throw $e;
                }
                continue;
            }

            // Previously the validator was accepted but silently ignored.
            if ($checkFunction !== null && !$checkFunction($resp)) {
                if ($isLastTry) {
                    throw new RuntimeException("Response check failed for $url after $try attempt(s)");
                }
                continue;
            }

            yield $resp;
            return;
        }
    });
}
// Crawl driver: walks overview pages 1..3000, and for every user found on a
// page downloads all of that user's rating pages, then appends the user as one
// JSON line to csfd-users.data.
// The first argument (100) is presumably the number of flows allowed to run at
// once -- confirm against Async::concurrently().
Async::concurrently(100, function () use ($userListMatcher, $ratingMatcher) {
    foreach (range(1, 3000) as $pageNo) {
        yield Async::flow(function () use ($pageNo, $userListMatcher, $ratingMatcher) {
            $userListPage = (yield fetch("http://www.csfd.cz/uzivatele/prehled/strana-$pageNo/"));
            $userList = $userListMatcher($userListPage->body);

            foreach ($userList as $user) {
                $ratings = [];

                // At this point $user->ratings is the rating *count* taken from
                // the overview table; the script assumes 100 ratings per page.
                $maxPage = (int) ceil($user->ratings / 100);

                // A counted loop instead of range(1, $maxPage): for a user with
                // no ratings range(1, 0) would return [1, 0] and request the
                // pages "strana-1" and "strana-0". Here the loop simply does
                // not run.
                for ($ratingPageNo = 1; $ratingPageNo <= $maxPage; $ratingPageNo++) {
                    $url = $user->url . 'hodnoceni/strana-' . $ratingPageNo;
                    $ratingPage = (yield fetch($url));
                    $rs = $ratingMatcher($ratingPage->body);
                    $ratings = array_merge($ratings, $rs);
                }

                // Replace the count with the collected rating records before
                // the user is serialized.
                $user->ratings = $ratings;

                // Append under an exclusive lock so lines written by
                // different flows do not interleave.
                file_put_contents('csfd-users.data', json_encode($user)."\n", LOCK_EX | FILE_APPEND);
            }
        });
    }
});

// Run the curl event loop; the flows registered above make progress from here.
$curl->loop();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment