Skip to content

Instantly share code, notes, and snippets.

@kaja47
Created November 30, 2011 06:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaja47/1408271 to your computer and use it in GitHub Desktop.
Save kaja47/1408271 to your computer and use it in GitHub Desktop.
linkuj.cz crawler
<?php
// Input: the crawled pages are expected in the current working directory as
// "<id>.html". They were fetched beforehand with:
// for i in $(seq 1 200000); do wget -nv -O "$i.html" "http://linkuj.cz/?id=show&viewnr=4&typ=0&par=$i"; done;
error_reporting(E_ALL);
$fs = glob("*.html");
if ($fs === false) {
    // glob() returns false on error; normalise to an empty list so that
    // iterating over $fs is always safe.
    $fs = array();
}
// Debugging aid: uncomment to process only the first 1000 files.
//$fs = array_slice($fs, 0, 1000);
/**
 * Evaluates an XPath expression against a DOM document.
 *
 * A fresh DOMXPath evaluator is created on every call.
 *
 * @param DOMDocument $dom  document to search
 * @param string      $path XPath expression to evaluate
 * @return DOMNodeList|false result of DOMXPath::query() for the expression
 */
function xpath($dom, $path) {
    $evaluator = new DOMXPath($dom);
    $matches = $evaluator->query($path);
    return $matches;
}
// Parse every downloaded page, build one record per valid page and dump the
// whole collection as JSON into the file "_data".

// Buffer libxml parser diagnostics (real-world HTML is rarely well-formed)
// instead of silencing them with the @ operator. The previous setting is
// restored once all files have been processed.
$previousLibxmlSetting = libxml_use_internal_errors(true);

// Returns the text content of the first node matched by $path, or null when
// the expression matches nothing (or the query itself fails). This avoids
// reading ->textContent on the null returned by item(0) for a missing node.
$firstText = function ($dom, $path) {
    $nodes = xpath($dom, $path);
    if ($nodes === false || $nodes->length === 0) {
        return null;
    }
    return $nodes->item(0)->textContent;
};

$finalRes = array();
foreach ($fs as $f) {
    echo "file: $f\n";

    $dom = new DomDocument();
    $loaded = $dom->loadHTMLFile($f);
    // Drop the buffered parser messages so they do not pile up across files.
    libxml_clear_errors();
    if ($loaded === false)
        echo "file was not loaded properly\n";

    // Pages without the main content container are not link pages; skip them.
    if (xpath($dom, '//div[@id="contents"]')->length === 0) {
        echo "invalid file *******\n";
        continue;
    }

    // The record id is the part of the file name before the first dot.
    $idex = explode('.', $f);
    $id = $idex[0];

    // Users who linked the item: every titled anchor directly inside the
    // content container except the first anchor.
    $linkersRes = array();
    foreach (xpath($dom, '//div[@id="contents"]/a[position() > 1][@title]') as $l) {
        $linkersRes[] = array(
            'user' => $l->textContent,
            'user_url' => 'http://linkuj.cz/'.$l->getAttribute('href'),
            'title' => $l->getAttribute('title'),
        );
    }

    // Ids of similar items: the trailing digits of each "forum" link href.
    // Hrefs that do not end in digits are skipped instead of contributing a
    // null id read from an unset match group.
    $similarRes = array();
    foreach (xpath($dom, '//div[@id="contents"]/div/div/p[@class="forum"]/a/@href') as $s) {
        if (preg_match('~(\d+)$~', $s->textContent, $m) === 1) {
            $similarRes[] = $m[1];
        }
    }

    $summary = '//div[@class="news-summary"]/div';
    $submitted = $summary . '/p[@class="news-submitted"]';

    // The third text node of the "submitted" paragraph, minus its last byte.
    // NOTE(review): substr() is byte-wise; if that trailing character can be
    // multibyte this leaves invalid UTF-8 and json_encode() below will fail.
    // Confirm against real pages.
    $agoRaw = $firstText($dom, $submitted . '/text()[3]');
    $ago = $agoRaw === null ? '' : trim((string) substr($agoRaw, 0, -1));

    // Missing nodes yield null (or the bare base URL / empty string for the
    // derived fields), which matches the values the JSON had before, just
    // without the notices.
    $finalRes[] = array(
        'id' => $id,
        'permalink' => 'http://linkuj.cz/?id=show&viewnr=4&typ=0&par=' . $id,
        'time_saved'=> filemtime($f),
        'img' => $firstText($dom, $summary . '/a/@href'),
        'title' => $firstText($dom, $summary . '/h3/a'),
        'url' => $firstText($dom, $summary . '/h3/a/@href'),
        'author' => $firstText($dom, $submitted . '/a[1]'),
        'author_url'=> 'http://linkuj.cz/' . $firstText($dom, $submitted . '/a[1]/@href'),
        'ago' => $ago,
        'link_count'=> $firstText($dom, '//ul[@class="news-linkuj"]//strong'),
        'linkers' => $linkersRes,
        'similar' => $similarRes,
    );
}

libxml_use_internal_errors($previousLibxmlSetting);

// json_encode() returns false on failure (e.g. invalid UTF-8 in a value);
// writing that result would silently produce an empty output file.
$json = json_encode($finalRes);
if ($json === false) {
    echo "could not encode results as JSON (json_last_error: " . json_last_error() . ")\n";
} elseif (file_put_contents('_data', $json) === false) {
    echo "could not write output file _data\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment