Skip to content

Instantly share code, notes, and snippets.

@davidreuss
Created September 13, 2010 17:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save davidreuss/577732 to your computer and use it in GitHub Desktop.
Save davidreuss/577732 to your computer and use it in GitHub Desktop.
scrape-fu
<?php
require 'phpQuery/phpQuery.php';
// Scrapes the fyrrum.dk portal page and prints one line per listed topic:
//   "<last post time> / <topic title> / <last poster>"
//
// $prefix is the page that gets fetched; $baseUrl is the directory that page
// lives in, which is what relative topic links must be resolved against.
$prefix = 'http://fyrrum.dk/portal.php';
$baseUrl = substr($prefix, 0, strrpos($prefix, '/'));

// file_get_contents() returns false on failure; bail out with a clear message
// instead of handing a boolean to phpQuery.
$html = file_get_contents($prefix);
if ($html === false) {
    file_put_contents('php://stderr', "Unable to fetch {$prefix}\n");
    exit(1);
}

$doc = phpQuery::newDocumentHTML($html);
$captures = array();

foreach ($doc['#page-body table dl.icon'] as $node) {
    $row = pq($node);

    // Rows without a topic link are not topic entries; skip them.
    $href = $row->find('a.topictitle:first');
    if ($href->count() === 0) {
        continue;
    }

    $capture = array();
    $capture['topic'] = $href->text();

    // Resolve the link against the portal's directory. Previously the link was
    // appended to the page URL itself, producing ".../portal.php/viewtopic.php".
    $url = (string) $href->attr('href');
    if ($url !== '' && !preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url)) {
        if (strncmp($url, './', 2) === 0) {
            $url = substr($url, 2);
        }
        // NOTE: root-relative links ("/foo") are resolved against $baseUrl,
        // which is only exact because the portal sits at the site root.
        $url = $baseUrl . '/' . ltrim($url, '/');
    }
    $capture['url'] = $url;

    $last = $row->find('dd.lastpost');
    $capture['last_post_by'] = $last->find('a:first')->text();

    // The timestamp is read from the second line of the span's text; guard
    // against a span that has no second line.
    $lines = explode("\n", $last->find('span')->text());
    $capture['last_post_at'] = isset($lines[1]) ? trim($lines[1]) : '';

    $captures[] = $capture;
}

foreach ($captures as $c) {
    printf("%s / %s / %s\n", $c['last_post_at'], $c['topic'], $c['last_post_by']);
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment