Skip to content

Instantly share code, notes, and snippets.

@krakjoe
Created March 5, 2014 12:13
Show Gist options
  • Star 21 You must be signed in to star a gist
  • Fork 9 You must be signed in to fork a gist
  • Save krakjoe/b1526fcc828621e840cb to your computer and use it in GitHub Desktop.
Save krakjoe/b1526fcc828621e840cb to your computer and use it in GitHub Desktop.
scraping in php
<?php
/**
* Scraping the bejesus out of php.net with pthreads
*
* The following _example_ code shows how you might use pthreads to construct threaded scraping software.
* It uses very simple DOM/XPath queries to scrape php.net pages for function descriptions.
*
* WARNING: DO NOT HAMMER THE CRAP OUT OF PHP.NET !! YOU WILL UPSET A LOT OF PEOPLE, INCLUDING ME !!
*/
/**
 * Worker thread onto which scraping jobs are stacked.
 *
 * The worker does nothing of its own when it starts; run() is the place where
 * per-thread resources (for example a database connection) could be set up.
 */
class WebWorker extends Worker
{
    /**
     * Intentionally empty: this example needs no per-thread setup.
     */
    public function run()
    {
    }
}
/**
 * One XPath expression together with the text of the first node it matched.
 */
class WebPath extends Stackable
{
    /** XPath expression evaluated by execute(). */
    protected $path;

    /** Text content of the first matching node; null until something matches. */
    protected $found;

    /**
     * @param string $path XPath expression to evaluate against a document
     */
    public function __construct($path)
    {
        $this->path = $path;
        $this->found = null;
    }

    /**
     * Evaluates the stored expression against $dom, records the text content of
     * the first matching node and dumps this object.
     *
     * @param DOMDocument $dom document to query
     */
    public function execute(DOMDocument $dom)
    {
        $finder = new DOMXPath($dom);
        $matches = $finder->query($this->path);

        // Only the first match is recorded, so leave the loop after one pass.
        foreach ($matches as $node) {
            $this->found = (string) $node->textContent;
            break;
        }

        // Dump the object so the example shows what was scraped.
        var_dump($this);
    }

    /**
     * Intentionally empty: in this file the object is driven through execute().
     */
    public function run()
    {
    }
}
/**
 * Keyed collection of WebPath objects built from a map of XPath expressions.
 */
class WebPaths extends Stackable
{
    /**
     * Creates one WebPath per expression and stores it under the same key.
     *
     * @param array $paths   map of identifier => XPath expression
     * @param array $objects caller-owned list, passed by reference; every
     *                       WebPath created here is appended to it
     */
    public function __construct($paths, &$objects)
    {
        foreach ($paths as $id => $expression) {
            $entry = new WebPath($expression);

            // Hand the new object to the caller's list first, then store it
            // on this collection under its identifier.
            $objects[] = $entry;
            $this[$id] = $entry;
        }
    }

    /**
     * Runs every contained path against the given document.
     *
     * @param DOMDocument $dom document to query
     */
    public function execute(DOMDocument $dom)
    {
        foreach ($this as $entry) {
            $entry->execute($dom);
        }
    }

    /**
     * Intentionally empty: in this file the object is driven through execute().
     */
    public function run()
    {
    }
}
/**
 * Job that downloads one URL, parses it as HTML and runs a set of XPath
 * queries against the resulting document.
 */
class WebScraper extends Stackable
{
    /** Address of the page to download. */
    protected $url;

    /** WebPaths collection executed against the downloaded page. */
    protected $paths;

    /** Not referenced anywhere in this class; kept so existing subclasses keep working. */
    protected $pattern;

    /**
     * @param string   $url   address of the page to download
     * @param WebPaths $paths queries to run against the downloaded page
     */
    public function __construct($url, WebPaths $paths)
    {
        $this->url = $url;
        $this->paths = $paths;
    }

    /**
     * Downloads the page and, when it could be fetched and parsed, executes
     * the configured paths against it. Failures are skipped silently so that
     * one bad page does not stop the remaining jobs.
     */
    public function run()
    {
        $data = file_get_contents($this->url);
        if (!$data) {
            // Download failed or the body was empty: nothing to parse.
            return;
        }

        // Real-world markup makes libxml raise a warning for every tag it does
        // not know; collect those internally instead of emitting them, and put
        // the previous setting back afterwards.
        $previous = libxml_use_internal_errors(true);

        $dom = new DOMDocument();
        $loaded = $dom->loadHTML($data);

        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($loaded) {
            $this->paths->execute($dom);
        }
    }
}
/**
 * Minimal pool that spreads scraping jobs over a bounded number of workers.
 *
 * Workers are created lazily: a slot is picked at random for each job and a
 * worker is started for that slot the first time it is chosen.
 */
class WebPool
{
    /** Upper bound on the number of workers; always at least 1. */
    protected $max;

    /** Started workers, indexed by slot number (0 .. $max - 1). */
    protected $workers = [];

    /**
     * @param int $max maximum number of workers; values below 1 are treated as 1
     */
    public function __construct($max)
    {
        $this->max = max(1, (int) $max);
    }

    /**
     * Stacks the job onto a randomly chosen worker, starting that worker first
     * if its slot has not been used yet.
     *
     * @param WebScraper $scraper job to execute
     *
     * @return mixed whatever the worker's stack() call returns
     */
    public function submit(WebScraper $scraper)
    {
        // rand() includes both bounds, so the upper bound must be $max - 1 to
        // end up with at most $max distinct workers.
        $slot = rand(0, $this->max - 1);

        if (!isset($this->workers[$slot])) {
            $this->workers[$slot] = new WebWorker();
            $this->workers[$slot]->start();
        }

        return $this->workers[$slot]->stack($scraper);
    }

    /**
     * Shuts down every worker that was started. Safe to call on a pool that
     * never received a job.
     */
    public function shutdown()
    {
        foreach ($this->workers as $worker) {
            $worker->shutdown();
        }
    }
}
// Every function of the "standard" extension gets its manual page scraped.
$functions = get_extension_funcs("standard");

// Map of result name => XPath expression evaluated against each page.
$paths = [
    "description" => "//p[@class='para rdfs-comment']"
];

$pool = new WebPool(8);

// $work collects the submitted scrapers. $objects collects the WebPaths
// collections and the WebPath objects they create, so this context holds a
// reference to each of them.
$work = [];
$objects = [];

// get_extension_funcs() returns false for an unknown extension; iterating an
// empty list in that case also avoids submitting a job with an empty name.
foreach ($functions ?: [] as $function) {
    // The WebPaths constructor appends its WebPath objects to $objects, so the
    // collection itself has to be appended too. Writing it to a computed index
    // would overwrite one of the entries the constructor just added.
    $collection = new WebPaths($paths, $objects);
    $objects[] = $collection;

    $scraper = new WebScraper(
        sprintf("http://us2.php.net/%s", $function),
        $collection
    );
    $work[] = $scraper;

    $pool->submit($scraper);
}

$pool->shutdown();
/*
In the real world, you would now start to iterate over $work, finding completed jobs
in order to store their results in a database and/or remove the finished tasks from
the $work array to keep memory usage in check.
Storing results in a database could also be done in the worker threads ...
*/
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment