/scrape.php Secret
Created
March 5, 2014 12:13
scraping in php
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Scraping the bjesus out of php.net with pthreads | |
* | |
* The following _example_ code shows how you might use pthreads to construct threaded scraping software | |
* It uses very simple DOM/XPath to scrape php.net pages for function descriptions. | |
* | |
* WARNING: DO NOT HAMMER THE CRAP OUT OF PHP.NET !! YOU WILL UPSET A LOT OF PEOPLE, INCLUDING ME !! | |
*/ | |
/**
 * Worker thread onto which scraping tasks are stacked by WebPool.
 *
 * It carries no state and performs no work of its own.
 */
class WebWorker extends Worker
{
    /**
     * Deliberately empty.
     */
    public function run()
    {
        /* maybe connect to a database or whatever here */
    }
}
/**
 * A single XPath expression together with the first text it matched.
 */
class WebPath extends Stackable
{
    /** XPath expression to evaluate against a document. */
    protected $path;

    /** Text content of the first matching node, or null when nothing matched. */
    protected $found;

    /**
     * @param string $path XPath expression to evaluate later in execute()
     */
    public function __construct($path)
    {
        $this->path = $path;
        $this->found = null;
    }

    /**
     * Evaluates the stored expression against $dom, records the text content
     * of the first matching node in $found, then dumps this object.
     *
     * @param DOMDocument $dom parsed document to search
     */
    public function execute(DOMDocument $dom)
    {
        $finder = new DOMXPath($dom);
        $matches = $finder->query($this->path);

        // Only the first match is wanted, so leave the loop immediately.
        foreach ($matches as $node) {
            $this->found = (string) $node->textContent;
            break;
        }

        var_dump($this);
    }

    /**
     * Empty: the work is done in execute().
     */
    public function run()
    {
    }
}
/**
 * A keyed collection of WebPath objects that are evaluated together
 * against one document.
 */
class WebPaths extends Stackable
{
    /**
     * Creates one WebPath per entry of $paths, stores it on this object under
     * the same key, and also appends it to the caller's $objects array.
     *
     * @param array $paths    map of identifier => XPath expression
     * @param array &$objects caller-owned array that receives every WebPath created
     *                        (presumably to keep them referenced by the caller - confirm)
     */
    public function __construct($paths, &$objects)
    {
        foreach ($paths as $id => $expression) {
            $entry = new WebPath($expression);
            $objects[] = $entry;
            $this[$id] = $entry;
        }
    }

    /**
     * Runs every contained WebPath against $dom.
     *
     * @param DOMDocument $dom parsed document to search
     */
    public function execute(DOMDocument $dom)
    {
        foreach ($this as $entry) {
            $entry->execute($dom);
        }
    }

    /**
     * Empty: the work is done in execute().
     */
    public function run()
    {
    }
}
/**
 * Task that downloads one URL, parses it as HTML and runs a set of
 * XPath expressions (WebPaths) against the resulting document.
 */
class WebScraper extends Stackable
{
    /** Address of the page to download. */
    protected $url;

    /** WebPaths collection executed against the downloaded page. */
    protected $paths;

    /** Not used by this class; retained so existing subclasses keep working. */
    protected $pattern;

    /**
     * @param string   $url   address of the page to download
     * @param WebPaths $paths expressions to evaluate against the page
     */
    public function __construct($url, WebPaths $paths)
    {
        $this->url = $url;
        $this->paths = $paths;
    }

    /**
     * Downloads the page and, if it could be fetched and parsed, evaluates
     * the stored paths against it. A failed or empty download is skipped.
     */
    public function run()
    {
        $data = file_get_contents($this->url);
        if ($data === false || $data === '') {
            return;
        }

        // Real-world HTML is rarely valid for libxml; collect the parse
        // errors internally rather than emitting one warning per problem.
        $previous = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $loaded = $dom->loadHTML($data);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($loaded) {
            $this->paths->execute($dom);
        }
    }
}
/**
 * Minimal pool that spreads scraping tasks over at most $max lazily
 * started WebWorker threads, choosing a worker slot at random.
 */
class WebPool
{
    /** Upper bound on the number of workers. */
    protected $max;

    /** Started workers, indexed by slot number (0 .. $max - 1). */
    protected $workers = [];

    /**
     * @param int $max maximum number of workers to start
     */
    public function __construct($max)
    {
        $this->max = $max;
    }

    /**
     * Stacks $scraper onto a randomly chosen worker, starting that worker
     * first if its slot has not been used yet.
     *
     * @param WebScraper $scraper task to run
     * @return mixed whatever the worker's stack() call returns
     */
    public function submit(WebScraper $scraper)
    {
        // mt_rand() is inclusive on both ends, so the top slot is $max - 1.
        // Values below 1 are treated as a single-worker pool.
        $slot = mt_rand(0, max(1, (int) $this->max) - 1);

        if (!isset($this->workers[$slot])) {
            $worker = new WebWorker();
            $worker->start();
            $this->workers[$slot] = $worker;
        }

        return $this->workers[$slot]->stack($scraper);
    }

    /**
     * Shuts down every worker that was started; a no-op on an unused pool.
     */
    public function shutdown()
    {
        foreach ($this->workers as $worker) {
            $worker->shutdown();
        }
    }
}
// Names of all functions in the "standard" extension; each one is looked up
// as a page on the target site.
$functions = get_extension_funcs("standard");

// XPath expressions to evaluate on every page, keyed by a label.
$paths = [
    "description" => "//p[@class='para rdfs-comment']"
];

$pool = new WebPool(8);
$work = [];     // every WebScraper submitted, indexed in submission order
$objects = [];  // every WebPath and WebPaths created, kept referenced here

// get_extension_funcs() returns false for an unknown extension; treat that
// as "nothing to do" instead of scraping the bare site root once.
foreach ($functions ?: [] as $function) {
    // The WebPaths constructor appends its WebPath objects to $objects, so
    // the collection itself must be appended too, not written to a fixed
    // index, which would overwrite one of the entries just added.
    $webPaths = new WebPaths($paths, $objects);
    $objects[] = $webPaths;

    $scraper = new WebScraper(
        sprintf("http://us2.php.net/%s", $function),
        $webPaths
    );
    $work[] = $scraper;

    $pool->submit($scraper);
}

$pool->shutdown();
/*
In the real world, you would now iterate over $work to find completed jobs,
store their results in a database, and/or remove finished tasks from the $work array
to keep memory usage in check.
Storing results in a database could also be done in the worker threads ...
*/
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment