Skip to content

Instantly share code, notes, and snippets.

@tobsn
Created May 29, 2011 09:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tobsn/997594 to your computer and use it in GitHub Desktop.
Save tobsn/997594 to your computer and use it in GitHub Desktop.
pastebin scraper - sorting
<?php
/**
 * Polls the pastebin.com front page for recently listed pastes and saves
 * their raw content into a local folder.
 *
 * Files are named "<id>-<Y-m-d><suffix>.txt", where the suffix is
 * "-password" or "-http" when the content looks like it contains
 * credentials, and empty otherwise.
 *
 * Typical use is to call scraper() followed by downloader() in a loop.
 */
class pastebin {
    /** Maximum number of paste ids remembered for de-duplication. */
    const MAX_SEEN = 10000;

    /** Destination folder, stored without a trailing slash. */
    private $folder = './pastebins';

    /**
     * Paste ids already queued at some point, used as a set (id => true).
     * Needed because ids leave $pastes once downloaded, so the queue alone
     * cannot tell whether a paste was handled on an earlier poll.
     */
    private $seen = array();

    /** Queue of paste ids waiting to be downloaded. */
    public $pastes = array();

    /** Number of pastes successfully written to disk so far. */
    public $count = 0;

    /**
     * @param string|false $folder Destination folder; a falsy value keeps
     *                             the default './pastebins'.
     * @throws RuntimeException When the folder does not exist and cannot be created.
     */
    public function __construct( $folder = false ) {
        if( $folder ) {
            $this->folder = $folder;
        }
        $this->folder = rtrim( $this->folder, '/' );
        // Recursive mkdir so nested destinations work. The second is_dir()
        // covers the race where something else creates the folder first.
        if( !is_dir( $this->folder )
            && !mkdir( $this->folder, 0777, true )
            && !is_dir( $this->folder ) ) {
            throw new RuntimeException( sprintf( 'Unable to create folder %s', $this->folder ) );
        }
    }

    /**
     * Drains the $pastes queue: fetches each paste's raw content, tags it
     * when it appears to contain credentials, and writes it to the folder.
     *
     * A throttled paste is put back at the end of the queue. A paste that
     * cannot be fetched at all is dropped so a dead id cannot loop forever.
     * Sleeps 1-3 seconds between requests.
     *
     * @return void
     */
    public function downloader() {
        while( count( $this->pastes ) > 0 ) {
            $paste = array_shift( $this->pastes );
            $content = file_get_contents( 'http://pastebin.com/raw.php?i='.rawurlencode( (string) $paste ) );
            $fetched = false;

            if( $content === false ) {
                printf( "Failed to fetch %s, skipping\n", $paste );
            }
            elseif( strpos( $content, 'requesting a little bit too much' ) !== false ) {
                printf( "Throttling... requeuing %s\n", $paste );
                $this->pastes[] = $paste;
                sleep( 1 );
            }
            else {
                $fetched = true;
                $type = '';
                // strpos() returns 0 for a match at the very start, so the
                // result must be compared against false explicitly.
                if( strpos( $content, 'password=' ) !== false ) {
                    printf( "Found password in %s\n", $paste );
                    $type = '-password';
                }
                elseif( preg_match( '#https?://([a-z0-9]+?):([a-z0-9]+?)@.+#i', $content ) ) {
                    printf( "Found HTTP password string in %s\n", $paste );
                    $type = '-http';
                }
                // The @ is kept from the original; presumably it silences the
                // "timezone not set" notice older PHP versions raise from date().
                $fn = sprintf( '%s/%s-%s%s.txt', $this->folder, $paste, @date( 'Y-m-d' ), $type );
                // file_put_contents() returns the byte count, which is 0 for
                // an empty paste; only false signals a failed write.
                if( file_put_contents( $fn, $content ) !== false ) {
                    $this->count++;
                }
                else {
                    printf( "Could not write %s\n", $fn );
                }
            }

            $delay = rand( 1, 3 );
            if( $fetched ) {
                printf( "Downloaded %s, waiting %d sec\n", $paste, $delay );
            }
            sleep( $delay );
        }
    }

    /**
     * Reads the pastebin.com front page and appends every not-yet-seen
     * paste id from the "right_menu" list to the $pastes queue.
     *
     * Ids come from untrusted HTML and later end up in a URL and a file
     * name, so anything that is not purely alphanumeric is ignored.
     *
     * @return void
     */
    public function scraper() {
        // Collect HTML parse warnings internally instead of hiding them with @.
        $previous = libxml_use_internal_errors( true );
        $doc = new DOMDocument();
        $doc->recover = true;
        $loaded = $doc->loadHTMLFile( 'http://www.pastebin.com/' );
        libxml_clear_errors();
        libxml_use_internal_errors( $previous );
        if( !$loaded ) {
            printf( "Could not load the pastebin front page\n" );
            return;
        }

        $xpath = new DOMXPath( $doc );
        $elements = $xpath->query( '//ul[@class="right_menu"]/li/a' );
        // query() returns false (not null) when the expression cannot be evaluated.
        if( !( $elements instanceof DOMNodeList ) ) {
            return;
        }

        foreach( $elements as $e ) {
            $href = $e->getAttribute( 'href' );
            $id = ltrim( $href, '/' );
            if( !preg_match( '/\A[A-Za-z0-9]+\z/', $id ) ) {
                continue;
            }
            // Compare the bare id, which is the form stored in the queue.
            if( isset( $this->seen[$id] ) || in_array( $id, $this->pastes, true ) ) {
                printf( "%s already seen\n", $href );
                continue;
            }
            $this->seen[$id] = true;
            $this->pastes[] = $id;
        }

        // Keep only the most recent ids so a long-running process does not
        // grow without bound. Keys are preserved because all-digit ids are
        // stored as integer keys and must not be renumbered.
        if( count( $this->seen ) > self::MAX_SEEN ) {
            $this->seen = array_slice( $this->seen, -self::MAX_SEEN, null, true );
        }
    }
}
// Entry point: keep polling the front page and draining the download queue
// forever, pausing a random 6-12 seconds between rounds.
$bin = new pastebin();
for( ;; ) {
    $bin->scraper();
    $bin->downloader();

    printf( "%s downloaded so far\n", $bin->count );

    $pause = rand( 6, 12 );
    sleep( $pause );
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment