Skip to content

Instantly share code, notes, and snippets.

@scrapewww
Created November 3, 2017 16:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save scrapewww/2f580c85df78c57bb53aa4f612f24f67 to your computer and use it in GitHub Desktop.
Save scrapewww/2f580c85df78c57bb53aa4f612f24f67 to your computer and use it in GitHub Desktop.
<?php
namespace App\Console\Commands;
// NOTE(review): presumably read by the Sunra\PhpSimple HTML parser as the
// largest document it will parse — confirm against that library.
// Guarded so that loading this file after the constant already exists does not
// raise a "constant already defined" notice/warning; as before, an earlier
// definition wins.
if (!defined('MAX_FILE_SIZE')) {
    define('MAX_FILE_SIZE', 6000000);
}
use Illuminate\Http\Request;
use Illuminate\Console\Command;
use Sunra\PhpSimple\HtmlDomParser;
use Illuminate\Support\Facades\Cache;
use App\Website;
use App\Mp3;
use App\ScrapeUrl;
use App\ScrapeDirectory;
class DirectorySpider extends Command
{
/**
* The name and signature of the console command.
*
* @var string
*/
protected $signature = 'mp3foo:directory';
/**
* The console command description.
*
* @var string
*/
protected $description = 'Command description';
/**
* Create a new command instance.
*
* Performs no setup of its own; it only delegates to the parent
* Command constructor.
*
* @return void
*/
public function __construct()
{
parent::__construct();
}
/**
 * Execute the console command.
 *
 * Takes the first ScrapeDirectory row whose status is 'pending', marks it
 * 'processing', downloads its URL and walks every <a> element of the page:
 * hrefs ending in ".mp3" are stored as pending Mp3 rows and hrefs ending in
 * "/" are stored as pending ScrapeDirectory rows. The row is then marked
 * 'complete' and the command pauses for one second. When no row is pending
 * the command sleeps for 60 seconds instead.
 *
 * @param Request $request Unused; kept so the method signature is unchanged.
 * @return void
 */
public function handle(Request $request)
{
    $sd = ScrapeDirectory::where('status', 'pending')->first();
    if (!$sd) {
        // Nothing queued: back off before the command is run again.
        sleep(60);
        return;
    }

    $sd->status = 'processing';
    $sd->save();

    $url = str_replace(' ', '%20', $sd->url);
    $data = $this->get_data($url);

    // get_data() hands back curl_exec()'s result, which is false on failure,
    // and the parse result is checked as well: calling find() on a non-object
    // raises an \Error that catch (\Exception) would not intercept, leaving
    // the row stuck in 'processing'.
    $dom = $data ? HtmlDomParser::str_get_html($data) : false;

    if ($dom) {
        try {
            foreach ($dom->find('a') as $a) {
                $this->queueLink($a->href, $url, $sd->base_url);
            }
        } catch (\Exception $e) {
            // Best effort: a page that cannot be walked is still marked
            // 'complete' below so it is not picked up again.
        }
    }

    $sd->status = 'complete';
    $sd->save();
    sleep(1);
}

/**
 * Store one link found on a directory page, if it is an mp3 or a sub-directory.
 *
 * Resolution rules:
 *  - absolute http(s):// hrefs are used as they are;
 *  - hrefs starting with "/" are appended to $baseUrl and skipped when no
 *    base URL is known;
 *  - every other href is appended to the page URL.
 *
 * @param mixed  $href    The anchor's href value as returned by the parser.
 * @param string $url     URL of the page the link was found on.
 * @param mixed  $baseUrl Base URL of the site, or a falsy value when unknown.
 * @return void
 */
private function queueLink($href, $url, $baseUrl)
{
    $isMp3 = $this->endsWith2($href, '.mp3');
    if (!$isMp3 && !$this->endsWith2($href, '/')) {
        return;
    }

    if (preg_match('~^https?://~i', $href)) {
        // Match a real scheme prefix rather than the first four characters,
        // so a relative name such as "httpdocs/" is still treated as relative.
        $target = $href;
    } elseif (!$this->startsWith2($href, '/')) {
        $target = $url.$href;
    } elseif ($baseUrl) {
        $target = $baseUrl.$href;
    } else {
        return;
    }
    $target = $this->to_single_slashes(urldecode($target));

    try {
        if ($isMp3) {
            $insert = new Mp3;
            $insert->status = 'pending';
            $insert->url = $target;
            $insert->source = $url;
        } else {
            $insert = new ScrapeDirectory;
            $insert->status = 'pending';
            $insert->url = $target;
            if ($baseUrl) {
                $insert->base_url = $baseUrl;
            }
        }
        $insert->save();
    } catch (\Exception $e) {
        // Deliberately ignored so one failed insert does not stop the crawl.
        // NOTE(review): presumably this is how already-known URLs are
        // skipped (unique index on url) — confirm against the schema.
    }
}
/**
 * Collapse runs of two or more slashes into a single slash.
 *
 * A run is only collapsed when the character in front of it is not a colon
 * (or when it sits at the very start of the string), so the "//" that follows
 * a scheme such as "http:" survives.
 *
 * @param string $input Text, typically a URL, to normalise.
 * @return string|null The result of preg_replace().
 */
function to_single_slashes($input) {
    $slashRun = '~(^|[^:])//+~';
    $keepPrefixPlusOneSlash = '\\1/';

    $collapsed = preg_replace($slashRun, $keepPrefixPlusOneSlash, $input);

    return $collapsed;
}
/**
 * Download a URL with cURL and return the response body.
 *
 * A total transfer timeout is set in addition to the connect timeout;
 * without it a server that accepts the connection and then stalls would
 * block the command indefinitely.
 *
 * @param string $url     URL to fetch.
 * @param int    $timeout Seconds allowed for establishing the connection.
 * @param int    $maxTime Seconds allowed for the whole transfer.
 * @return string|false The response body, or false when the request fails.
 */
private function get_data($url, $timeout = 5, $maxTime = 60) {
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    curl_setopt_array($ch, [
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => $timeout,
        CURLOPT_TIMEOUT        => $maxTime,
    ]);

    $data = curl_exec($ch);
    curl_close($ch);

    return $data;
}
/**
 * Download a page and extract what the spider needs from it.
 *
 * The raw body is searched with a regular expression for http(s) URLs ending
 * in ".mp3". The parsed document supplies the links that point at the same
 * host as $url, the page description (meta description, falling back to
 * og:description) and the og:image URL.
 *
 * @param string $url   Page to download.
 * @param string $bbase Base URL that non-absolute hrefs are appended to.
 * @return array Keys 'mp3s' and 'urls' (de-duplicated arrays of strings),
 *               'description' and 'image' (strings, '' when absent).
 */
private function processUrl($url,$bbase)
{
    $re = '/((https?):((\/\/)|(\\\\\\\\))+[\pL\pM\w\d :#@%\/;$()~_?\+-=\\\\\.&]*\.mp3)/';

    $data = $this->get_data( $url );
    if( !is_string( $data ) ) {
        // A failed download yields false; treat it as an empty page.
        $data = '';
    }

    $mp3s = [];
    preg_match_all($re, $data, $matches, PREG_SET_ORDER, 0);
    foreach( $matches as $match )
    {
        // De-duplicate on the normalised form, which is what gets stored.
        $mp3 = $this->to_single_slashes( $match[0] );
        if( !in_array( $mp3, $mp3s, true ) ) {
            $mp3s[] = $mp3;
        }
    }

    $urls = [];
    $description = '';
    $image = '';

    // Guard the parse result: find() on a non-object would be a fatal error.
    $dom = $data !== '' ? HtmlDomParser::str_get_html( $data ) : false;
    if( $dom )
    {
        $parse_url = parse_url( $url );
        $host = is_array( $parse_url ) && isset( $parse_url['host'] ) ? $parse_url['host'] : null;

        foreach( $dom->find('a') as $a )
        {
            $url_output = $this->santizeUrl( $a->href, $bbase );
            $linkHost = parse_url( $url_output, PHP_URL_HOST );
            // Keep only links whose host is the host of the page itself.
            if( $host !== null && $linkHost === $host && $url_output !== $host )
            {
                $urls[] = $this->to_single_slashes( $url_output );
            }
        }

        // The same selector is used for the existence test and for the read.
        $description = $this->metaContent( $dom, 'meta[name=description]' );
        if( !$description )
        {
            $description = $this->metaContent( $dom, 'meta[property="og:description"]' );
        }
        $image = $this->metaContent( $dom, 'meta[property="og:image"]' );
    }

    return ['mp3s'=>array_unique($mp3s),'urls'=>array_unique($urls),'description'=>$description,'image'=>$image];
}

/**
 * Return the "content" attribute of the first element matching a selector.
 *
 * @param object $dom      Parsed document exposing find().
 * @param string $selector Selector of the meta element to look up.
 * @return string The attribute value, or '' when the element or the attribute is missing.
 */
private function metaContent($dom, $selector)
{
    $node = $dom->find( $selector, 0 );

    return $node && isset( $node->attr['content'] ) ? $node->attr['content'] : '';
}
/**
 * Tell whether $haystack ends with $needle.
 *
 * An empty needle always matches; a needle longer than the haystack never does.
 *
 * @param string $haystack Text to inspect.
 * @param string $needle   Suffix to look for.
 * @return bool
 */
function endsWith2($haystack, $needle) {
    if ($needle === "") {
        return true;
    }

    $suffixLength = strlen($needle);
    if (strlen($haystack) < $suffixLength) {
        return false;
    }

    // Compare the tail of the haystack with the needle directly.
    return substr($haystack, -$suffixLength) === $needle;
}
/**
 * Tell whether $haystack begins with $needle.
 *
 * An empty needle always matches; a needle longer than the haystack never does.
 *
 * @param string $haystack Text to inspect.
 * @param string $needle   Prefix to look for.
 * @return bool
 */
function startsWith2($haystack, $needle) {
    if ($needle === "") {
        return true;
    }

    // Compare only the first strlen($needle) bytes of both strings.
    $prefixLength = strlen($needle);

    return strncmp($haystack, $needle, $prefixLength) === 0;
}
/**
 * Turn an href into a full URL by prefixing it with $base where needed.
 *
 * - an empty href yields $base itself;
 * - an href that already carries a scheme is returned unchanged;
 * - an href starting with "/", "#" or "?" is appended to $base directly;
 * - anything else is appended to $base after a "/".
 *
 * The method name keeps its original spelling because callers reference it.
 *
 * @param mixed  $url  The href to resolve (null/false are treated as empty).
 * @param string $base Prefix for hrefs that carry no scheme.
 * @return string
 */
private function santizeUrl($url, $base)
{
    // Cast first so that only a truly empty href falls back to the base;
    // a plain truthiness test would also discard the valid href "0".
    $url = (string) $url;
    if ($url === '') {
        return $base;
    }

    if (parse_url($url, PHP_URL_SCHEME)) {
        return $url;
    }

    $first = $url[0];
    if ($first === '/' || $first === '#' || $first === '?') {
        return $base.$url;
    }

    return $base.'/'.$url;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment