Skip to content

Instantly share code, notes, and snippets.

@scrapewww
Created November 3, 2017 16:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save scrapewww/ebc9c39a6159a4718a591f9dd26f628d to your computer and use it in GitHub Desktop.
Save scrapewww/ebc9c39a6159a4718a591f9dd26f628d to your computer and use it in GitHub Desktop.
<?php
namespace App\Console\Commands;
use Illuminate\Http\Request;
use Illuminate\Console\Command;
use Sunra\PhpSimple\HtmlDomParser;
use Illuminate\Support\Facades\Cache;
use App\Website;
use App\Mp3;
use App\ScrapeUrl;
class Crawl extends Command
{
/**
* The name and signature of the console command.
*
* The required {often} argument is read in handle() and compared with each
* website's `frequency` column to decide which sites are crawled in this run.
*
* @var string
*/
protected $signature = 'mp3foo:crawl {often}';
/**
* The console command description.
*
* NOTE(review): this looks like leftover placeholder text; consider replacing
* it with a real summary of what the crawl does.
*
* @var string
*/
protected $description = 'Command description';
/**
 * Build the command.
 *
 * Nothing is configured here; construction is delegated entirely to the
 * parent command class.
 *
 * @return void
 */
public function __construct()
{
    parent::__construct();
}
/**
 * Execute the console command.
 *
 * Steps, in order:
 *  1. Delete ScrapeUrl rows that are no longer pending and whose delete_at has passed.
 *  2. Load every enabled website whose frequency equals the {often} argument.
 *  3. For each website: stamp last_crawl, fetch its URL via processUrl(), and
 *     store every MP3 link found as a pending Mp3 row.
 *  4. When the website's depth is greater than 1, walk the page links: links
 *     ending in ".mp3" are stored as pending Mp3 rows; other same-host links
 *     are queued as ScrapeUrl rows unless they were queued recently (tracked
 *     through the cache) and are not whitelisted.
 *
 * @param  Request $request Injected by the container; not used by the crawl itself.
 * @return void
 */
public function handle(Request $request)
{
    ScrapeUrl::where('status', '!=', 'pending')
        ->where('delete_at', '<=', \Carbon\Carbon::now())
        ->delete();

    $websites = Website::where('status', 'enabled')
        ->where('frequency', $this->argument('often'))
        ->get();

    // A link containing any of these anywhere in the URL is never queued as a page.
    $ignoredExtensions = ['.m4a', '.gif', '.jpg', '.zip', '.mp3'];

    foreach ($websites as $website) {
        $website->last_crawl = date('Y-m-d H:i:s');
        $website->save();

        $base = $website->url;
        $baseParts = parse_url($base);
        $process = $this->processUrl($base);

        if (!empty($process['mp3s'])) {
            foreach ($process['mp3s'] as $mp3) {
                $this->storePendingMp3($mp3, $base, $process['description']);
            }
        }

        if (!($website->depth > 1) || empty($process['urls'])) {
            continue;
        }

        // in_array() needs a real array; a null/string attribute would raise a
        // TypeError on PHP 8. NOTE(review): assumes the model casts `whitelist`
        // to an array of URL strings - confirm against the Website model.
        $whitelist = is_array($website->whitelist) ? $website->whitelist : [];

        foreach ($process['urls'] as $pageurl) {
            if ($this->endswith($pageurl, '.mp3')) {
                $this->storePendingMp3($pageurl, $base, $process['description']);
                continue;
            }

            if ($pageurl === $base) {
                continue;
            }

            foreach ($ignoredExtensions as $extension) {
                if (strpos($pageurl, $extension) !== false) {
                    continue 2;
                }
            }

            // Only follow links that stay on the website's own host.
            $pageParts = parse_url($pageurl);
            if (!isset($baseParts['host'], $pageParts['host'])
                || $baseParts['host'] !== $pageParts['host']
            ) {
                continue;
            }

            // The cache entry marks "queued recently"; whitelisted URLs bypass it.
            $cacheKey = 'mp3foo_crawl_'.md5($pageurl);
            if (Cache::has($cacheKey) && !in_array($pageurl, $whitelist, true)) {
                continue;
            }

            // One absolute expiry drives both the cache marker and delete_at.
            // A bare integer TTL means minutes before Laravel 5.8 but seconds
            // from 5.8 on, which would let the two drift apart.
            $expiresAt = \Carbon\Carbon::now()->addMinutes($website->cache);
            Cache::put($cacheKey, 1, $expiresAt);

            try {
                ScrapeUrl::insert([
                    'website_id' => $website->id,
                    'depth' => 1,
                    'url' => $pageurl,
                    'delete_at' => $expiresAt->toDateTimeString(),
                ]);
            } catch (\Exception $e) {
                // Best effort: a failed insert must not stop the crawl, but it
                // is surfaced when the command runs with -v.
                $this->warn('Could not queue '.$pageurl.': '.$e->getMessage(), 'v');
            }
        }
    }
}

/**
 * Store one MP3 link as a pending Mp3 row.
 *
 * Failures are tolerated so that a single rejected row does not abort the
 * crawl; they are reported only in verbose (-v) mode.
 *
 * @param  string $url         Address of the MP3 file.
 * @param  string $source      URL of the website the link was found on.
 * @param  string $description Description extracted from the source page.
 * @return void
 */
private function storePendingMp3($url, $source, $description)
{
    try {
        $mp3 = new Mp3;
        $mp3->status = 'pending';
        $mp3->url = $url;
        $mp3->source = $source;
        $mp3->source_description = $description;
        $mp3->save();
    } catch (\Exception $e) {
        $this->warn('Could not store '.$url.': '.$e->getMessage(), 'v');
    }
}
/**
 * Collapse each run of two or more slashes into a single slash.
 *
 * A run that directly follows a colon is left alone, so the "//" after a
 * scheme such as "http:" is preserved.
 *
 * @param  string $input Text (typically a URL) to normalise.
 * @return string|null   The normalised text, or null if the regex engine fails.
 */
function to_single_slashes($input) {
    $pattern = '~(^|[^:])//+~';
    // Put back the character captured before the run, followed by one slash.
    $replacement = '\\1/';

    return preg_replace($pattern, $replacement, $input);
}
/**
 * Download the body of a URL with cURL.
 *
 * Redirects are not followed and no HTTP status check is made; whatever body
 * the server returns is handed back as-is.
 *
 * @param  string $url Address to fetch.
 * @return string|false Response body, or false when the cURL handle cannot be
 *                      created or the transfer fails or times out.
 */
private function get_data($url) {
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    $connectTimeout = 5;
    // The connect timeout only covers the handshake. Without an overall limit a
    // server that accepts the connection and then stalls would block the whole
    // crawl indefinitely.
    $transferTimeout = 30;

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $transferTimeout);

    $data = curl_exec($ch);
    curl_close($ch);

    return $data;
}
/**
 * Tell whether a string ends with a given suffix (case-sensitive, byte-wise).
 *
 * @param  string $string Text to inspect.
 * @param  string $test   Suffix to look for.
 * @return bool           True when $string ends with $test.
 */
function endswith($string, $test) {
    $haystackLength = strlen($string);
    $needleLength = strlen($test);

    if ($needleLength <= $haystackLength) {
        // Compare only the tail of the haystack that is as long as the suffix.
        $tailOffset = $haystackLength - $needleLength;

        return 0 === substr_compare($string, $test, $tailOffset, $needleLength);
    }

    // A suffix longer than the string can never match.
    return false;
}
/**
 * Fetch a page and extract what the crawler needs from it.
 *
 * The result always has the same four keys:
 *  - 'mp3s'        absolute http(s) URLs ending in ".mp3" found anywhere in the markup
 *  - 'urls'        anchor targets that resolve to the same host as $url
 *  - 'description' content of the description meta tag, falling back to og:description
 *  - 'image'       content of the og:image meta tag
 * When the page cannot be fetched or parsed, every entry is empty.
 *
 * @param  string $url Page to fetch.
 * @return array       Array with keys 'mp3s', 'urls', 'description' and 'image'.
 */
private function processUrl($url)
{
    $re = '/((https?):((\/\/)|(\\\\\\\\))+[\pL\pM\w\d :#@%\/;$()~_?\+-=\\\\\.&]*\.mp3)/';

    $data = $this->get_data($url);
    $dom = HtmlDomParser::str_get_html($data);

    // The parser yields false for a failed or empty download. Calling find() on
    // that would be a fatal error and abort the crawl for every remaining website.
    if (!is_object($dom)) {
        return ['mp3s' => [], 'urls' => [], 'description' => '', 'image' => ''];
    }

    // Give protocol-relative links a scheme so that both the regex below and
    // parse_url() see a complete URL.
    foreach ($dom->find('a') as $a) {
        if (substr($a->href, 0, 2) == '//') {
            $a->href = 'http:'.$a->href;
        }
    }

    // Match against the serialised markup (including the rewritten hrefs).
    preg_match_all($re, (string) $dom, $matches, PREG_SET_ORDER, 0);
    $mp3s = [];
    foreach ($matches as $match) {
        $mp3s[] = $this->to_single_slashes($match[0]);
    }

    $urls = [];
    $pageParts = parse_url($url);
    $pageHost = isset($pageParts['host']) ? $pageParts['host'] : null;
    // Without a host on the page URL no link can be classed as same-host.
    if ($pageHost !== null) {
        foreach ($dom->find('a') as $a) {
            $absolute = $this->santizeUrl($a->href, $url);
            $linkParts = parse_url($absolute);
            if ($absolute !== $pageHost
                && isset($linkParts['host'])
                && $linkParts['host'] === $pageHost
            ) {
                $urls[] = $this->to_single_slashes($absolute);
            }
        }
    }

    // Read the content attribute of the first element matching a selector, or ''.
    $metaContent = function ($selector) use ($dom) {
        $node = $dom->find($selector, 0);

        return ($node && isset($node->attr['content'])) ? $node->attr['content'] : '';
    };

    $description = $metaContent('meta[name=description]');
    if (!$description) {
        $description = $metaContent('meta[property="og:description"]');
    }
    $image = $metaContent('meta[property="og:image"]');

    return [
        'mp3s' => array_unique($mp3s),
        'urls' => array_unique($urls),
        'description' => $description,
        'image' => $image,
    ];
}
/**
 * Turn a link found on a page into an absolute URL.
 *
 * - An empty link resolves to the page itself.
 * - A link that already carries a scheme is returned untouched.
 * - "//host/path" inherits the page's scheme.
 * - "/path" is resolved against the page's origin (scheme, credentials, host, port).
 * - "?query" and "#fragment" are attached to the page's own path.
 * - Any other link is resolved against the directory of the page's path,
 *   with "." and ".." segments collapsed.
 *
 * When $base is not an absolute URL (no scheme or host) the link is simply
 * appended to it, as there is nothing reliable to resolve against.
 *
 * @param  string $url  Link as written in the page (may be empty or false).
 * @param  string $base URL of the page the link was found on.
 * @return string       Absolute URL for the link.
 */
private function santizeUrl($url, $base)
{
    if (!$url) {
        return $base;
    }
    if (parse_url($url, PHP_URL_SCHEME) != '') {
        return $url;
    }

    $parts = parse_url($base);
    if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
        $attachDirectly = $url[0] == '/' || $url[0] == '#' || $url[0] == '?';

        return $attachDirectly ? $base.$url : $base.'/'.$url;
    }

    if (substr($url, 0, 2) === '//') {
        return $parts['scheme'].':'.$url;
    }

    $origin = $parts['scheme'].'://';
    if (isset($parts['user'])) {
        $origin .= $parts['user'];
        if (isset($parts['pass'])) {
            $origin .= ':'.$parts['pass'];
        }
        $origin .= '@';
    }
    $origin .= $parts['host'];
    if (isset($parts['port'])) {
        $origin .= ':'.$parts['port'];
    }

    $basePath = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';

    // Separate the path of the link from its ?query/#fragment tail; only the
    // path takes part in dot-segment resolution.
    $cut = strcspn($url, '?#');
    $linkPath = (string) substr($url, 0, $cut);
    $tail = (string) substr($url, $cut);

    if ($linkPath === '') {
        // Query-only or fragment-only link: stays on the page's own path. A
        // fragment-only link also keeps the page's query string.
        if ($tail[0] === '#' && isset($parts['query'])) {
            $tail = '?'.$parts['query'].$tail;
        }

        return $origin.$basePath.$tail;
    }

    if ($linkPath[0] === '/') {
        $path = $linkPath;
    } else {
        // Resolve against the directory of the page: drop everything after
        // the last slash of its path.
        $path = substr($basePath, 0, strrpos($basePath, '/') + 1).$linkPath;
    }

    $segments = explode('/', (string) substr($path, 1));
    $lastIndex = count($segments) - 1;
    $resolved = [];
    foreach ($segments as $index => $segment) {
        if ($segment === '..') {
            array_pop($resolved);
        } elseif ($segment !== '.') {
            $resolved[] = $segment;
            continue;
        }
        // A trailing "." or ".." names a directory, so keep the closing slash.
        if ($index === $lastIndex) {
            $resolved[] = '';
        }
    }

    return $origin.'/'.implode('/', $resolved).$tail;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment