scrapewww/Crawler

## Crawler
<?php

namespace App\Console\Commands;

use Illuminate\Http\Request;
use Illuminate\Console\Command;
use Sunra\PhpSimple\HtmlDomParser;
use Illuminate\Support\Facades\Cache;
use App\Website;
use App\Mp3;
use App\ScrapeUrl;

/**
 * Artisan command that crawls every enabled website whose frequency matches
 * the {often} argument, stores the .mp3 links found on each site's start page
 * and, for sites with depth > 1, queues same-host pages in ScrapeUrl.
 */
class Crawl extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'mp3foo:crawl {often}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Command description';

    /**
     * Create a new command instance.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * Deletes ScrapeUrl rows that are not 'pending' and whose delete_at has
     * passed, then processes each matching website.
     *
     * @param  Request $request Not used by this method; kept so the signature stays unchanged.
     * @return void
     */
    public function handle(Request $request)
    {
        ScrapeUrl::where('status', '!=', 'pending')
            ->where('delete_at', '<=', \Carbon\Carbon::now())
            ->delete();

        $websites = Website::where('status', 'enabled')
            ->where('frequency', $this->argument('often'))
            ->get();

        foreach ($websites as $website) {
            $website->last_crawl = date('Y-m-d H:i:s');
            $website->save();

            $base = $website->url;
            $process = $this->processUrl($base);

            foreach ($process['mp3s'] as $mp3) {
                $this->storeMp3($mp3, $base, $process['description']);
            }

            if ($website->depth > 1 && !empty($process['urls'])) {
                $this->queuePages($website, $process);
            }
        }
    }

    /**
     * Insert one pending Mp3 row for a discovered file.
     *
     * @param  string $url         Address of the .mp3 file.
     * @param  string $source      URL of the website the file was found on.
     * @param  string $description Description text scraped from the source page.
     * @return void
     */
    private function storeMp3($url, $source, $description)
    {
        try {
            $insert = new Mp3;
            $insert->status = 'pending';
            $insert->url = $url;
            $insert->source = $source;
            $insert->source_description = $description;
            $insert->save();
        } catch (\Exception $exception) {
            // Deliberately best effort: one failed insert must not abort the crawl.
            // NOTE(review): presumably this mostly absorbs duplicate-URL rejections - confirm against the schema.
        }
    }

    /**
     * Walk the links found on a website's start page: direct .mp3 links are
     * stored, other same-host pages are queued in ScrapeUrl.
     *
     * @param  Website $website Website being crawled (reads url, whitelist, cache, id).
     * @param  array   $process Result of processUrl() for the website's url.
     * @return void
     */
    private function queuePages($website, array $process)
    {
        $base = $website->url;
        $baseHost = parse_url($base, PHP_URL_HOST);
        // Cast so a null/unset whitelist cannot break in_array().
        $whitelist = (array) $website->whitelist;
        $skipped = ['.m4a', '.gif', '.jpg', '.zip', '.mp3'];

        foreach ($process['urls'] as $pageurl) {
            if ($this->endswith($pageurl, '.mp3')) {
                $this->storeMp3($pageurl, $base, $process['description']);
                continue;
            }

            // Skip media/archive links (matched anywhere in the URL) and the start page itself.
            $isSkipped = false;
            foreach ($skipped as $extension) {
                if (strpos($pageurl, $extension) !== false) {
                    $isSkipped = true;
                    break;
                }
            }
            if ($isSkipped || $pageurl == $base) {
                continue;
            }

            // A cached URL was queued recently; only whitelisted URLs bypass the cache.
            $cache = 'mp3foo_crawl_'.md5($pageurl);
            if (Cache::has($cache) && !in_array($pageurl, $whitelist, true)) {
                continue;
            }

            $pageHost = parse_url($pageurl, PHP_URL_HOST);
            if (!is_string($baseHost) || !is_string($pageHost) || $baseHost != $pageHost) {
                continue;
            }

            Cache::put($cache, 1, $website->cache);
            try {
                ScrapeUrl::insert(
                    [
                        'website_id' => $website->id,
                        'depth' => 1,
                        'url' => $pageurl,
                        'delete_at' => \Carbon\Carbon::now()->addMinutes($website->cache)->toDateTimeString()
                    ]
                );
            } catch (\Exception $e) {
                // Best effort, same as storeMp3(): a failed insert is skipped, the crawl continues.
            }
        }
    }

    /**
     * Collapse runs of slashes into one, leaving the "//" after a scheme colon intact.
     *
     * @param  string $input
     * @return string
     */
    public function to_single_slashes($input)
    {
        return preg_replace('~(^|[^:])//+~', '\\1/', $input);
    }

    /**
     * Fetch a URL with cURL and return the response body.
     *
     * @param  string $url
     * @return string Response body, or '' when the transfer failed.
     */
    private function get_data($url)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        // The connect timeout only bounds the handshake; cap the whole transfer
        // too so a stalled server cannot hang the command.
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $data = curl_exec($ch);
        curl_close($ch);

        return $data === false ? '' : $data;
    }

    /**
     * Case-sensitive test for whether $string ends with $test.
     *
     * @param  string $string
     * @param  string $test
     * @return bool
     */
    public function endswith($string, $test)
    {
        $strlen = strlen($string);
        $testlen = strlen($test);
        if ($testlen > $strlen) {
            return false;
        }

        return substr_compare($string, $test, $strlen - $testlen, $testlen) === 0;
    }

    /**
     * Download a page and extract its .mp3 links, same-host links and meta data.
     *
     * @param  string $url Page to fetch.
     * @return array Keys: 'mp3s' (list of URLs), 'urls' (list of same-host URLs),
     *               'description' (string), 'image' (string). All empty when the
     *               page could not be fetched or parsed.
     */
    private function processUrl($url)
    {
        // NOTE(review): inside the character class "\+-=" is a range ('+' through '='),
        // and the class contains a space, so one greedy match can span several URLs.
        // Pattern left unchanged to keep matching behaviour stable.
        $re = '/((https?):((\/\/)|(\\\\\\\\))+[\pL\pM\w\d :#@%\/;$()~_?\+-=\\\\\.&]*\.mp3)/';

        $data = $this->get_data($url);
        $dom = HtmlDomParser::str_get_html($data);
        if (!$dom) {
            // str_get_html() yields false for empty or unparseable input.
            return ['mp3s' => [], 'urls' => [], 'description' => '', 'image' => ''];
        }

        // Give protocol-relative links the scheme of the page they were found on
        // ('http' when the page URL has none). Done on the DOM so the regex scan
        // below sees the absolute form as well.
        $scheme = parse_url($url, PHP_URL_SCHEME);
        if (!is_string($scheme) || $scheme === '') {
            $scheme = 'http';
        }
        $anchors = $dom->find('a');
        foreach ($anchors as $a) {
            if (substr($a->href, 0, 2) == '//') {
                $a->href = $scheme.':'.$a->href;
            }
        }

        $mp3s = [];
        if (preg_match_all($re, (string) $dom, $matches, PREG_SET_ORDER, 0)) {
            foreach ($matches as $match) {
                $mp3s[] = $this->to_single_slashes($match[0]);
            }
        }

        $urls = [];
        $baseHost = parse_url($url, PHP_URL_HOST);
        foreach ($anchors as $a) {
            $candidate = $this->santizeUrl($a->href, $url);
            $candidateHost = parse_url($candidate, PHP_URL_HOST);
            if (is_string($baseHost) && is_string($candidateHost) && $baseHost == $candidateHost) {
                $urls[] = $this->to_single_slashes($candidate);
            }
        }

        $description = $this->metaContent($dom, 'meta[name=description]');
        if (!$description) {
            $description = $this->metaContent($dom, 'meta[property="og:description"]');
        }
        $image = $this->metaContent($dom, 'meta[property="og:image"]');

        return [
            'mp3s' => array_values(array_unique($mp3s)),
            'urls' => array_values(array_unique($urls)),
            'description' => $description,
            'image' => $image,
        ];
    }

    /**
     * Read the content attribute of the first element matching a selector.
     *
     * @param  object $dom      Parsed document returned by HtmlDomParser.
     * @param  string $selector Selector passed to find().
     * @return string Attribute value, or '' when the element or attribute is missing.
     */
    private function metaContent($dom, $selector)
    {
        $node = $dom->find($selector, 0);

        return ($node && isset($node->attr['content'])) ? $node->attr['content'] : '';
    }

    /**
     * Turn an href found on a page into an absolute URL.
     *
     * @param  string $url  Href value as found in the document (may be relative or empty).
     * @param  string $base URL of the page the href was found on.
     * @return string Absolute URL; $base itself when $url is empty.
     */
    private function santizeUrl($url, $base)
    {
        $url = trim((string) $url);
        if ($url === '') {
            return $base;
        }
        // Already absolute (any scheme): leave untouched.
        if (parse_url($url, PHP_URL_SCHEME) != '') {
            return $url;
        }

        $parts = parse_url($base);
        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
            // Base cannot be decomposed: fall back to plain concatenation.
            $glue = ($url[0] == '/' || $url[0] == '#' || $url[0] == '?') ? '' : '/';

            return $base.$glue.$url;
        }

        // Protocol-relative link: inherit the base scheme.
        if (substr($url, 0, 2) == '//') {
            return $parts['scheme'].':'.$url;
        }
        // Fragment or query only: append to the page URL.
        if ($url[0] == '#' || $url[0] == '?') {
            return $base.$url;
        }

        $origin = $parts['scheme'].'://'.$parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':'.$parts['port'];
        }

        // Keep query/fragment out of the dot-segment clean-up below.
        $cut = strcspn($url, '?#');
        $relativePath = substr($url, 0, $cut);
        $suffix = (string) substr($url, $cut);

        if ($relativePath[0] == '/') {
            // Root-relative: resolve against the origin, not the base path.
            $directory = '';
        } else {
            // Document-relative: resolve against the directory of the base path.
            $directory = isset($parts['path']) ? preg_replace('#/[^/]*$#', '', $parts['path']) : '';
        }

        // Repeatedly collapse "//", "/./" and "/segment/../" until nothing changes.
        $path = $directory.'/'.$relativePath;
        $dotSegments = ['#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#'];
        do {
            $path = preg_replace($dotSegments, '/', $path, -1, $replaced);
        } while ($replaced > 0);

        return $origin.$path.$suffix;
    }
}
	<?php

	namespace App\Console\Commands;

	use Illuminate\Http\Request;
	use Illuminate\Console\Command;
	use Sunra\PhpSimple\HtmlDomParser;
	use Illuminate\Support\Facades\Cache;
	use App\Website;
	use App\Mp3;
	use App\ScrapeUrl;

	/**
	 * Artisan command that crawls every enabled website whose frequency matches
	 * the {often} argument, stores the .mp3 links found on each site's start page
	 * and, for sites with depth > 1, queues same-host pages in ScrapeUrl.
	 */
	class Crawl extends Command
	{
	    /**
	     * The name and signature of the console command.
	     *
	     * @var string
	     */
	    protected $signature = 'mp3foo:crawl {often}';

	    /**
	     * The console command description.
	     *
	     * @var string
	     */
	    protected $description = 'Command description';

	    /**
	     * Create a new command instance.
	     *
	     * @return void
	     */
	    public function __construct()
	    {
	        parent::__construct();
	    }

	    /**
	     * Execute the console command.
	     *
	     * Deletes ScrapeUrl rows that are not 'pending' and whose delete_at has
	     * passed, then processes each matching website.
	     *
	     * @param  Request $request Not used by this method; kept so the signature stays unchanged.
	     * @return void
	     */
	    public function handle(Request $request)
	    {
	        ScrapeUrl::where('status', '!=', 'pending')
	            ->where('delete_at', '<=', \Carbon\Carbon::now())
	            ->delete();

	        $websites = Website::where('status', 'enabled')
	            ->where('frequency', $this->argument('often'))
	            ->get();

	        foreach ($websites as $website) {
	            $website->last_crawl = date('Y-m-d H:i:s');
	            $website->save();

	            $base = $website->url;
	            $process = $this->processUrl($base);

	            foreach ($process['mp3s'] as $mp3) {
	                $this->storeMp3($mp3, $base, $process['description']);
	            }

	            if ($website->depth > 1 && !empty($process['urls'])) {
	                $this->queuePages($website, $process);
	            }
	        }
	    }

	    /**
	     * Insert one pending Mp3 row for a discovered file.
	     *
	     * @param  string $url         Address of the .mp3 file.
	     * @param  string $source      URL of the website the file was found on.
	     * @param  string $description Description text scraped from the source page.
	     * @return void
	     */
	    private function storeMp3($url, $source, $description)
	    {
	        try {
	            $insert = new Mp3;
	            $insert->status = 'pending';
	            $insert->url = $url;
	            $insert->source = $source;
	            $insert->source_description = $description;
	            $insert->save();
	        } catch (\Exception $exception) {
	            // Deliberately best effort: one failed insert must not abort the crawl.
	            // NOTE(review): presumably this mostly absorbs duplicate-URL rejections - confirm against the schema.
	        }
	    }

	    /**
	     * Walk the links found on a website's start page: direct .mp3 links are
	     * stored, other same-host pages are queued in ScrapeUrl.
	     *
	     * @param  Website $website Website being crawled (reads url, whitelist, cache, id).
	     * @param  array   $process Result of processUrl() for the website's url.
	     * @return void
	     */
	    private function queuePages($website, array $process)
	    {
	        $base = $website->url;
	        $baseHost = parse_url($base, PHP_URL_HOST);
	        // Cast so a null/unset whitelist cannot break in_array().
	        $whitelist = (array) $website->whitelist;
	        $skipped = ['.m4a', '.gif', '.jpg', '.zip', '.mp3'];

	        foreach ($process['urls'] as $pageurl) {
	            if ($this->endswith($pageurl, '.mp3')) {
	                $this->storeMp3($pageurl, $base, $process['description']);
	                continue;
	            }

	            // Skip media/archive links (matched anywhere in the URL) and the start page itself.
	            $isSkipped = false;
	            foreach ($skipped as $extension) {
	                if (strpos($pageurl, $extension) !== false) {
	                    $isSkipped = true;
	                    break;
	                }
	            }
	            if ($isSkipped || $pageurl == $base) {
	                continue;
	            }

	            // A cached URL was queued recently; only whitelisted URLs bypass the cache.
	            $cache = 'mp3foo_crawl_'.md5($pageurl);
	            if (Cache::has($cache) && !in_array($pageurl, $whitelist, true)) {
	                continue;
	            }

	            $pageHost = parse_url($pageurl, PHP_URL_HOST);
	            if (!is_string($baseHost) || !is_string($pageHost) || $baseHost != $pageHost) {
	                continue;
	            }

	            Cache::put($cache, 1, $website->cache);
	            try {
	                ScrapeUrl::insert(
	                    [
	                        'website_id' => $website->id,
	                        'depth' => 1,
	                        'url' => $pageurl,
	                        'delete_at' => \Carbon\Carbon::now()->addMinutes($website->cache)->toDateTimeString()
	                    ]
	                );
	            } catch (\Exception $e) {
	                // Best effort, same as storeMp3(): a failed insert is skipped, the crawl continues.
	            }
	        }
	    }

	    /**
	     * Collapse runs of slashes into one, leaving the "//" after a scheme colon intact.
	     *
	     * @param  string $input
	     * @return string
	     */
	    public function to_single_slashes($input)
	    {
	        return preg_replace('~(^|[^:])//+~', '\\1/', $input);
	    }

	    /**
	     * Fetch a URL with cURL and return the response body.
	     *
	     * @param  string $url
	     * @return string Response body, or '' when the transfer failed.
	     */
	    private function get_data($url)
	    {
	        $ch = curl_init();
	        curl_setopt($ch, CURLOPT_URL, $url);
	        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
	        // The connect timeout only bounds the handshake; cap the whole transfer
	        // too so a stalled server cannot hang the command.
	        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
	        $data = curl_exec($ch);
	        curl_close($ch);

	        return $data === false ? '' : $data;
	    }

	    /**
	     * Case-sensitive test for whether $string ends with $test.
	     *
	     * @param  string $string
	     * @param  string $test
	     * @return bool
	     */
	    public function endswith($string, $test)
	    {
	        $strlen = strlen($string);
	        $testlen = strlen($test);
	        if ($testlen > $strlen) {
	            return false;
	        }

	        return substr_compare($string, $test, $strlen - $testlen, $testlen) === 0;
	    }

	    /**
	     * Download a page and extract its .mp3 links, same-host links and meta data.
	     *
	     * @param  string $url Page to fetch.
	     * @return array Keys: 'mp3s' (list of URLs), 'urls' (list of same-host URLs),
	     *               'description' (string), 'image' (string). All empty when the
	     *               page could not be fetched or parsed.
	     */
	    private function processUrl($url)
	    {
	        // NOTE(review): inside the character class "\+-=" is a range ('+' through '='),
	        // and the class contains a space, so one greedy match can span several URLs.
	        // Pattern left unchanged to keep matching behaviour stable.
	        $re = '/((https?):((\/\/)|(\\\\\\\\))+[\pL\pM\w\d :#@%\/;$()~_?\+-=\\\\\.&]*\.mp3)/';

	        $data = $this->get_data($url);
	        $dom = HtmlDomParser::str_get_html($data);
	        if (!$dom) {
	            // str_get_html() yields false for empty or unparseable input.
	            return ['mp3s' => [], 'urls' => [], 'description' => '', 'image' => ''];
	        }

	        // Give protocol-relative links the scheme of the page they were found on
	        // ('http' when the page URL has none). Done on the DOM so the regex scan
	        // below sees the absolute form as well.
	        $scheme = parse_url($url, PHP_URL_SCHEME);
	        if (!is_string($scheme) || $scheme === '') {
	            $scheme = 'http';
	        }
	        $anchors = $dom->find('a');
	        foreach ($anchors as $a) {
	            if (substr($a->href, 0, 2) == '//') {
	                $a->href = $scheme.':'.$a->href;
	            }
	        }

	        $mp3s = [];
	        if (preg_match_all($re, (string) $dom, $matches, PREG_SET_ORDER, 0)) {
	            foreach ($matches as $match) {
	                $mp3s[] = $this->to_single_slashes($match[0]);
	            }
	        }

	        $urls = [];
	        $baseHost = parse_url($url, PHP_URL_HOST);
	        foreach ($anchors as $a) {
	            $candidate = $this->santizeUrl($a->href, $url);
	            $candidateHost = parse_url($candidate, PHP_URL_HOST);
	            if (is_string($baseHost) && is_string($candidateHost) && $baseHost == $candidateHost) {
	                $urls[] = $this->to_single_slashes($candidate);
	            }
	        }

	        $description = $this->metaContent($dom, 'meta[name=description]');
	        if (!$description) {
	            $description = $this->metaContent($dom, 'meta[property="og:description"]');
	        }
	        $image = $this->metaContent($dom, 'meta[property="og:image"]');

	        return [
	            'mp3s' => array_values(array_unique($mp3s)),
	            'urls' => array_values(array_unique($urls)),
	            'description' => $description,
	            'image' => $image,
	        ];
	    }

	    /**
	     * Read the content attribute of the first element matching a selector.
	     *
	     * @param  object $dom      Parsed document returned by HtmlDomParser.
	     * @param  string $selector Selector passed to find().
	     * @return string Attribute value, or '' when the element or attribute is missing.
	     */
	    private function metaContent($dom, $selector)
	    {
	        $node = $dom->find($selector, 0);

	        return ($node && isset($node->attr['content'])) ? $node->attr['content'] : '';
	    }

	    /**
	     * Turn an href found on a page into an absolute URL.
	     *
	     * @param  string $url  Href value as found in the document (may be relative or empty).
	     * @param  string $base URL of the page the href was found on.
	     * @return string Absolute URL; $base itself when $url is empty.
	     */
	    private function santizeUrl($url, $base)
	    {
	        $url = trim((string) $url);
	        if ($url === '') {
	            return $base;
	        }
	        // Already absolute (any scheme): leave untouched.
	        if (parse_url($url, PHP_URL_SCHEME) != '') {
	            return $url;
	        }

	        $parts = parse_url($base);
	        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
	            // Base cannot be decomposed: fall back to plain concatenation.
	            $glue = ($url[0] == '/' || $url[0] == '#' || $url[0] == '?') ? '' : '/';

	            return $base.$glue.$url;
	        }

	        // Protocol-relative link: inherit the base scheme.
	        if (substr($url, 0, 2) == '//') {
	            return $parts['scheme'].':'.$url;
	        }
	        // Fragment or query only: append to the page URL.
	        if ($url[0] == '#' || $url[0] == '?') {
	            return $base.$url;
	        }

	        $origin = $parts['scheme'].'://'.$parts['host'];
	        if (isset($parts['port'])) {
	            $origin .= ':'.$parts['port'];
	        }

	        // Keep query/fragment out of the dot-segment clean-up below.
	        $cut = strcspn($url, '?#');
	        $relativePath = substr($url, 0, $cut);
	        $suffix = (string) substr($url, $cut);

	        if ($relativePath[0] == '/') {
	            // Root-relative: resolve against the origin, not the base path.
	            $directory = '';
	        } else {
	            // Document-relative: resolve against the directory of the base path.
	            $directory = isset($parts['path']) ? preg_replace('#/[^/]*$#', '', $parts['path']) : '';
	        }

	        // Repeatedly collapse "//", "/./" and "/segment/../" until nothing changes.
	        $path = $directory.'/'.$relativePath;
	        $dotSegments = ['#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#'];
	        do {
	            $path = preg_replace($dotSegments, '/', $path, -1, $replaced);
	        } while ($replaced > 0);

	        return $origin.$path.$suffix;
	    }
	}