Last active
March 2, 2022 14:10
-
-
Save takshaktiwari/4aa850a394f9637bdf788f2315603a31 to your computer and use it in GitHub Desktop.
scrapper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/php | |
| <?php | |
/*
 * Command-line entry point.
 *
 * Parses the CLI options, builds the global $config array (read by the
 * Collector and Crawler classes through $GLOBALS['config']), runs the crawl
 * and optionally writes the report and sitemap files.
 */

// getopt() syntax: "x:" requires a value, "x::" takes an optional value,
// a bare "x" is a plain flag.
$shortopts = 'u:d:o::r::s::h';
$longopts = [
    'url:',
    'domain:',
    'output::',
    'report::',
    'sitemap::',
    'help',
];
$options = getopt($shortopts, $longopts);

// Defaults used when the corresponding option is not passed.
$config = [
    'url' => 'http://localhost/develop/example-app/',
    'domain' => 'http://localhost',
    // NOTE(review): output is already enabled by default, so -o/--output
    // currently has no visible effect; kept as-is to preserve behaviour.
    'output' => true,
    'report' => [
        'generate' => false,
        'json' => true,
        'filename' => 'report.json',
    ],
    'sitemap' => [
        'generate' => false,
        'filename' => 'sitemap.xml',
    ],
];

if (isset($options['h']) || isset($options['help'])) {
    help(); // prints the usage text and terminates the script
}

if (isset($options['u']) || isset($options['url'])) {
    // Test with isset() before reading: only one of the two forms may be
    // present, and reading the missing key raises an "undefined index" error.
    $config['url'] = isset($options['u']) ? $options['u'] : $options['url'];
    // The start URL doubles as the domain unless -d/--domain overrides it below.
    $config['domain'] = $config['url'];
}

if (isset($options['d']) || isset($options['domain'])) {
    $config['domain'] = isset($options['d']) ? $options['d'] : $options['domain'];
}

if (isset($options['o']) || isset($options['output'])) {
    $config['output'] = true;
}

if (isset($options['r']) || isset($options['report'])) {
    $config['report']['generate'] = true;

    // The long form wins when both are given. getopt() yields false for an
    // optional-value option passed without a value, in which case the default
    // file name is kept.
    $filename = isset($options['report']) ? $options['report'] : $options['r'];
    if ($filename) {
        $config['report']['filename'] = $filename;
    }
}

if (isset($options['s']) || isset($options['sitemap'])) {
    $config['sitemap']['generate'] = true;

    $filename = isset($options['sitemap']) ? $options['sitemap'] : $options['s'];
    if ($filename) {
        $config['sitemap']['filename'] = $filename;
    }
}

/* -------------------------------------- */
/* -------------------------------------- */

$collector = new Collector();
$collector->scrap();

if ($config['report']['generate']) {
    $collector->save(
        $config['report']['filename'],
        $config['report']['json']
    );
}

if ($config['sitemap']['generate']) {
    $collector->sitemap($config['sitemap']['filename']);
}
/**
 * Recursively crawls a site starting from one URL, recording the HTTP status
 * of every URL it visits, and can export the result as a report or sitemap.
 *
 * Defaults for the start URL and the domain are read from $GLOBALS['config'].
 */
class Collector
{
    /** @var string URL currently (or most recently) being scraped, without trailing slash. */
    public $url;

    /** @var Crawler Fetcher/parser instance shared by all requests. */
    public $crawler;

    /** @var string[] URLs that have already been fetched. */
    public $checked;

    /** @var array[] One ['url' => string, 'status' => int] entry per fetched URL. */
    public $report;

    /** @var string Domain used to decide which URLs are followed, without trailing slash. */
    public $domain;

    /**
     * @param string|null $url Start URL; falls back to $GLOBALS['config']['url'].
     */
    public function __construct($url=null)
    {
        $this->url = rtrim($url ? $url : $GLOBALS['config']['url'], '/');
        $this->domain = rtrim($GLOBALS['config']['domain'], '/');
        $this->report = [];
        $this->checked = [];
        $this->crawler = new Crawler;
    }

    /**
     * Overrides the domain used for the "internal URL" check.
     *
     * @param string|null $domain
     * @return $this
     */
    public function setDomain($domain=null)
    {
        // Cast so that the null default does not reach rtrim().
        $this->domain = rtrim((string) $domain, '/');
        return $this;
    }

    /**
     * Fetches a URL, records its status and recurses into the links found on
     * it when the page answered 200 and its URL contains the domain.
     *
     * @param string|null $url URL to fetch; defaults to the current $this->url.
     * @return $this|false False when the URL had already been visited.
     */
    public function scrap($url=null)
    {
        $this->url = rtrim($url ? $url : $this->url, '/');

        // Strict comparison: URLs are strings and must match exactly.
        if (in_array($this->url, $this->checked, true)) {
            return false;
        }

        $crawl = $this->crawler->fetch($this->url);
        $this->checked[] = $this->url;
        $this->report[] = [
            'url' => $this->url,
            'status' => $crawl->status()
        ];

        if ($crawl->status() == 200 && $this->isInternal($this->url)) {
            // $links is a local copy, so the recursive calls below (which
            // reuse the same crawler) cannot disturb this loop.
            $links = $crawl->setDomain($this->domain)->parseHtml()->allLinks()->getLinks();
            foreach ($links as $link) {
                $this->scrap($link);
            }
        }

        return $this;
    }

    /**
     * Writes the collected report to a file.
     *
     * @param string $filename Target file.
     * @param bool   $json     True for pretty-printed JSON; false for plain
     *                         text with one "status: url" line per entry.
     * @return int|false Bytes written, or false on failure.
     */
    public function save($filename='report.json', $json=true)
    {
        if ($json) {
            $data = json_encode($this->report, JSON_PRETTY_PRINT);
            if ($data === false) {
                return false;
            }
        } else {
            // Handing the nested report array straight to file_put_contents()
            // would only write "Array" per entry, so flatten it to text first.
            $lines = [];
            foreach ($this->report as $item) {
                $lines[] = $item['status'].': '.$item['url'];
            }
            $data = implode("\n", $lines);
        }

        return file_put_contents($filename, $data);
    }

    /**
     * Writes an XML sitemap containing every internal URL that answered 200.
     *
     * @param string $filename Target file.
     * @return int|false Bytes written, or false on failure.
     */
    public function sitemap($filename='sitemap.xml')
    {
        $today = date('Y-m-d');

        $xml = '<?xml version="1.0" encoding="UTF-8"?>'."\n";
        $xml .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'."\n";

        foreach ($this->report as $item) {
            if ($item['status'] != 200 || !$this->isInternal($item['url'])) {
                continue;
            }

            // URLs routinely contain "&", which must be escaped inside XML.
            $loc = htmlspecialchars($item['url'], ENT_QUOTES | ENT_XML1, 'UTF-8');

            $xml .= "\t".'<url>'."\n";
            $xml .= "\t\t".'<loc>'.$loc.'</loc>'."\n";
            $xml .= "\t\t".'<lastmod>'.$today.'</lastmod>'."\n";
            $xml .= "\t\t".'<changefreq>daily</changefreq>'."\n";
            $xml .= "\t\t".'<priority>0.8</priority>'."\n";
            $xml .= "\t".'</url>'."\n";
        }

        $xml .= '</urlset>';

        return file_put_contents($filename, $xml);
    }

    /**
     * Tells whether a URL contains the configured domain.
     *
     * strpos() returns 0 when the URL *starts* with the domain, so the result
     * has to be compared against false with a strict operator; a loose
     * comparison with '' treats offset 0 as "not found" on PHP 7.
     *
     * @param string $url
     * @return bool
     */
    private function isInternal($url)
    {
        return strpos($url, $this->domain) !== false;
    }
}
/**
 * Fetches a single URL with cURL and extracts the links referenced by the
 * returned HTML (anchors, images, <link> tags, scripts and form actions).
 */
class Crawler
{
    /** @var string|null URL of the current/last request. */
    public $url;

    /** @var string|null Base used to turn relative links into absolute ones. */
    public $domain;

    /** @var string[] Links collected from the current page. */
    public $links;

    /** @var string[] Substrings that cause a link to be discarded. */
    public $excludes;

    /** @var DOMDocument Parsed document of the current page. */
    public $dom;

    /** @var int|null HTTP status code of the last request. */
    public $status;

    /** @var string|false|null Raw response body of the last request (false when cURL failed). */
    public $response;

    /** @var bool Whether each fetched URL is echoed to the console. */
    public $output;

    /**
     * @param string|null $url    URL to fetch later on.
     * @param bool|null   $output Echo progress; falls back to $GLOBALS['config']['output'].
     */
    public function __construct($url=null, $output=null)
    {
        $this->url = $url;
        if ($output) {
            $this->output = $output;
        } else {
            $this->output = isset($GLOBALS['config']['output']) ? $GLOBALS['config']['output'] : false;
        }
        $this->dom = new DOMDocument();
        // The collected links start out as an empty list, so that iterating
        // them is safe even when a page contains no links at all.
        $this->links = [];
        $this->excludes = [
            'tel:', 'mailto:', 'javascript:', '#', 'sms:'
        ];
    }

    /**
     * Sets the base used by mapLinks() for relative links.
     *
     * @param string|null $domain
     * @return $this
     */
    public function setDomain($domain=null)
    {
        $this->domain = $domain;
        return $this;
    }

    /**
     * Performs a GET request (following redirects) and stores the body and
     * the HTTP status code.
     *
     * @param string|null $url    URL to fetch; defaults to the stored one.
     * @param bool|null   $output Overrides the echo setting when truthy.
     * @return $this
     */
    public function fetch($url=null, $output=null)
    {
        $this->url = $url ? $url : $this->url;
        $this->output = $output ? $output : $this->output;

        // Each fetch starts a new page: links gathered from a previously
        // fetched page must not be carried over into this page's results.
        $this->links = [];

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        $this->response = curl_exec($ch);
        $this->status = curl_getinfo($ch, CURLINFO_HTTP_CODE);

        if ($this->output) {
            echo $this->status.': '.$this->url."\n";
        }

        return $this;
    }

    /**
     * @return int|null HTTP status code of the last request.
     */
    public function status()
    {
        return $this->status;
    }

    /**
     * @return string|false|null Raw body of the last request.
     */
    public function response()
    {
        return $this->response;
    }

    /**
     * Parses the last response body into a fresh DOM document.
     *
     * @return $this|false False when the last status was not 200.
     */
    public function parseHtml()
    {
        if ($this->status != 200) {
            return false;
        }

        // Use a new document so nothing from a previously parsed page remains.
        $this->dom = new DOMDocument();

        // loadHTML() rejects an empty string, so only parse real content.
        if (is_string($this->response) && trim($this->response) !== '') {
            // Real-world markup is rarely valid; collect the parser warnings
            // internally instead of printing them, then discard them.
            $previous = libxml_use_internal_errors(true);
            $this->dom->loadHTML($this->response);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $this;
    }

    /**
     * Collects the href of every <a> element.
     *
     * @return $this
     */
    public function anchors()
    {
        return $this->collectAttribute('a', 'href');
    }

    /**
     * Collects the src of every <img> element.
     *
     * @return $this
     */
    public function images()
    {
        return $this->collectAttribute('img', 'src');
    }

    /**
     * Collects the href of every <link> element.
     *
     * @return $this
     */
    public function css()
    {
        return $this->collectAttribute('link', 'href');
    }

    /**
     * Collects the action of every <form> element.
     *
     * @return $this
     */
    public function forms()
    {
        return $this->collectAttribute('form', 'action');
    }

    /**
     * Collects the src of every <script> element.
     *
     * @return $this
     */
    public function scripts()
    {
        return $this->collectAttribute('script', 'src');
    }

    /**
     * Runs every collector: anchors, images, <link> tags, scripts, forms.
     *
     * @return $this
     */
    public function allLinks()
    {
        $this->anchors();
        $this->images();
        $this->css();
        $this->scripts();
        $this->forms();
        return $this;
    }

    /**
     * Trims the collected links and drops empty ones, duplicates and every
     * link containing one of the $excludes substrings.
     *
     * @return $this
     */
    public function filterLinks()
    {
        $kept = [];
        foreach ((array) $this->links as $link) {
            $link = trim($link);
            if ($link === '' || $this->isExcluded($link)) {
                continue;
            }
            $kept[] = $link;
        }

        // array_unique() keeps the original keys; re-index to a plain list.
        $this->links = array_values(array_unique($kept));

        return $this;
    }

    /**
     * Turns the collected links into absolute URLs. Links that already start
     * with http:// or https:// are kept, protocol-relative links ("//host/x")
     * get the scheme of the domain, everything else is appended to the domain.
     *
     * @return $this
     */
    public function mapLinks()
    {
        $base = rtrim((string) $this->domain, '/');
        $scheme = parse_url($base, PHP_URL_SCHEME);
        $scheme = $scheme ? $scheme : 'http';

        $this->links = array_map(function ($link) use ($base, $scheme) {
            if (preg_match('/^https?:\/\//i', $link)) {
                return $link;
            }
            if (strpos($link, '//') === 0) {
                return $scheme.':'.$link;
            }
            return $base.'/'.ltrim($link, '/');
        }, (array) $this->links);

        return $this;
    }

    /**
     * Returns the filtered, absolute links of the current page.
     *
     * @return string[]
     */
    public function getLinks()
    {
        $this->filterLinks();
        $this->mapLinks();
        return $this->links;
    }

    /**
     * Appends the given attribute of every element with the given tag name to
     * the collected links. A missing attribute yields an empty string, which
     * filterLinks() removes later.
     *
     * @param string $tag
     * @param string $attribute
     * @return $this
     */
    private function collectAttribute($tag, $attribute)
    {
        foreach ($this->dom->getElementsByTagName($tag) as $node) {
            $this->links[] = $node->getAttribute($attribute);
        }
        return $this;
    }

    /**
     * Tells whether a link contains any of the excluded substrings.
     *
     * The strpos() result is compared strictly against false because a match
     * at offset 0 (e.g. a link starting with "mailto:") is still a match.
     *
     * @param string $link
     * @return bool
     */
    private function isExcluded($link)
    {
        foreach ($this->excludes as $exclude) {
            if ($exclude !== '' && strpos($link, $exclude) !== false) {
                return true;
            }
        }
        return false;
    }
}
/**
 * Prints the usage text for the script and terminates it.
 *
 * @return void This function never returns; it ends the script.
 */
function help()
{
    // Nowdoc: the text is emitted verbatim, nothing inside it is interpolated.
    echo <<<'HELP'

Web Scrapper
_________________________________________________
Scan the whole website, checks for status code generate urls report and sitemap. You need to make it executable (chmod +x scrapper.php) or run with php binary location.

Syntax: ./scrapper.php [ -u= | -d= | -o | -r | -s | -h ]
Syntax: ./scrapper.php [ --url= | --domain= | --output | --report | --sitemap | --help ]

Options:
    -u [--url]      Pass the url to start from
    -d [--domain]   Set the main domain. this works to fix any relative urls, --url will be set as domain if not passed
    -o [--output]   Show the output in console
    -r [--report]   Generate the report file. Pass the name otherwise default name report.json will be generated
    -s [--sitemap]  Generate the sitemap file. Pass the name otherwise default name sitemap.xml will be generated
    -h [--help]     Print the Help for this script.

Usage:
    Case 1: ./scrapper.php -u=http://example.com -o
    Case 2: ./scrapper.php -u=http://example.com -o -r -s=sitemap.xml
    Case 3: ./scrapper.php --url=http://example.com --output --report --sitemap=sitemap.xml
___________________________________________

HELP;
    die();
}
/**
 * Debug helper ("dump and die"): prints a readable dump of a value after an
 * opening <pre> tag and stops the script.
 *
 * @param mixed $var Value to dump.
 * @return void This function never returns; it ends the script.
 */
function dd($var){
    // print_r() in return mode hands back the dump as a string, so the tag
    // and the dump are written with a single echo.
    echo '<pre>'.print_r($var, true);
    exit;
}
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
completed