Skip to content

Instantly share code, notes, and snippets.

@takshaktiwari
Last active March 2, 2022 14:10
Show Gist options
  • Select an option

  • Save takshaktiwari/4aa850a394f9637bdf788f2315603a31 to your computer and use it in GitHub Desktop.

Select an option

Save takshaktiwari/4aa850a394f9637bdf788f2315603a31 to your computer and use it in GitHub Desktop.
scrapper
#! /usr/bin/php
<?php
// ---------------------------------------------------------------------------
// Command-line parsing.
//
// getopt() syntax: "x:" requires a value, "x::" accepts an optional value and
// a bare "x" is a flag. An option given without a value is reported as false.
// ---------------------------------------------------------------------------
$shortopts = "u:d:o::r::s::h";
$longopts = array(
    "url:",
    "domain:",
    "output::",
    "report::",
    "sitemap::",
    "help",
);

$options = getopt($shortopts, $longopts);
if (!is_array($options)) {
    // getopt() returns false on failure; treat that as "no options given".
    $options = array();
}

// Defaults; each entry may be overridden by the options handled below.
$config = [
    'url' => 'http://localhost/develop/example-app/',
    'domain' => 'http://localhost',
    'output' => true,
    'report' => [
        'generate' => false,
        'json' => true,
        'filename' => 'report.json',
    ],
    'sitemap' => [
        'generate' => false,
        'filename' => 'sitemap.xml'
    ]
];

/*
 * Returns the first usable (non-empty string) value among the given option
 * names, or null when none carries a value. Options are only read after an
 * isset() check, so passing just the long (or just the short) form no longer
 * triggers an undefined-index notice. A repeated option arrives as an array;
 * its last occurrence is used.
 */
$optionValue = function (array $names) use ($options) {
    foreach ($names as $name) {
        if (!isset($options[$name])) {
            continue;
        }
        $value = $options[$name];
        if (is_array($value)) {
            $value = end($value);
        }
        if (is_string($value) && $value !== '') {
            return $value;
        }
    }
    return null;
};

if (isset($options['h']) || isset($options['help'])) {
    help();
}

if (isset($options['u']) || isset($options['url'])) {
    $value = $optionValue(array('u', 'url'));
    if ($value !== null) {
        $config['url'] = $value;
        // The start URL doubles as the domain unless -d/--domain overrides it below.
        $config['domain'] = $config['url'];
    }
}

if (isset($options['d']) || isset($options['domain'])) {
    $value = $optionValue(array('d', 'domain'));
    if ($value !== null) {
        $config['domain'] = $value;
    }
}

if (isset($options['o']) || isset($options['output'])) {
    $config['output'] = true;
}

if (isset($options['r']) || isset($options['report'])) {
    $config['report']['generate'] = true;
    // The long form wins when both forms carry a filename; a bare flag keeps the default.
    $filename = $optionValue(array('report', 'r'));
    if ($filename !== null) {
        $config['report']['filename'] = $filename;
    }
}

if (isset($options['s']) || isset($options['sitemap'])) {
    $config['sitemap']['generate'] = true;
    // Same precedence as for the report filename.
    $filename = $optionValue(array('sitemap', 's'));
    if ($filename !== null) {
        $config['sitemap']['filename'] = $filename;
    }
}
/* -------------------------------------- */
/* -------------------------------------- */
// Crawl the configured site, then write whichever artefacts were requested.
$collector = new Collector();
$collector->scrap();

// Status report (written as JSON when $config['report']['json'] is true).
$reportConfig = $config['report'];
if ($reportConfig['generate']) {
    $collector->save($reportConfig['filename'], $reportConfig['json']);
}

// XML sitemap built from the crawl report.
$sitemapConfig = $config['sitemap'];
if ($sitemapConfig['generate']) {
    $collector->sitemap($sitemapConfig['filename']);
}
/**
 * Walks a site recursively starting from a URL, records the HTTP status of
 * every URL it fetches, and can export the result as a report or a sitemap.
 *
 * Defaults for the start URL and the domain are read from $GLOBALS['config'].
 */
class Collector
{
    /** @var string URL being processed (trailing slash removed). */
    public $url;

    /** @var Crawler Fetcher/parser used for every request. */
    public $crawler;

    /** @var array List of URLs that have already been fetched. */
    public $checked;

    /** @var array List of ['url' => ..., 'status' => ...] entries, in fetch order. */
    public $report;

    /** @var string Prefix a URL must start with to be followed and listed in the sitemap. */
    public $domain;

    /**
     * @param string|null $url Start URL; falls back to $GLOBALS['config']['url'].
     */
    public function __construct($url=null)
    {
        $this->url = rtrim($url ? $url : $GLOBALS['config']['url'], '/');
        $this->domain = rtrim($GLOBALS['config']['domain'], '/');
        $this->report = [];
        $this->checked = [];
        $this->crawler = new Crawler;
    }

    /**
     * Overrides the domain prefix.
     *
     * @param string|null $domain
     * @return $this
     */
    public function setDomain($domain=null)
    {
        $this->domain = rtrim((string) $domain, '/');
        return $this;
    }

    /**
     * Fetches a URL, records its status and recurses into the links found on
     * it when the page answered 200 and belongs to the configured domain.
     *
     * @param string|null $url URL to fetch; defaults to the current $this->url.
     * @return $this|false False when the URL was already fetched.
     */
    public function scrap($url=null)
    {
        $this->url = rtrim($url ? $url : $this->url, '/');
        $current = $this->url;

        if (in_array($current, $this->checked, true)) {
            return false;
        }

        $crawl = $this->crawler->fetch($current);
        $status = $crawl->status();

        $this->checked[] = $current;
        $this->report[] = [
            'url' => $current,
            'status' => $status
        ];

        if ((int) $status === 200 && $this->isInternal($current)) {
            $links = $crawl->setDomain($this->domain)->parseHtml()->allLinks()->getLinks();
            foreach ($links as $link) {
                $this->scrap($link);
            }
        }

        return $this;
    }

    /**
     * Writes the report to a file.
     *
     * @param string $filename Target file.
     * @param bool   $json     True for pretty-printed JSON; false for plain
     *                         text with one "status: url" line per entry.
     * @return int|false Bytes written, or false on failure.
     */
    public function save($filename='report.json', $json=true)
    {
        if ($json) {
            $data = json_encode($this->report, JSON_PRETTY_PRINT);
            if ($data === false) {
                return false;
            }
        } else {
            // The report is a nested array and cannot be written as-is
            // (it would come out as "ArrayArray"), so flatten it to text lines.
            $data = '';
            foreach ($this->report as $item) {
                $data .= $item['status'].': '.$item['url']."\n";
            }
        }

        return file_put_contents($filename, $data);
    }

    /**
     * Writes an XML sitemap containing every in-domain URL that answered 200.
     *
     * @param string $filename Target file.
     * @return int|false Bytes written, or false on failure.
     */
    public function sitemap($filename='sitemap.xml')
    {
        $xml = '<?xml version="1.0" encoding="UTF-8"?>'."\n";
        $xml .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'."\n";

        $today = date('Y-m-d');
        foreach ($this->report as $item) {
            if ((int) $item['status'] !== 200 || !$this->isInternal($item['url'])) {
                continue;
            }

            // URLs routinely contain "&", which must be escaped inside XML.
            $loc = htmlspecialchars($item['url'], ENT_QUOTES | ENT_XML1, 'UTF-8');

            $xml .= "\t".'<url>'."\n";
            $xml .= "\t\t".'<loc>'.$loc.'</loc>'."\n";
            $xml .= "\t\t".'<lastmod>'.$today.'</lastmod>'."\n";
            $xml .= "\t\t".'<changefreq>daily</changefreq>'."\n";
            $xml .= "\t\t".'<priority>0.8</priority>'."\n";
            $xml .= "\t".'</url>'."\n";
        }

        $xml .= '</urlset>';
        return file_put_contents($filename, $xml);
    }

    /**
     * Tells whether a URL starts with the configured domain prefix.
     *
     * strpos() yields 0 for a match at the start and false for no match, so
     * the result is compared strictly; a loose comparison against '' gives
     * different answers for position 0 on PHP 7 and PHP 8.
     *
     * @param string $url
     * @return bool
     */
    private function isInternal($url)
    {
        $domain = (string) $this->domain;
        return $domain !== '' && strpos((string) $url, $domain) === 0;
    }
}
/**
 * Fetches a single URL with cURL and extracts the links found in its HTML.
 *
 * Typical chain: fetch($url)->setDomain($domain)->parseHtml()->allLinks()->getLinks().
 */
class Crawler
{
    /** @var string|null URL of the last/next request. */
    public $url;

    /** @var string|null Base used to turn relative links into absolute ones. */
    public $domain;

    /** @var array Links collected from the current document. */
    public $links;

    /** @var array Substrings that cause a link to be dropped by filterLinks(). */
    public $excludes;

    /** @var DOMDocument Parsed document of the last successful parseHtml(). */
    public $dom;

    /** @var int HTTP status of the last request as reported by cURL. */
    public $status;

    /** @var string|bool Body of the last response; false when the request failed. */
    public $response;

    /** @var bool Whether fetch() echoes "status: url" for every request. */
    public $output;

    /**
     * @param string|null $url    URL to fetch when fetch() is called without one.
     * @param bool|null   $output Echo progress; null falls back to
     *                            $GLOBALS['config']['output'] (false when unset).
     */
    public function __construct($url=null, $output=null)
    {
        $this->url = $url;
        if ($output !== null) {
            $this->output = $output;
        } else {
            $this->output = isset($GLOBALS['config']['output']) ? $GLOBALS['config']['output'] : false;
        }
        $this->dom = new DOMDocument();
        $this->links = [];
        $this->excludes = [
            'tel:', 'mailto:', 'javascript:', '#', 'sms:'
        ];
    }

    /**
     * @param string|null $domain Base for resolving relative links.
     * @return $this
     */
    public function setDomain($domain=null)
    {
        $this->domain = $domain;
        return $this;
    }

    /**
     * Requests a URL (following redirects) and stores its body and status.
     *
     * @param string|null $url    URL to request; defaults to the stored URL.
     * @param bool|null   $output Overrides the progress flag when not null.
     * @return $this
     */
    public function fetch($url=null, $output=null)
    {
        if ($url) {
            $this->url = $url;
        }
        if ($output !== null) {
            $this->output = $output;
        }

        // Start from a clean slate so links of a previously fetched page do
        // not leak into the result for this one.
        $this->links = [];
        $this->response = false;
        $this->status = 0;

        if (is_string($this->url) && $this->url !== '') {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $this->url);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            $this->response = curl_exec($ch);
            $this->status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        }

        if ($this->output) {
            echo $this->status.': '.$this->url."\n";
        }

        return $this;
    }

    /**
     * @return int HTTP status of the last request.
     */
    public function status()
    {
        return $this->status;
    }

    /**
     * @return string|bool Body of the last response.
     */
    public function response()
    {
        return $this->response;
    }

    /**
     * Parses the last response body into $this->dom.
     *
     * @return $this|false False when the last status was not 200.
     */
    public function parseHtml()
    {
        if ((int) $this->status !== 200) {
            return false;
        }

        // Always parse into a fresh document so nodes of an earlier page
        // cannot survive.
        $this->dom = new DOMDocument();

        // DOMDocument::loadHTML() rejects an empty string (ValueError on
        // PHP 8), so an empty body simply leaves an empty document.
        if (!is_string($this->response) || trim($this->response) === '') {
            return $this;
        }

        // Real-world markup is rarely valid; collect parser warnings
        // internally instead of emitting them.
        $previous = libxml_use_internal_errors(true);
        $this->dom->loadHTML($this->response);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $this;
    }

    /** Collects the href of every <a>. @return $this */
    public function anchors()
    {
        return $this->collect('a', 'href');
    }

    /** Collects the src of every <img>. @return $this */
    public function images()
    {
        return $this->collect('img', 'src');
    }

    /** Collects the href of every <link>. @return $this */
    public function css()
    {
        return $this->collect('link', 'href');
    }

    /** Collects the action of every <form>. @return $this */
    public function forms()
    {
        return $this->collect('form', 'action');
    }

    /** Collects the src of every <script>. @return $this */
    public function scripts()
    {
        return $this->collect('script', 'src');
    }

    /**
     * Runs every collector: anchors, images, <link> tags, scripts and forms.
     *
     * @return $this
     */
    public function allLinks()
    {
        $this->anchors();
        $this->images();
        $this->css();
        $this->scripts();
        $this->forms();
        return $this;
    }

    /**
     * Trims the collected links, drops empty ones, drops those containing any
     * entry of $this->excludes and removes duplicates.
     *
     * @return $this
     */
    public function filterLinks()
    {
        $seen = [];
        foreach ((array) $this->links as $link) {
            $link = trim((string) $link);
            if ($link === '' || $this->isExcluded($link)) {
                continue;
            }
            // Keyed by the link itself: de-duplicates while keeping first-seen order.
            $seen[$link] = true;
        }

        $this->links = array_map('strval', array_keys($seen));
        return $this;
    }

    /**
     * Turns the collected links into absolute URLs: http(s) links are kept,
     * protocol-relative links ("//host/path") receive the scheme of the
     * domain, everything else is appended to the domain.
     *
     * @return $this
     */
    public function mapLinks()
    {
        $base = rtrim((string) $this->domain, '/');
        $scheme = parse_url($base, PHP_URL_SCHEME);
        if (!$scheme) {
            $scheme = 'http';
        }

        $mapped = [];
        foreach ((array) $this->links as $link) {
            if (preg_match('/^https?:\/\//i', $link)) {
                $mapped[] = $link;
            } elseif (strpos($link, '//') === 0) {
                $mapped[] = $scheme.':'.$link;
            } else {
                $mapped[] = $base.'/'.ltrim($link, '/');
            }
        }

        $this->links = $mapped;
        return $this;
    }

    /**
     * @return array Filtered, absolute links of the current document.
     */
    public function getLinks()
    {
        $this->filterLinks();
        $this->mapLinks();
        return $this->links;
    }

    /**
     * Appends the given attribute of every element with the given tag name
     * to $this->links.
     *
     * @param string $tag
     * @param string $attribute
     * @return $this
     */
    private function collect($tag, $attribute)
    {
        foreach ($this->dom->getElementsByTagName($tag) as $node) {
            $this->links[] = $node->getAttribute($attribute);
        }
        return $this;
    }

    /**
     * Tells whether a link contains one of the excluded substrings.
     *
     * The strpos() result is compared strictly against false: a match at
     * position 0 (e.g. "mailto:...") must count as a match, which a loose
     * comparison against '' does not guarantee across PHP versions.
     *
     * @param string $link
     * @return bool
     */
    private function isExcluded($link)
    {
        foreach ($this->excludes as $exclude) {
            if ($exclude !== '' && strpos($link, $exclude) !== false) {
                return true;
            }
        }
        return false;
    }
}
/**
 * Prints the command-line usage text and terminates the script.
 *
 * Never returns: the script ends right after the text is printed.
 */
function help()
{
    // Fixes in the text: the third usage example was labelled "Case 2" a
    // second time, and the -u description contained a doubled "to".
    echo "
Web Scrapper
_________________________________________________
Scan the whole website, checks for status code generate urls report and sitemap. You need to make it executable (chmod +x scrapper.php) or run with php binary location.
Syntax: ./scrapper.php [ -u= | -d | -o | -r | -s | -h ]
Syntax: ./scrapper.php [ --url= | --domain= | --output | --report | --sitemap | --help ]
Options:
-u [--url] Pass the url to start from
-d [--domain] Set the main domain. this works to fix any relative urls, --url will be set as domain if not passed
-o [--output] Show the output in console
-r [--report] Generate the report file. Pass the name otherwise default name report.json will be generated
-s [--sitemap] Generate the sitemap file. Pass the name otherwise default name sitemap.xml will be generated
-h [--help] Print the Help for this script.
\n
Usage:
Case 1: ./scrapper.php -u=http://example.com -o
Case 2: ./scrapper.php -u=http://example.com -o -r -s=sitemap.xml
Case 3: ./scrapper.php --url=http://example.com --output --report --sitemap=sitemap.xml
___________________________________________
";
    die();
}
/**
 * Debug helper ("dump and die"): prints an opening <pre> tag followed by a
 * print_r() rendering of the value, then terminates the script.
 *
 * @param mixed $var Value to dump.
 */
function dd($var)
{
    $dump = print_r($var, true);
    print '<pre>'.$dump;
    exit();
}
@takshaktiwari
Copy link
Author

completed

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment