@victorkane
Forked from xymox12/crawler.sh
Created March 2, 2021 13:05
Convert a CSV of URLs and page titles (created using wget) to Freemind XML and a UL list
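
A minimal usage sketch, assuming the two scripts below are saved as crawler.sh and site_crawler_2_xml.php (the names the scripts themselves use) and that www.example.com stands in for a real domain:

chmod +x crawler.sh
./crawler.sh www.example.com wlog    # crawl; writes wlog.csv, the file the PHP script reads
php site_crawler_2_xml.php           # echo the UL lists and write sitemap.mm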
#!/bin/bash
#
# Crawls a domain
# Retrieves all visible URLs and their page titles
# Saves to CSV
# $1 = URL (without http(s)://)
# $2 = output CSV base name (the script writes $2.csv)
# MODIFY the wget --include-directories, --domains, and --reject values below for your site. TODO: make these variables.
# Text color variables
txtund=$(tput sgr 0 1) # Underline
txtbld=$(tput bold) # Bold
bldred=${txtbld}$(tput setaf 1) # red
bldblu=${txtbld}$(tput setaf 4) # blue
bldgreen=${txtbld}$(tput setaf 2) # green
bldwht=${txtbld}$(tput setaf 7) # white
txtrst=$(tput sgr0) # Reset
info=${bldwht}*${txtrst} # Feedback
pass=${bldblu}*${txtrst}
warn=${bldred}*${txtrst}
ques=${bldblu}?${txtrst}
printf "%s=== Crawling $1 === %s" "$bldgreen" "$txtrst"
# wget in Spider mode, outputs to wglog file
# --reject (-R) switch to ignore specific file types (images, javascript etc.)
wget --spider --recursive --level=1 --include-directories=/site --domains=www.aa.bb.cc --no-parent --no-host-directories --no-directories --restrict-file-names=nocontrol --execute="robots=off" --no-check-certificate --force-html --no-clobber --reject=bmp,css,gif,ico,jpg,jpeg,js,mp3,mp4,pdf,png,swf,svg,txt,xml,xls --user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0" "$1" 2>&1 | tee wglog
printf " %s========================================== \n" "$bldgreen"
printf "%s=== Crawl Finished... ===%s \n" "$bldgreen" "$txtrst"
printf "%s=== Begin retreiving page titles... ===%s \n" "$bldgreen" "$txtrst"
printf "%s========================================== \n" "$dgreen"
printf "%s** Run tail -f $1.csv for progress%s \n" "$bldred" "$txtrst"
# From wglog, grab the URLs wget visited,
# then curl each one and extract its <title>.
grep '^--' wglog | awk '{print $3}' | sort -u | while read -r url; do
  printf "%s* Retrieving title for: %s%s%s \n" "$bldgreen" "$txtrst$txtbld" "$url" "$txtrst"
  printf '"%s","%s"\n' "$url" "$(curl -s "$url" | sed -n 's/.*<title>\(.*\)<\/title>.*/\1/ip;T;q')" >> "$2.csv"
done
# clean up log file
rm wglog
exit
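
Each crawled page becomes one quoted "URL","title" row in the output CSV. A hypothetical wlog.csv after a run:

"http://www.example.com/site/","Home"
"http://www.example.com/site/about/","About Us"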
<?php

/**
 * @file
 * Create a Freemind XML sitemap and nested UL lists from the crawl CSV.
 *
 * Run via the PHP built-in web server:
 *   php -S localhost:8000
 *   http://localhost:8000/site_crawler_2_xml.php
 * OR from the command line:
 *   php site_crawler_2_xml.php
 */
$url_arrays = read_log_file_to_array();

// Build the tree with array_merge_recursive() and print it as a UL list.
$tree = build_tree($url_arrays);
$list = build_list($tree);
echo $list;

// Build the same tree with references and print it again.
$tree = build_tree2($url_arrays);
$list = build_list($tree);
echo $list;

// Create a Freemind sitemap from the tree.
$xml_sitemap = new SimpleXMLElement("<map version=\"1.0.1\"></map>");
array_to_xml($tree, $xml_sitemap);
// asXML() with a filename writes sitemap.mm and returns TRUE on success.
$written = $xml_sitemap->asXML('sitemap.mm');
print_r($written);

/**
 * Read the crawler's URL/title CSV into an array of path segments plus metadata.
 */
function read_log_file_to_array() {
  $data = [];
  $file = fopen("wlog.csv", "r");
  while (($line = fgetcsv($file)) !== FALSE) {
    // Skip blank or malformed rows.
    if (empty($line[0])) {
      continue;
    }
    $meta = [];
    // Strip the scheme and trailing slash, then split the path into segments.
    $path = str_replace('http://', '', $line[0]);
    $path = str_replace(array("\r\n", "\n"), "", $path);
    $path = rtrim($path, '/');
    $dir_names = explode('/', $path);
    $meta['__path'] = $line[0];
    $meta['__title'] = isset($line[1]) ? $line[1] : '';
    $data[] = array($dir_names, $meta);
  }
  fclose($file);
  return $data;
}

/**
 * Build a nested tree of path segments using array_merge_recursive().
 */
function build_tree($paths) {
  $array = [];
  foreach ($paths as $path) {
    // Build each branch inside-out: start at the deepest segment and
    // wrap it in its parent directories.
    $reverse_dir_order = array_reverse($path[0]);
    $first = TRUE;
    foreach ($reverse_dir_order as $dir) {
      $temp = [];
      if ($first) {
        $temp[$dir]['__title'] = $path[1]['__title'];
        $temp[$dir]['__path'] = $path[1]['__path'];
        $first = FALSE;
      }
      else {
        $temp[$dir] = $prev;
      }
      $prev = $temp;
    }
    // Merge the finished branch into the accumulated tree.
    $array = array_merge_recursive($array, $temp);
  }
  return $array;
}

/**
 * Build the same nested tree in place, walking down with references.
 */
function build_tree2($path_list) {
  $path_tree = array();
  foreach ($path_list as $path_data) {
    // Walk (and create) the branch for this path, one segment at a time.
    $last_dir =& $path_tree;
    foreach ($path_data[0] as $dir) {
      if (!isset($last_dir[$dir])) {
        $last_dir[$dir] = NULL;
      }
      $last_dir =& $last_dir[$dir];
    }
    // Attach the page metadata at the leaf.
    $last_dir['__title'] = $path_data[1]['__title'];
    $last_dir['__path'] = $path_data[1]['__path'];
  }
  return $path_tree;
}
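
/*
 * Illustration (hypothetical URL): for a single input row
 *   array(array('www.example.com', 'site', 'about'),
 *         array('__path' => 'http://www.example.com/site/about/', '__title' => 'About Us'))
 * both build_tree() and build_tree2() yield the same branch:
 *   array('www.example.com' => array('site' => array('about' => array(
 *     '__title' => 'About Us',
 *     '__path' => 'http://www.example.com/site/about/',
 *   ))))
 */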

/**
 * Render the tree as nested HTML UL markup.
 */
function build_list($tree, $prefix = '') {
  $ul = '';
  foreach ($tree as $key => $value) {
    $li = '';
    if (is_array($value)) {
      // Link the entry when the node carries page metadata.
      if (array_key_exists('__title', $value)) {
        $li .= "$prefix$key/ <a href=\"http://$prefix$key/\">{$value['__title']}</a>";
      }
      else {
        $li .= "$prefix$key/";
      }
      $li .= build_list($value, "$prefix$key/");
      $ul .= strlen($li) ? "<li>$li</li>" : '';
    }
  }
  return strlen($ul) ? "<ul>$ul</ul>" : '';
}

/**
 * Recursively convert the tree into Freemind <node> elements.
 */
function array_to_xml($array, &$xml_node) {
  foreach ($array as $key => $value) {
    // '__title' and '__path' hold page metadata, not child pages.
    if (is_array($value) && $key !== '__title' && $key !== '__path') {
      $subnode = $xml_node->addChild("node");
      if (isset($value['__path'])) {
        $subnode->addAttribute('LINK', $value['__path']);
      }
      if (isset($value['__title'])) {
        $subnode->addAttribute('TEXT', $value['__title']);
      }
      array_to_xml($value, $subnode);
    }
  }
}
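
For the two-row wlog.csv sketched above, the UL markup echoed by build_list() and the sitemap.mm written by array_to_xml() would look roughly like this (whitespace added for readability; hypothetical URLs):

<ul><li>www.example.com/
  <ul><li>www.example.com/site/ <a href="http://www.example.com/site/">Home</a>
    <ul><li>www.example.com/site/about/ <a href="http://www.example.com/site/about/">About Us</a></li></ul>
  </li></ul>
</li></ul>

<map version="1.0.1">
  <node>
    <node LINK="http://www.example.com/site/" TEXT="Home">
      <node LINK="http://www.example.com/site/about/" TEXT="About Us"/>
    </node>
  </node>
</map>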