@victorkane
Forked from xymox12/crawler.sh
Created March 2, 2021 13:05
Convert a CSV of URLs and page titles (created using wget) to Freemind XML and a UL list
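
A minimal usage sketch, assuming the two scripts below are saved as crawler.sh and site_crawler_2_xml.php (the names the scripts themselves use) and that www.example.com stands in for a real domain:

chmod +x crawler.sh
./crawler.sh www.example.com wlog    # crawl; writes wlog.csv, the file the PHP script reads
php site_crawler_2_xml.php           # echo the UL lists and write sitemap.mm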
#!/bin/bash
#
# Crawls a domain
# Retrieves all visible URLs and their page titles
# Saves to CSV
# $1 = URL (without http(s)://)
# $2 = output CSV base name (the script writes $2.csv)
# MODIFY the wget --include-directories, --domains, and --reject values below for your site. TODO: make these variables.
# Text color variables
txtund=$(tput sgr 0 1) # Underline
txtbld=$(tput bold) # Bold
bldred=${txtbld}$(tput setaf 1) # red
bldblu=${txtbld}$(tput setaf 4) # blue
bldgreen=${txtbld}$(tput setaf 2) # green
bldwht=${txtbld}$(tput setaf 7) # white
txtrst=$(tput sgr0) # Reset
info=${bldwht}*${txtrst} # Feedback
pass=${bldblu}*${txtrst}
warn=${bldred}*${txtrst}
ques=${bldblu}?${txtrst}
printf "%s=== Crawling $1 === %s" "$bldgreen" "$txtrst"
# wget in Spider mode, outputs to wglog file
# --reject (-R) switch to ignore specific file types (images, javascript etc.)
wget --spider --recursive --level=1 --include-directories=/site --domains=www.aa.bb.cc --no-parent --no-host-directories --no-directories --restrict-file-names=nocontrol --execute="robots=off" --no-check-certificate --force-html --no-clobber --reject=bmp,css,gif,ico,jpg,jpeg,js,mp3,mp4,pdf,png,swf,svg,txt,xml,xls --user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0" "$1" 2>&1 | tee wglog
printf " %s========================================== \n" "$bldgreen"
printf "%s=== Crawl Finished... ===%s \n" "$bldgreen" "$txtrst"
printf "%s=== Begin retreiving page titles... ===%s \n" "$bldgreen" "$txtrst"
printf "%s========================================== \n" "$dgreen"
printf "%s** Run tail -f $1.csv for progress%s \n" "$bldred" "$txtrst"
# From wglog, grab the URLs wget visited,
# then curl each one and extract its <title>.
grep '^--' wglog | awk '{print $3}' | sort -u | while read -r url; do
  printf "%s* Retrieving title for: %s%s%s \n" "$bldgreen" "$txtrst$txtbld" "$url" "$txtrst"
  printf '"%s","%s"\n' "$url" "$(curl -s "$url" | sed -n 's/.*<title>\(.*\)<\/title>.*/\1/ip;T;q')" >> "$2.csv"
done
# clean up log file
rm wglog
exit
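
Each crawled page becomes one quoted "URL","title" row in the output CSV. A hypothetical wlog.csv after a run:

"http://www.example.com/site/","Home"
"http://www.example.com/site/about/","About Us"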
<?php

/**
 * @file
 * Create a Freemind XML sitemap and nested UL lists from the crawl CSV.
 *
 * Run via the PHP built-in web server:
 *   php -S localhost:8000
 *   http://localhost:8000/site_crawler_2_xml.php
 * OR from the command line:
 *   php site_crawler_2_xml.php
 */
$url_arrays = read_log_file_to_array();

// Build the tree with array_merge_recursive() and print it as a UL list.
$tree = build_tree($url_arrays);
$list = build_list($tree);
echo $list;

// Build the same tree with references and print it again.
$tree = build_tree2($url_arrays);
$list = build_list($tree);
echo $list;

// Create a Freemind sitemap from the tree.
$xml_sitemap = new SimpleXMLElement("<map version=\"1.0.1\"></map>");
array_to_xml($tree, $xml_sitemap);
// asXML() with a filename writes sitemap.mm and returns TRUE on success.
$written = $xml_sitemap->asXML('sitemap.mm');
print_r($written);

/**
 * Read the crawler's URL/title CSV into an array of path segments plus metadata.
 */
function read_log_file_to_array() {
  $data = [];
  $file = fopen("wlog.csv", "r");
  while (($line = fgetcsv($file)) !== FALSE) {
    // Skip blank or malformed rows.
    if (empty($line[0])) {
      continue;
    }
    $meta = [];
    // Strip the scheme and trailing slash, then split the path into segments.
    $path = str_replace('http://', '', $line[0]);
    $path = str_replace(array("\r\n", "\n"), "", $path);
    $path = rtrim($path, '/');
    $dir_names = explode('/', $path);
    $meta['__path'] = $line[0];
    $meta['__title'] = isset($line[1]) ? $line[1] : '';
    $data[] = array($dir_names, $meta);
  }
  fclose($file);
  return $data;
}

/**
 * Build a nested tree of path segments using array_merge_recursive().
 */
function build_tree($paths) {
  $array = [];
  foreach ($paths as $path) {
    // Build each branch inside-out: start at the deepest segment and
    // wrap it in its parent directories.
    $reverse_dir_order = array_reverse($path[0]);
    $first = TRUE;
    foreach ($reverse_dir_order as $dir) {
      $temp = [];
      if ($first) {
        $temp[$dir]['__title'] = $path[1]['__title'];
        $temp[$dir]['__path'] = $path[1]['__path'];
        $first = FALSE;
      }
      else {
        $temp[$dir] = $prev;
      }
      $prev = $temp;
    }
    // Merge the finished branch into the accumulated tree.
    $array = array_merge_recursive($array, $temp);
  }
  return $array;
}

/**
 * Build the same nested tree in place, walking down with references.
 */
function build_tree2($path_list) {
  $path_tree = array();
  foreach ($path_list as $path_data) {
    // Walk (and create) the branch for this path, one segment at a time.
    $last_dir =& $path_tree;
    foreach ($path_data[0] as $dir) {
      if (!isset($last_dir[$dir])) {
        $last_dir[$dir] = NULL;
      }
      $last_dir =& $last_dir[$dir];
    }
    // Attach the page metadata at the leaf.
    $last_dir['__title'] = $path_data[1]['__title'];
    $last_dir['__path'] = $path_data[1]['__path'];
  }
  return $path_tree;
}
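
/*
 * Illustration (hypothetical URL): for a single input row
 *   array(array('www.example.com', 'site', 'about'),
 *         array('__path' => 'http://www.example.com/site/about/', '__title' => 'About Us'))
 * both build_tree() and build_tree2() yield the same branch:
 *   array('www.example.com' => array('site' => array('about' => array(
 *     '__title' => 'About Us',
 *     '__path' => 'http://www.example.com/site/about/',
 *   ))))
 */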

/**
 * Render the tree as nested HTML UL markup.
 */
function build_list($tree, $prefix = '') {
  $ul = '';
  foreach ($tree as $key => $value) {
    $li = '';
    if (is_array($value)) {
      // Link the entry when the node carries page metadata.
      if (array_key_exists('__title', $value)) {
        $li .= "$prefix$key/ <a href=\"http://$prefix$key/\">{$value['__title']}</a>";
      }
      else {
        $li .= "$prefix$key/";
      }
      $li .= build_list($value, "$prefix$key/");
      $ul .= strlen($li) ? "<li>$li</li>" : '';
    }
  }
  return strlen($ul) ? "<ul>$ul</ul>" : '';
}

/**
 * Recursively convert the tree into Freemind <node> elements.
 */
function array_to_xml($array, &$xml_node) {
  foreach ($array as $key => $value) {
    // '__title' and '__path' hold page metadata, not child pages.
    if (is_array($value) && $key !== '__title' && $key !== '__path') {
      $subnode = $xml_node->addChild("node");
      if (isset($value['__path'])) {
        $subnode->addAttribute('LINK', $value['__path']);
      }
      if (isset($value['__title'])) {
        $subnode->addAttribute('TEXT', $value['__title']);
      }
      array_to_xml($value, $subnode);
    }
  }
}
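
For the two-row wlog.csv sketched above, the UL markup echoed by build_list() and the sitemap.mm written by array_to_xml() would look roughly like this (whitespace added for readability; hypothetical URLs):

<ul><li>www.example.com/
  <ul><li>www.example.com/site/ <a href="http://www.example.com/site/">Home</a>
    <ul><li>www.example.com/site/about/ <a href="http://www.example.com/site/about/">About Us</a></li></ul>
  </li></ul>
</li></ul>

<map version="1.0.1">
  <node>
    <node LINK="http://www.example.com/site/" TEXT="Home">
      <node LINK="http://www.example.com/site/about/" TEXT="About Us"/>
    </node>
  </node>
</map>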