Olical/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Overview

This script reads a list of file URLs from a text file, removes any duplicates and then loops through the remaining URLs.
It checks each file's headers to see if it has an HTTP status code of 200.
If it is a redirect (301) or not found (404) then it will not add it to the sitemap.
So all good files are added into sitemap.xml at the end.
It was built to take the output from site crawlers and build a sitemap out of what they find. The sitemaps those crawlers generate do not take redirects into account; this script does.
Running

To run it you will need NodeJS installed. If you need some help with that, have a Google.
Once installed you just need to run it like so.
node extract.js files.txt

Where files.txt is your file containing your list of files.
The list of files must resemble something like this.
http://www.example.com/index.html
http://www.example.com/contact.html
http://www.example.com/aboutus.html


## extract.js
// Initialise anything required
var http = require('http'),
	fs = require('fs'),
	list = null,
	i = null,
	options = {
		port: 80
	},
	// Capture groups: $1 is the host, $2 an optional ":port", $3 the path plus any query string
	extract = new RegExp('^https?://([-\\w\\.]+)+(:\\d+)?(/([\\w/_\\.]*(\\?\\S+)?)?)?', 'g'),
	// Opening of the sitemap document; <url> entries and the closing tag are appended later
	clean = '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">',
	path = null,
	done = 0;

// Fail with a readable message instead of a TypeError from readFileSync
if(!process.argv[2]) {
	console.error('Usage: node extract.js files.txt');
	process.exit(1);
}

// Grab the data
// Split on any newline convention as a whole (so "\r\n" does not produce an
// empty entry) and drop blank lines, which would otherwise be requested as "/"
list = fs.readFileSync(process.argv[2], 'utf8')
	.split(/\r\n|\r|\n/)
	.map(function(line) {
		return line.trim();
	})
	.filter(function(line) {
		return line !== '';
	});

/**
 * Builds a copy of `target` without repeated values.
 *
 * Values are compared with `===`. When a value occurs more than once only its
 * last occurrence is kept, so the result follows the order of last occurrences.
 *
 * @param {Array} target The list to de-duplicate; it is not modified.
 * @param {*} extract Accepted for the callers' sake but not used.
 * @returns {Array} A new array holding each distinct value once.
 */
function unique(target, extract) {
	var kept = [];
	var total = target.length;
	var index = 0;
	var probe;
	var repeatedLater;

	while(index < total) {
		repeatedLater = false;

		// Look ahead for another copy of the current value
		for(probe = index + 1; probe < total; probe += 1) {
			if(target[probe] === target[index]) {
				repeatedLater = true;
				break;
			}
		}

		// Only the final copy of a value makes it into the result
		if(!repeatedLater) {
			kept.push(target[index]);
		}

		index += 1;
	}

	return kept;
}

// Unique it
list = unique(list, extract);

/**
 * Entity-escapes the characters that may not appear raw inside an XML
 * element, so a URL (for example one with "&" in its query string) can be
 * placed in a <loc> element.
 */
function escapeXml(text) {
	return text
		.replace(/&/g, '&amp;')
		.replace(/</g, '&lt;')
		.replace(/>/g, '&gt;')
		.replace(/"/g, '&quot;')
		.replace(/'/g, '&apos;');
}

/**
 * Splits one line of the list into its host and request path using `extract`.
 * Returns null when the line does not look like an http(s) URL.
 */
function splitUrl(url) {
	var match;

	// `extract` carries the global flag, which makes exec() stateful
	extract.lastIndex = 0;
	match = extract.exec(url);

	if(!match) {
		return null;
	}

	return {
		host: match[1],
		// Keep everything after scheme/host/port, including characters the
		// path group of the regex does not cover (a '-' for example)
		path: url.slice(match[0].length - (match[3] || '').length) || '/'
	};
}

/**
 * Records that one entry has been dealt with and writes sitemap.xml once every
 * entry is accounted for, whatever the outcome of the last one was.
 */
function finishOne() {
	done += 1;

	if(done === list.length) {
		// Set up the file to be written
		clean += '</urlset>';

		// Now write
		fs.writeFileSync('sitemap.xml', clean);
	}
}

/**
 * Requests a single path on the shared host and adds it to the sitemap when
 * the server answers with a 200. Any other status, or a failed request, is
 * logged as rejected.
 */
function checkPath(requestPath) {
	// Captured per request so concurrent responses cannot mix up their URLs
	var location = 'http://' + options.host + requestPath;

	http.get({
		host: options.host,
		port: options.port,
		path: requestPath
	}, function(res) {
		// Only the status matters; drain the body so the socket is released
		res.resume();

		if(res.statusCode === 200) {
			console.log('Adding ' + location);
			clean += '<url><loc>' + escapeXml(location) + '</loc></url>';
		}
		else {
			console.log('Rejecting (' + res.statusCode + ') ' + location);
		}

		finishOne();
	}).on('error', function(err) {
		// Without this handler a single failed request would crash the run
		console.log('Rejecting (' + err.message + ') ' + location);
		finishOne();
	});
}

// Grab the host (every entry is requested from the host of the first one)
var first = list.length ? splitUrl(list[0]) : null;

if(!first) {
	console.error('The first entry in the list is not an http(s) URL, so no host could be determined.');
	process.exit(1);
}

options.host = first.host;

// Loop through it
for(i = 0; i < list.length; i++) {
	var entry = splitUrl(list[i]);

	if(entry) {
		checkPath(entry.path);
	}
	else {
		// Still counts towards completion so the sitemap gets written
		console.log('Rejecting (invalid URL) ' + list[i]);
		finishOne();
	}
}
	// Initialise anything required
	var http = require('http'),
		fs = require('fs'),
		list = null,
		i = null,
		options = {
			port: 80
		},
		// Capture groups: $1 is the host, $2 an optional ":port", $3 the path plus any query string
		extract = new RegExp('^https?://([-\\w\\.]+)+(:\\d+)?(/([\\w/_\\.]*(\\?\\S+)?)?)?', 'g'),
		// Opening of the sitemap document; <url> entries and the closing tag are appended later
		clean = '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">',
		path = null,
		done = 0;

	// Fail with a readable message instead of a TypeError from readFileSync
	if(!process.argv[2]) {
		console.error('Usage: node extract.js files.txt');
		process.exit(1);
	}

	// Grab the data
	// Split on any newline convention as a whole (so "\r\n" does not produce an
	// empty entry) and drop blank lines, which would otherwise be requested as "/"
	list = fs.readFileSync(process.argv[2], 'utf8')
		.split(/\r\n|\r|\n/)
		.map(function(line) {
			return line.trim();
		})
		.filter(function(line) {
			return line !== '';
		});

	/**
	 * Builds a copy of `target` without repeated values.
	 *
	 * Values are compared with `===`. When a value occurs more than once only
	 * its last occurrence is kept, so the result follows the order of last
	 * occurrences.
	 *
	 * @param {Array} target The list to de-duplicate; it is not modified.
	 * @param {*} extract Accepted for the callers' sake but not used.
	 * @returns {Array} A new array holding each distinct value once.
	 */
	function unique(target, extract) {
		var kept = [];
		var total = target.length;
		var index = 0;
		var probe;
		var repeatedLater;

		while(index < total) {
			repeatedLater = false;

			// Look ahead for another copy of the current value
			for(probe = index + 1; probe < total; probe += 1) {
				if(target[probe] === target[index]) {
					repeatedLater = true;
					break;
				}
			}

			// Only the final copy of a value makes it into the result
			if(!repeatedLater) {
				kept.push(target[index]);
			}

			index += 1;
		}

		return kept;
	}

	// Unique it
	list = unique(list, extract);

	/**
	 * Entity-escapes the characters that may not appear raw inside an XML
	 * element, so a URL (for example one with "&" in its query string) can be
	 * placed in a <loc> element.
	 */
	function escapeXml(text) {
		return text
			.replace(/&/g, '&amp;')
			.replace(/</g, '&lt;')
			.replace(/>/g, '&gt;')
			.replace(/"/g, '&quot;')
			.replace(/'/g, '&apos;');
	}

	/**
	 * Splits one line of the list into its host and request path using
	 * `extract`. Returns null when the line does not look like an http(s) URL.
	 */
	function splitUrl(url) {
		var match;

		// `extract` carries the global flag, which makes exec() stateful
		extract.lastIndex = 0;
		match = extract.exec(url);

		if(!match) {
			return null;
		}

		return {
			host: match[1],
			// Keep everything after scheme/host/port, including characters the
			// path group of the regex does not cover (a '-' for example)
			path: url.slice(match[0].length - (match[3] || '').length) || '/'
		};
	}

	/**
	 * Records that one entry has been dealt with and writes sitemap.xml once
	 * every entry is accounted for, whatever the outcome of the last one was.
	 */
	function finishOne() {
		done += 1;

		if(done === list.length) {
			// Set up the file to be written
			clean += '</urlset>';

			// Now write
			fs.writeFileSync('sitemap.xml', clean);
		}
	}

	/**
	 * Requests a single path on the shared host and adds it to the sitemap
	 * when the server answers with a 200. Any other status, or a failed
	 * request, is logged as rejected.
	 */
	function checkPath(requestPath) {
		// Captured per request so concurrent responses cannot mix up their URLs
		var location = 'http://' + options.host + requestPath;

		http.get({
			host: options.host,
			port: options.port,
			path: requestPath
		}, function(res) {
			// Only the status matters; drain the body so the socket is released
			res.resume();

			if(res.statusCode === 200) {
				console.log('Adding ' + location);
				clean += '<url><loc>' + escapeXml(location) + '</loc></url>';
			}
			else {
				console.log('Rejecting (' + res.statusCode + ') ' + location);
			}

			finishOne();
		}).on('error', function(err) {
			// Without this handler a single failed request would crash the run
			console.log('Rejecting (' + err.message + ') ' + location);
			finishOne();
		});
	}

	// Grab the host (every entry is requested from the host of the first one)
	var first = list.length ? splitUrl(list[0]) : null;

	if(!first) {
		console.error('The first entry in the list is not an http(s) URL, so no host could be determined.');
		process.exit(1);
	}

	options.host = first.host;

	// Loop through it
	for(i = 0; i < list.length; i++) {
		var entry = splitUrl(list[i]);

		if(entry) {
			checkPath(entry.path);
		}
		else {
			// Still counts towards completion so the sitemap gets written
			console.log('Rejecting (invalid URL) ' + list[i]);
			finishOne();
		}
	}