PseudoSky/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Setup

You're going to need

lodash

NMAP Script List

Load up lodash.js into the chrome inspector (or own html/js files)
Load nmap-scripts.json data into a variable named nmap
Load contextualizer.js

Step 1

I added some interesting functions to the module to grab the text and resources (js) out of whatever website you're on (tested in chrome) so that you can analyze what the site is about and what they used to buid it.
Example:Playing around in the chrome inspector.
//Generates the tree
var tf = analyzePage();

// Grabs the moste frequent terms from the html
var ten_most_frequent = tf.getSortedTerms('html',10);
console.log(ten_most_frequent);
/*
> [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object]

> [{
    count:8,
    freq:0.07272727272727272,
    key:"creat"
  }, ... 
  ]
*/
Step 2

The module can be used to analyze collections of objects for information, from which we can search and score queries against the documents
Example:Analyze and query the nmap script data.
/*
 'name' is the attribute in each doc the nmap collection that uniquely ids the record
 'info' is the attribute we want to extract the text from
     -> it is in an array because at times we may want to analyze across multiple attrs
*/
var tf=analyzeCollection(nmap,'name',['info'])

// Should look like this
/*
{
  BM25: {
    averageDocumentLength: 12.866419294990724,
    b: 0.75,
    documents: Object,
    k1: 1.3,
    terms: Object,
    totalDocumentTermLength: 6935,
    totalDocuments: 539
  }
}
*/

// Search for useful scripts
tf.search('passwords database',5)
/*
> ["password", "databas"]
> Array[5]
    0: {
        _score: 2.519205992456092
        body: "Performs brute force password auditing against the MongoDB database."
        id: "mongodb-brute"
        termCount: 7
        terms: tokens: Array[7]
      },
    1: Object,
    2: Object,
    3: Object,
    4: Object,
    length: 5
*/

  
## contextualizer.js
/**
 * Okapi BM25 Implementation
 * Burak Kanber, 2015
 *
 * License: you may use this for educational purposes only.
 */

/**
 * Porter stemmer
 *
 * I took the porter stemmer from here and minified it: http://tartarus.org/~martin/PorterStemmer/js.txt
 *
 * Original comments on the porter stemmer source:

// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
// paper, in
//
//  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
//  no. 3, pp 130-137,
//
// see also http://www.tartarus.org/~martin/PorterStemmer

// Release 1 be 'andargor', Jul 2004
// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
*/

var stemmer=function(){var e={ational:"ate",tional:"tion",enci:"ence",anci:"ance",izer:"ize",bli:"ble",alli:"al",entli:"ent",eli:"e",ousli:"ous",ization:"ize",ation:"ate",ator:"ate",alism:"al",iveness:"ive",fulness:"ful",ousness:"ous",aliti:"al",iviti:"ive",biliti:"ble",logi:"log"},i={icate:"ic",ative:"",alize:"al",iciti:"ic",ical:"ic",ful:"",ness:""},t="[^aeiou]",s="[aeiouy]",a=t+"[^aeiouy]*",l=s+"[aeiou]*",n="^("+a+")?"+l+a,o="^("+a+")?"+l+a+"("+l+")?$",c="^("+a+")?"+l+a+l+a,r="^("+a+")?"+s;return function(t){var l,u,x,$,p,v,g;if(t.length<3)return t;if(x=t.substr(0,1),"y"==x&&(t=x.toUpperCase()+t.substr(1)),$=/^(.+?)(ss|i)es$/,p=/^(.+?)([^s])s$/,$.test(t)?t=t.replace($,"$1$2"):p.test(t)&&(t=t.replace(p,"$1$2")),$=/^(.+?)eed$/,p=/^(.+?)(ed|ing)$/,$.test(t)){var f=$.exec(t);$=new RegExp(n),$.test(f[1])&&($=/.$/,t=t.replace($,""))}else if(p.test(t)){var f=p.exec(t);l=f[1],p=new RegExp(r),p.test(l)&&(t=l,p=/(at|bl|iz)$/,v=new RegExp("([^aeiouylsz])\\1$"),g=new RegExp("^"+a+s+"[^aeiouwxy]$"),p.test(t)?t+="e":v.test(t)?($=/.$/,t=t.replace($,"")):g.test(t)&&(t+="e"))}if($=/^(.+?)y$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(r),$.test(l)&&(t=l+"i")}if($=/^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+e[u])}if($=/^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+i[u])}if($=/^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/,p=/^(.+?)(s|t)(ion)$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),$.test(l)&&(t=l)}else if(p.test(t)){var f=p.exec(t);l=f[1]+f[2],p=new RegExp(c),p.test(l)&&(t=l)}if($=/^(.+?)e$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),p=new RegExp(o),v=new RegExp("^"+a+s+"[^aeiouwxy]$"),($.test(l)||p.test(l)&&!v.test(l))&&(t=l)}return $=/ll$/,p=new RegExp(c),$.test(t)&&p.test(t)&&($=/.$/,t=t.replace($,"")),"y"==x&&(t=x.toLowerCase()+t.substr(1)),t}}();

/**
 * Stopwords list taken from http://www.ranks.nl/stopwords
 */

var stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"];

var stopStems = stopwords.map(stemmer);

function BM25() {
	this.documents = {};
	this.terms = {};
	this.totalDocumentTermLength = 0; // For average doc length.
	this.averageDocumentLength = 0;
	this.totalDocuments = 0;
	this.k1 = 1.3;
	this.b = 0.75;
};

// Static methods.

BM25.Tokenize = function(text) {
	text = _.clean(text)
		    .split(' ')
		    .map(function(a) { return stemmer(a); })
    console.log(text);
    window.text=text;
	// Filter out stopStems
	var out = [];
	for (var i = 0, len = text.length; i < len; i++) {
		if (stopStems.indexOf(text[i]) === -1) {
			out.push(text[i]);
		}
	}

	return out;
};

// Instance methods.

/**
  * @param doc Object Expects this parameter to have an id and body properties.
  */

BM25.prototype.addDocument = function(doc, batch) {
	if (typeof doc.id === 'undefined') { throw new Error(1000, 'ID is a required property of documents.'); };
	if (typeof doc.body === 'undefined') { throw new Error(1001, 'Body is a required property of documents.'); };

	// Raw tokenized list of words
	var tokens = BM25.Tokenize(doc.body).filter( a => a.length>0);

	// Will hold unique terms and their counts and frequencies
	var _terms = {};

	// docObj will eventually be added to the documents database
	var docObj = {id: doc.id, tokens: tokens, body: doc.body};

	// Count number of terms
	docObj.termCount = tokens.length;

	// Increment totalDocuments
	this.totalDocuments++;

	// Readjust averageDocumentLength
	this.totalDocumentTermLength += docObj.termCount;
	this.averageDocumentLength = this.totalDocumentTermLength / this.totalDocuments;

	// Calculate term frequency
	// First get terms count
	for (var i = 0, len = tokens.length; i < len; i++) {
		var term = tokens[i];
		if (!_terms[term]) {
			_terms[term] = {
				count: 0,
				freq: 0
			};
		};
		_terms[term].count++;
	}

	// Then re-loop to calculate term frequency.
	// We'll also update inverse document frequencies here.
	var keys = Object.keys(_terms);
	for (var i = 0, len = keys.length; i < len; i++) {
		var term = keys[i];
		// Term Frequency for this document.
		_terms[term].freq = _terms[term].count / docObj.termCount;

		// Inverse Document Frequency initialization
		if (!this.terms[term]) {
			this.terms[term] = {
				n: 0, // Number of docs this term appears in, uniquely
				idf: 0,
				docs: {}
			};
		}

		this.terms[term].n++;
		this.terms[term].docs[doc.id]=1
	};

	// Calculate inverse document frequencies
	!batch && this.updateIdf();

	// Add docObj to docs db
	docObj.terms = _terms;
	this.documents[docObj.id] = docObj;
};

BM25.prototype.updateIdf = function() {
	var keys = Object.keys(this.terms);
	for (var i = 0, len = keys.length; i < len; i++) {
		var term = keys[i];
		var num = (this.totalDocuments - this.terms[term].n + 0.5);
		var denom = (this.terms[term].n + 0.5);
		this.terms[term].idf = Math.max(Math.log10(num / denom), 0.01);
	}
};

BM25.prototype.getSortedTerms=function(doc,size){

    if(doc){
        if(!this.documents[doc]) return "That document doesn't exist.";
        size=size||_.keys(this.documents[doc].terms).length
        return _(this.documents[doc].terms).map((v,k) => ({key:k, freq:v.freq, count:v.count})).sortBy('count').takeRight(size).value().reverse()
    }else{
        size=size||_.keys(this.terms).length
        return _(this.terms).map((v,k) => ({key:k, idf:v.idf, n:v.n})).sortBy('n').takeRight(size).value().reverse()
    }
}

BM25.prototype.search = function(query, limit) {

	var queryTerms = BM25.Tokenize(query);
	var results = [];

	// Look at each document in turn. There are better ways to do this with inverted indices.
	var keys = Object.keys(this.documents);
	for (var j = 0, nDocs = keys.length; j < nDocs; j++) {
		var id = keys[j];
		// The relevance score for a document is the sum of a tf-idf-like
		// calculation for each query term.
		this.documents[id]._score = 0;

		// Calculate the score for each query term
		for (var i = 0, len = queryTerms.length; i < len; i++) {
			var queryTerm = queryTerms[i];

			// We've never seen this term before so IDF will be 0.
			// Means we can skip the whole term, it adds nothing to the score
			// and isn't in any document.
			if (typeof this.terms[queryTerm] === 'undefined') {
				continue;
			}

			// This term isn't in the document, so the TF portion is 0 and this
			// term contributes nothing to the search score.
			if (typeof this.documents[id].terms[queryTerm] === 'undefined') {
				continue;
			}

			// The term is in the document, let's go.
			// The whole term is :
			// IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength))

			// IDF is pre-calculated for the whole docset.
			var idf = this.terms[queryTerm].idf;
			// Numerator of the TF portion.
			var num = this.documents[id].terms[queryTerm].count * (this.k1 + 1);
			// Denomerator of the TF portion.
			var denom = this.documents[id].terms[queryTerm].count
				+ (this.k1 * (1 - this.b + (this.b * this.documents[id].termCount / this.averageDocumentLength)));

			// Add this query term to the score
			this.documents[id]._score += idf * num / denom;
		}

		if (!isNaN(this.documents[id]._score) && this.documents[id]._score > 0) {
			results.push(this.documents[id]);
		}
	}

	results.sort(function(a, b) { return b._score - a._score; });
	return results.slice(0, limit || 10);
};

var bm = new BM25;

onmessage = function(e) {
	var payload = e.data;
	switch (payload.type) {

		case 'index-batch':
			//payload.data = payload.data.slice(0, 1000);
			var len = payload.data.length;
			for (var i in payload.data) {
				bm.addDocument({id: i, body: payload.data[i]});
				if (i % 100 === 0) {
					postMessage({type:"index-update", data: i/len});
				}
			}
			bm.updateIdf();
			postMessage({type:"index-update", data: 1});
			break;

		case 'search':
			postMessage({type: "search-results", data: bm.search(payload.data)});
			break;

	}
}
function analyzeCollection(arr, id_attr, attrs){
	var tfidf = new BM25();
	if(!attrs) attrs=Object.keys(arr[0]);

    // 	For each element add each of the attributes to a doc
	_.map( arr, (d,i) =>{
		// With the elements id_attr as the id
		_.each(attrs, attr=> d[attr] ? tfidf.addDocument({id: (d[id_attr] ? d[id_attr] : i), body: d[attr]}, true) : -1 );
	})

    tfidf.updateIdf();
	return tfidf;

}


function getScripts(){
  js=_.map(document.scripts, d => ({id: (d.src || _.uniqueId()), body: d.innerText}));
  var tfidf=new BM25();
  _.each(js, d => tfidf.addDocument(d, true) );
  tfidf.updateIdf();
  console.log(tfidf);
  console.log(tfidf.terms);
};

var repl={
    html: ['/&lt;/gi','/&gt;/gi','/&nbsp;/gi'],
    js_methods: ['console.log('],
    splits:[ /"/g, /,/g, /\w\.\w/gi, /\s.{1,3}\s/gi, /\W/g, /\s+/g, /\s[0-9]*\s/g, /\s[0-9]*px\s/g]
}

var filters = ["nbsp","lt","gt","init","Array","Date","eval","function","hasOwnProperty","Infinity","isFinite","isNaN","isPrototypeOf","length","Math","NaN","name","Number","Object","prototype","String","toString","undefined","valueOf","abstract", "arguments", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "debugger", "default", "delete", "do", "double", "else", "enum", "eval", "export", "extends*", "false", "final", "finally", "float", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "let", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "super", "switch", "synchronized", "this", "throw", "throws", "transient", "true", "try", "typeof", "var", "void", "volatile", "while", "with", "yield"];
function replaceWith(replace_regex, replace_val){
    return _.partial(_.curryReplace,replace_regex, replace_val);
}
_.curryReplace = function (rgxWhat, rplWith, text) {
  return String(text).replace(rgxWhat, rplWith);
};

_.mixin({
    replaceFlow: (v) => _.partial(_.replaceWith,_,v),
    replaceWith: (replace_regex, replace_val) =>{
       return _.partial(_.curryReplace,replace_regex, replace_val);
    },
    curryReplace: (rgxWhat, rplWith, text) =>{
        return String(text).replace(rgxWhat, rplWith);
    },
    isntEmpty:_.negate(_.isEmpty),
    clean: function(s){
        s = _.toLower(s);
        var replacements = _.flatten(_.values(repl));
        replacements = _.map(replacements,_.replaceFlow(' '));
        var res = _.flow(replacements)(s)
                   .replace(/\s.{1,3}\s/gi, " ")
                   .replace(/\s.{1,3}\s/gi, " ")
                   .replace(/\s.{1,3}\s/gi, " ");
        res=_.pullAll(res.split(' '),filters).join(' ');
        return res;
    }
})

function analyzePage(){
    var tfidf = new BM25();

    x=document.body.innerHTML.replace(/(<([^>]+)>)/ig,"")
    x=_.flow(_.replaceFlow(' ')(repl.html))(x)
                   .replace(/[\(\){}[]"'`~]*/g,' ')
                   .replace(/\s+/g,' ')
                   .replace(/\s.{1,3}\s/gi, " ")
                   .replace(/\s.{1,3}\s/gi, " ")
                   .replace(/\s.{1,3}\s/gi, " ")


    tfidf.addDocument({id: "html",body:_.clean(x)}, true);
    tfidf.updateIdf();
    return tfidf
}
	/**
	* Okapi BM25 Implementation
	* Burak Kanber, 2015
	*
	* License: you may use this for educational purposes only.
	*/

	/**
	* Porter stemmer
	*
	* I took the porter stemmer from here and minified it: http://tartarus.org/~martin/PorterStemmer/js.txt
	*
	* Original comments on the porter stemmer source:

	// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
	// paper, in
	//
	// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
	// no. 3, pp 130-137,
	//
	// see also http://www.tartarus.org/~martin/PorterStemmer

	// Release 1 be 'andargor', Jul 2004
	// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
	*/

	var stemmer=function(){var e={ational:"ate",tional:"tion",enci:"ence",anci:"ance",izer:"ize",bli:"ble",alli:"al",entli:"ent",eli:"e",ousli:"ous",ization:"ize",ation:"ate",ator:"ate",alism:"al",iveness:"ive",fulness:"ful",ousness:"ous",aliti:"al",iviti:"ive",biliti:"ble",logi:"log"},i={icate:"ic",ative:"",alize:"al",iciti:"ic",ical:"ic",ful:"",ness:""},t="[^aeiou]",s="[aeiouy]",a=t+"[^aeiouy]",l=s+"[aeiou]",n="^("+a+")?"+l+a,o="^("+a+")?"+l+a+"("+l+")?$",c="^("+a+")?"+l+a+l+a,r="^("+a+")?"+s;return function(t){var l,u,x,$,p,v,g;if(t.length<3)return t;if(x=t.substr(0,1),"y"==x&&(t=x.toUpperCase()+t.substr(1)),$=/^(.+?)(ss\|i)es$/,p=/^(.+?)([^s])s$/,$.test(t)?t=t.replace($,"$1$2"):p.test(t)&&(t=t.replace(p,"$1$2")),$=/^(.+?)eed$/,p=/^(.+?)(ed\|ing)$/,$.test(t)){var f=$.exec(t);$=new RegExp(n),$.test(f[1])&&($=/.$/,t=t.replace($,""))}else if(p.test(t)){var f=p.exec(t);l=f[1],p=new RegExp(r),p.test(l)&&(t=l,p=/(at\|bl\|iz)$/,v=new RegExp("([^aeiouylsz])\\1$"),g=new RegExp("^"+a+s+"[^aeiouwxy]$"),p.test(t)?t+="e":v.test(t)?($=/.$/,t=t.replace($,"")):g.test(t)&&(t+="e"))}if($=/^(.+?)y$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(r),$.test(l)&&(t=l+"i")}if($=/^(.+?)(ational\|tional\|enci\|anci\|izer\|bli\|alli\|entli\|eli\|ousli\|ization\|ation\|ator\|alism\|iveness\|fulness\|ousness\|aliti\|iviti\|biliti\|logi)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+e[u])}if($=/^(.+?)(icate\|ative\|alize\|iciti\|ical\|ful\|ness)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+i[u])}if($=/^(.+?)(al\|ance\|ence\|er\|ic\|able\|ible\|ant\|ement\|ment\|ent\|ou\|ism\|ate\|iti\|ous\|ive\|ize)$/,p=/^(.+?)(s\|t)(ion)$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),$.test(l)&&(t=l)}else if(p.test(t)){var f=p.exec(t);l=f[1]+f[2],p=new RegExp(c),p.test(l)&&(t=l)}if($=/^(.+?)e$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),p=new RegExp(o),v=new RegExp("^"+a+s+"[^aeiouwxy]$"),($.test(l)\|\|p.test(l)&&!v.test(l))&&(t=l)}return $=/ll$/,p=new RegExp(c),$.test(t)&&p.test(t)&&($=/.$/,t=t.replace($,"")),"y"==x&&(t=x.toLowerCase()+t.substr(1)),t}}();

	/**
	* Stopwords list taken from http://www.ranks.nl/stopwords
	*/

	var stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"];

	var stopStems = stopwords.map(stemmer);

	function BM25() {
	this.documents = {};
	this.terms = {};
	this.totalDocumentTermLength = 0; // For average doc length.
	this.averageDocumentLength = 0;
	this.totalDocuments = 0;
	this.k1 = 1.3;
	this.b = 0.75;
	};

	// Static methods.

	BM25.Tokenize = function(text) {
	text = _.clean(text)
	.split(' ')
	.map(function(a) { return stemmer(a); })
	console.log(text);
	window.text=text;
	// Filter out stopStems
	var out = [];
	for (var i = 0, len = text.length; i < len; i++) {
	if (stopStems.indexOf(text[i]) === -1) {
	out.push(text[i]);
	}
	}

	return out;
	};

	// Instance methods.

	/**
	* @param doc Object Expects this parameter to have an id and body properties.
	*/

	BM25.prototype.addDocument = function(doc, batch) {
	if (typeof doc.id === 'undefined') { throw new Error(1000, 'ID is a required property of documents.'); };
	if (typeof doc.body === 'undefined') { throw new Error(1001, 'Body is a required property of documents.'); };

	// Raw tokenized list of words
	var tokens = BM25.Tokenize(doc.body).filter( a => a.length>0);

	// Will hold unique terms and their counts and frequencies
	var _terms = {};

	// docObj will eventually be added to the documents database
	var docObj = {id: doc.id, tokens: tokens, body: doc.body};

	// Count number of terms
	docObj.termCount = tokens.length;

	// Increment totalDocuments
	this.totalDocuments++;

	// Readjust averageDocumentLength
	this.totalDocumentTermLength += docObj.termCount;
	this.averageDocumentLength = this.totalDocumentTermLength / this.totalDocuments;

	// Calculate term frequency
	// First get terms count
	for (var i = 0, len = tokens.length; i < len; i++) {
	var term = tokens[i];
	if (!_terms[term]) {
	_terms[term] = {
	count: 0,
	freq: 0
	};
	};
	_terms[term].count++;
	}

	// Then re-loop to calculate term frequency.
	// We'll also update inverse document frequencies here.
	var keys = Object.keys(_terms);
	for (var i = 0, len = keys.length; i < len; i++) {
	var term = keys[i];
	// Term Frequency for this document.
	_terms[term].freq = _terms[term].count / docObj.termCount;

	// Inverse Document Frequency initialization
	if (!this.terms[term]) {
	this.terms[term] = {
	n: 0, // Number of docs this term appears in, uniquely
	idf: 0,
	docs: {}
	};
	}

	this.terms[term].n++;
	this.terms[term].docs[doc.id]=1
	};

	// Calculate inverse document frequencies
	!batch && this.updateIdf();

	// Add docObj to docs db
	docObj.terms = _terms;
	this.documents[docObj.id] = docObj;
	};

	BM25.prototype.updateIdf = function() {
	var keys = Object.keys(this.terms);
	for (var i = 0, len = keys.length; i < len; i++) {
	var term = keys[i];
	var num = (this.totalDocuments - this.terms[term].n + 0.5);
	var denom = (this.terms[term].n + 0.5);
	this.terms[term].idf = Math.max(Math.log10(num / denom), 0.01);
	}
	};

	BM25.prototype.getSortedTerms=function(doc,size){

	if(doc){
	if(!this.documents[doc]) return "That document doesn't exist.";
	size=size\|\|_.keys(this.documents[doc].terms).length
	return _(this.documents[doc].terms).map((v,k) => ({key:k, freq:v.freq, count:v.count})).sortBy('count').takeRight(size).value().reverse()
	}else{
	size=size\|\|_.keys(this.terms).length
	return _(this.terms).map((v,k) => ({key:k, idf:v.idf, n:v.n})).sortBy('n').takeRight(size).value().reverse()
	}
	}

	BM25.prototype.search = function(query, limit) {

	var queryTerms = BM25.Tokenize(query);
	var results = [];

	// Look at each document in turn. There are better ways to do this with inverted indices.
	var keys = Object.keys(this.documents);
	for (var j = 0, nDocs = keys.length; j < nDocs; j++) {
	var id = keys[j];
	// The relevance score for a document is the sum of a tf-idf-like
	// calculation for each query term.
	this.documents[id]._score = 0;

	// Calculate the score for each query term
	for (var i = 0, len = queryTerms.length; i < len; i++) {
	var queryTerm = queryTerms[i];

	// We've never seen this term before so IDF will be 0.
	// Means we can skip the whole term, it adds nothing to the score
	// and isn't in any document.
	if (typeof this.terms[queryTerm] === 'undefined') {
	continue;
	}

	// This term isn't in the document, so the TF portion is 0 and this
	// term contributes nothing to the search score.
	if (typeof this.documents[id].terms[queryTerm] === 'undefined') {
	continue;
	}

	// The term is in the document, let's go.
	// The whole term is :
	// IDF * (TF * (k1 + 1)) / (TF + k1 * (1 - b + b * docLength / avgDocLength))

	// IDF is pre-calculated for the whole docset.
	var idf = this.terms[queryTerm].idf;
	// Numerator of the TF portion.
	var num = this.documents[id].terms[queryTerm].count * (this.k1 + 1);
	// Denomerator of the TF portion.
	var denom = this.documents[id].terms[queryTerm].count
	+ (this.k1 * (1 - this.b + (this.b * this.documents[id].termCount / this.averageDocumentLength)));

	// Add this query term to the score
	this.documents[id]._score += idf * num / denom;
	}

	if (!isNaN(this.documents[id]._score) && this.documents[id]._score > 0) {
	results.push(this.documents[id]);
	}
	}

	results.sort(function(a, b) { return b._score - a._score; });
	return results.slice(0, limit \|\| 10);
	};

	var bm = new BM25;

	onmessage = function(e) {
	var payload = e.data;
	switch (payload.type) {

	case 'index-batch':
	//payload.data = payload.data.slice(0, 1000);
	var len = payload.data.length;
	for (var i in payload.data) {
	bm.addDocument({id: i, body: payload.data[i]});
	if (i % 100 === 0) {
	postMessage({type:"index-update", data: i/len});
	}
	}
	bm.updateIdf();
	postMessage({type:"index-update", data: 1});
	break;

	case 'search':
	postMessage({type: "search-results", data: bm.search(payload.data)});
	break;

	}
	}
	function analyzeCollection(arr, id_attr, attrs){
	var tfidf = new BM25();
	if(!attrs) attrs=Object.keys(arr[0]);

	// For each element add each of the attributes to a doc
	_.map( arr, (d,i) =>{
	// With the elements id_attr as the id
	_.each(attrs, attr=> d[attr] ? tfidf.addDocument({id: (d[id_attr] ? d[id_attr] : i), body: d[attr]}, true) : -1 );
	})

	tfidf.updateIdf();
	return tfidf;

	}


	function getScripts(){
	js=_.map(document.scripts, d => ({id: (d.src \|\| _.uniqueId()), body: d.innerText}));
	var tfidf=new BM25();
	_.each(js, d => tfidf.addDocument(d, true) );
	tfidf.updateIdf();
	console.log(tfidf);
	console.log(tfidf.terms);
	};

	var repl={
	html: ['/</gi','/>/gi','/ /gi'],
	js_methods: ['console.log('],
	splits:[ /"/g, /,/g, /\w\.\w/gi, /\s.{1,3}\s/gi, /\W/g, /\s+/g, /\s[0-9]\s/g, /\s[0-9]px\s/g]
	}

	var filters = ["nbsp","lt","gt","init","Array","Date","eval","function","hasOwnProperty","Infinity","isFinite","isNaN","isPrototypeOf","length","Math","NaN","name","Number","Object","prototype","String","toString","undefined","valueOf","abstract", "arguments", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "debugger", "default", "delete", "do", "double", "else", "enum", "eval", "export", "extends*", "false", "final", "finally", "float", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "let", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "super", "switch", "synchronized", "this", "throw", "throws", "transient", "true", "try", "typeof", "var", "void", "volatile", "while", "with", "yield"];
	function replaceWith(replace_regex, replace_val){
	return _.partial(_.curryReplace,replace_regex, replace_val);
	}
	_.curryReplace = function (rgxWhat, rplWith, text) {
	return String(text).replace(rgxWhat, rplWith);
	};

	_.mixin({
	replaceFlow: (v) => _.partial(_.replaceWith,_,v),
	replaceWith: (replace_regex, replace_val) =>{
	return _.partial(_.curryReplace,replace_regex, replace_val);
	},
	curryReplace: (rgxWhat, rplWith, text) =>{
	return String(text).replace(rgxWhat, rplWith);
	},
	isntEmpty:_.negate(_.isEmpty),
	clean: function(s){
	s = _.toLower(s);
	var replacements = _.flatten(_.values(repl));
	replacements = _.map(replacements,_.replaceFlow(' '));
	var res = _.flow(replacements)(s)
	.replace(/\s.{1,3}\s/gi, " ")
	.replace(/\s.{1,3}\s/gi, " ")
	.replace(/\s.{1,3}\s/gi, " ");
	res=_.pullAll(res.split(' '),filters).join(' ');
	return res;
	}
	})

	function analyzePage(){
	var tfidf = new BM25();

	x=document.body.innerHTML.replace(/(<([^>]+)>)/ig,"")
	x=_.flow(_.replaceFlow(' ')(repl.html))(x)
	.replace(/[\(\){}[]"'`~]*/g,' ')
	.replace(/\s+/g,' ')
	.replace(/\s.{1,3}\s/gi, " ")
	.replace(/\s.{1,3}\s/gi, " ")
	.replace(/\s.{1,3}\s/gi, " ")


	tfidf.addDocument({id: "html",body:_.clean(x)}, true);
	tfidf.updateIdf();
	return tfidf
	}