Skip to content

Instantly share code, notes, and snippets.

@PseudoSky
Last active November 9, 2016 21:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PseudoSky/452bc9571de985424bba8454434ab3b2 to your computer and use it in GitHub Desktop.
Save PseudoSky/452bc9571de985424bba8454434ab3b2 to your computer and use it in GitHub Desktop.
TFIDF Analysis and Search Tools

Setup

You're going to need
lodash
NMAP Script List

  • Load up lodash.js into the chrome inspector (or own html/js files)
  • Load nmap-scripts.json data into a variable named nmap
  • Load contextualizer.js

Step 1

I added some interesting functions to the module to grab the text and resources (js) out of whatever website you're on (tested in chrome) so that you can analyze what the site is about and what they used to build it. Example: Playing around in the chrome inspector.

//Generates the tree
var tf = analyzePage();

// Grabs the most frequent terms from the html
var ten_most_frequent = tf.getSortedTerms('html',10);
console.log(ten_most_frequent);
/*
> [Object, Object, Object, Object, Object, Object, Object, Object, Object, Object]

> [{
    count:8,
    freq:0.07272727272727272,
    key:"creat"
  }, ... 
  ]
*/

Step 2

The module can be used to analyze collections of objects for information, from which we can search and score queries against the documents. Example: Analyze and query the nmap script data.

/*
 'name' is the attribute in each doc of the nmap collection that uniquely ids the record
 'info' is the attribute we want to extract the text from
     -> it is in an array because at times we may want to analyze across multiple attrs
*/
var tf=analyzeCollection(nmap,'name',['info'])

// Should look like this
/*
{
  BM25: {
    averageDocumentLength: 12.866419294990724,
    b: 0.75,
    documents: Object,
    k1: 1.3,
    terms: Object,
    totalDocumentTermLength: 6935,
    totalDocuments: 539
  }
}
*/

// Search for useful scripts
tf.search('passwords database',5)
/*
> ["password", "databas"]
> Array[5]
    0: {
        _score: 2.519205992456092
        body: "Performs brute force password auditing against the MongoDB database."
        id: "mongodb-brute"
        termCount: 7
        terms: tokens: Array[7]
      },
    1: Object,
    2: Object,
    3: Object,
    4: Object,
    length: 5
*/
/**
* Okapi BM25 Implementation
* Burak Kanber, 2015
*
* License: you may use this for educational purposes only.
*/
/**
* Porter stemmer
*
* I took the porter stemmer from here and minified it: http://tartarus.org/~martin/PorterStemmer/js.txt
*
* Original comments on the porter stemmer source:
// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
// paper, in
//
// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
// no. 3, pp 130-137,
//
// see also http://www.tartarus.org/~martin/PorterStemmer
// Release 1 be 'andargor', Jul 2004
// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009
*/
// Minified Porter stemmer (see attribution above). stemmer(word) -> stemmed word;
// words shorter than 3 chars are returned unchanged. Do not hand-edit — regenerate
// from the reference implementation at tartarus.org if changes are needed.
var stemmer=function(){var e={ational:"ate",tional:"tion",enci:"ence",anci:"ance",izer:"ize",bli:"ble",alli:"al",entli:"ent",eli:"e",ousli:"ous",ization:"ize",ation:"ate",ator:"ate",alism:"al",iveness:"ive",fulness:"ful",ousness:"ous",aliti:"al",iviti:"ive",biliti:"ble",logi:"log"},i={icate:"ic",ative:"",alize:"al",iciti:"ic",ical:"ic",ful:"",ness:""},t="[^aeiou]",s="[aeiouy]",a=t+"[^aeiouy]*",l=s+"[aeiou]*",n="^("+a+")?"+l+a,o="^("+a+")?"+l+a+"("+l+")?$",c="^("+a+")?"+l+a+l+a,r="^("+a+")?"+s;return function(t){var l,u,x,$,p,v,g;if(t.length<3)return t;if(x=t.substr(0,1),"y"==x&&(t=x.toUpperCase()+t.substr(1)),$=/^(.+?)(ss|i)es$/,p=/^(.+?)([^s])s$/,$.test(t)?t=t.replace($,"$1$2"):p.test(t)&&(t=t.replace(p,"$1$2")),$=/^(.+?)eed$/,p=/^(.+?)(ed|ing)$/,$.test(t)){var f=$.exec(t);$=new RegExp(n),$.test(f[1])&&($=/.$/,t=t.replace($,""))}else if(p.test(t)){var f=p.exec(t);l=f[1],p=new RegExp(r),p.test(l)&&(t=l,p=/(at|bl|iz)$/,v=new RegExp("([^aeiouylsz])\\1$"),g=new RegExp("^"+a+s+"[^aeiouwxy]$"),p.test(t)?t+="e":v.test(t)?($=/.$/,t=t.replace($,"")):g.test(t)&&(t+="e"))}if($=/^(.+?)y$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(r),$.test(l)&&(t=l+"i")}if($=/^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+e[u])}if($=/^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/,$.test(t)){var f=$.exec(t);l=f[1],u=f[2],$=new RegExp(n),$.test(l)&&(t=l+i[u])}if($=/^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/,p=/^(.+?)(s|t)(ion)$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),$.test(l)&&(t=l)}else if(p.test(t)){var f=p.exec(t);l=f[1]+f[2],p=new RegExp(c),p.test(l)&&(t=l)}if($=/^(.+?)e$/,$.test(t)){var f=$.exec(t);l=f[1],$=new RegExp(c),p=new RegExp(o),v=new RegExp("^"+a+s+"[^aeiouwxy]$"),($.test(l)||p.test(l)&&!v.test(l))&&(t=l)}return $=/ll$/,p=new 
RegExp(c),$.test(t)&&p.test(t)&&($=/.$/,t=t.replace($,"")),"y"==x&&(t=x.toLowerCase()+t.substr(1)),t}}();
/**
* Stopwords list taken from http://www.ranks.nl/stopwords
*/
var stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"];
// Pre-stem the stopwords so filtering happens in stemmed space:
// BM25.Tokenize stems each token first, then compares against stopStems.
var stopStems = stopwords.map(stemmer);
/**
 * Okapi BM25 index.
 * Holds the document store, per-term statistics, and the two BM25 tuning
 * constants (k1: term-frequency saturation, b: document-length normalization).
 */
function BM25() {
  this.documents = {};             // id -> {id, body, tokens, termCount, terms, _score}
  this.terms = {};                 // stem -> {n, idf, docs}
  this.totalDocumentTermLength = 0; // Sum of all docs' token counts; for average doc length.
  this.averageDocumentLength = 0;
  this.totalDocuments = 0;
  this.k1 = 1.3;
  this.b = 0.75;
}
// Static methods.
/**
 * Tokenize raw text: lowercase/clean via _.clean, split on spaces, stem each
 * token, and drop stopword stems.
 * @param text String of raw text (may contain HTML remnants; _.clean scrubs it).
 * @returns Array of stemmed, non-stopword tokens (may contain empty strings;
 *          callers filter those if needed).
 */
BM25.Tokenize = function(text) {
  // NOTE: the previous version logged every token list and assigned
  // window.text — debug leftovers that would throw inside a worker
  // (no `window`) and leak a global in the browser. Removed.
  var tokens = _.clean(text)
    .split(' ')
    .map(function(word) { return stemmer(word); });
  // Filter out stopword stems.
  var out = [];
  for (var i = 0, len = tokens.length; i < len; i++) {
    if (stopStems.indexOf(tokens[i]) === -1) {
      out.push(tokens[i]);
    }
  }
  return out;
};
// Instance methods.
/**
 * Add a document to the index.
 * @param doc   Object with required `id` and `body` (raw text) properties.
 * @param batch When truthy, skip the idf recomputation; the caller must call
 *              updateIdf() once after the last addDocument().
 * @throws Error when `id` or `body` is missing.
 */
BM25.prototype.addDocument = function(doc, batch) {
  // Error takes the message as its FIRST argument; the old code passed a
  // numeric code first, so callers saw "1000"/"1001" instead of the text.
  if (typeof doc.id === 'undefined') { throw new Error('ID is a required property of documents.'); }
  if (typeof doc.body === 'undefined') { throw new Error('Body is a required property of documents.'); }
  // Raw tokenized list of words (drop empty tokens left over by cleaning).
  var tokens = BM25.Tokenize(doc.body).filter(a => a.length > 0);
  // Will hold unique terms and their counts and frequencies.
  var _terms = {};
  // docObj will eventually be added to the documents database.
  var docObj = {id: doc.id, tokens: tokens, body: doc.body};
  // Count number of terms.
  docObj.termCount = tokens.length;
  // Increment totalDocuments.
  this.totalDocuments++;
  // Readjust averageDocumentLength.
  this.totalDocumentTermLength += docObj.termCount;
  this.averageDocumentLength = this.totalDocumentTermLength / this.totalDocuments;
  // Calculate term frequency: first pass counts each term.
  for (var i = 0, len = tokens.length; i < len; i++) {
    var term = tokens[i];
    if (!_terms[term]) {
      _terms[term] = {
        count: 0,
        freq: 0
      };
    }
    _terms[term].count++;
  }
  // Second pass (over unique terms) computes the per-document frequency and
  // updates the corpus-wide document counts used for idf.
  var keys = Object.keys(_terms);
  for (var i = 0, len = keys.length; i < len; i++) {
    var term = keys[i];
    // Term Frequency for this document.
    _terms[term].freq = _terms[term].count / docObj.termCount;
    // Inverse Document Frequency initialization.
    if (!this.terms[term]) {
      this.terms[term] = {
        n: 0, // Number of docs this term appears in, uniquely.
        idf: 0,
        docs: {}
      };
    }
    this.terms[term].n++;
    this.terms[term].docs[doc.id] = 1;
  }
  // Recalculate inverse document frequencies unless batching.
  if (!batch) this.updateIdf();
  // Add docObj to docs db.
  docObj.terms = _terms;
  this.documents[docObj.id] = docObj;
};
/**
 * Recompute the inverse document frequency for every known term.
 * Uses the BM25 idf formula log10((N - n + 0.5) / (n + 0.5)), floored at
 * 0.01 so very common terms still contribute a small positive score.
 */
BM25.prototype.updateIdf = function() {
  var self = this;
  Object.keys(this.terms).forEach(function(term) {
    var entry = self.terms[term];
    var ratio = (self.totalDocuments - entry.n + 0.5) / (entry.n + 0.5);
    entry.idf = Math.max(Math.log10(ratio), 0.01);
  });
};
/**
 * List terms ranked most-significant-first.
 * With a document id: that document's terms sorted by raw count (descending).
 * Without: corpus-wide terms sorted by document frequency n (descending).
 * @param doc  Optional document id.
 * @param size Optional number of terms to return (defaults to all).
 * @returns Array of {key, freq, count} or {key, idf, n}; an error STRING
 *          when the requested document does not exist (legacy behavior).
 */
BM25.prototype.getSortedTerms = function(doc, size) {
  if (doc) {
    if (!this.documents[doc]) return "That document doesn't exist.";
    var docTerms = this.documents[doc].terms;
    var take = size || _.keys(docTerms).length;
    return _(docTerms)
      .map((v, k) => ({key: k, freq: v.freq, count: v.count}))
      .sortBy('count')
      .takeRight(take)
      .value()
      .reverse();
  }
  var take = size || _.keys(this.terms).length;
  return _(this.terms)
    .map((v, k) => ({key: k, idf: v.idf, n: v.n}))
    .sortBy('n')
    .takeRight(take)
    .value()
    .reverse();
};
/**
 * Score every indexed document against a free-text query.
 * @param query Raw query string (tokenized/stemmed like documents).
 * @param limit Max results to return; defaults to 10 when falsy.
 * @returns Documents with a positive _score, best first. Note: _score is
 *          written onto the stored document objects as a side effect.
 */
BM25.prototype.search = function(query, limit) {
  var stems = BM25.Tokenize(query);
  var matches = [];
  // Linear scan over all documents; an inverted index would let us visit
  // only documents that share at least one query term.
  var ids = Object.keys(this.documents);
  for (var d = 0, nDocs = ids.length; d < nDocs; d++) {
    var doc = this.documents[ids[d]];
    // Relevance is the sum of a tf-idf-like contribution per query term.
    doc._score = 0;
    for (var q = 0, nTerms = stems.length; q < nTerms; q++) {
      var stem = stems[q];
      // Never-seen term: idf would be 0, contributes nothing anywhere.
      if (typeof this.terms[stem] === 'undefined') {
        continue;
      }
      // Term absent from this document: the TF portion is 0.
      if (typeof doc.terms[stem] === 'undefined') {
        continue;
      }
      // Per-term BM25 contribution:
      // idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * docLength / avgDocLength))
      // idf is pre-calculated for the whole docset by updateIdf().
      var idf = this.terms[stem].idf;
      var tf = doc.terms[stem].count;
      // Numerator of the TF portion.
      var num = tf * (this.k1 + 1);
      // Denominator of the TF portion (length-normalized).
      var denom = tf
        + (this.k1 * (1 - this.b + (this.b * doc.termCount / this.averageDocumentLength)));
      doc._score += idf * num / denom;
    }
    if (!isNaN(doc._score) && doc._score > 0) {
      matches.push(doc);
    }
  }
  matches.sort(function(a, b) { return b._score - a._score; });
  return matches.slice(0, limit || 10);
};
// --- Web-worker entry point: glue between the postMessage protocol and the index. ---
var bm = new BM25();
onmessage = function(e) {
  var payload = e.data;
  switch (payload.type) {
    case 'index-batch':
      var len = payload.data.length;
      // Use a numeric index (for...in yields STRING keys on arrays) and pass
      // batch=true so idf is recomputed once below instead of once per
      // document (the old per-document recomputation was O(docs * terms)).
      for (var i = 0; i < len; i++) {
        bm.addDocument({id: i, body: payload.data[i]}, true);
        if (i % 100 === 0) {
          postMessage({type: "index-update", data: i / len});
        }
      }
      bm.updateIdf();
      postMessage({type: "index-update", data: 1});
      break;
    case 'search':
      // No explicit limit: search() defaults to the top 10 results.
      postMessage({type: "search-results", data: bm.search(payload.data)});
      break;
  }
};
/**
 * Build a BM25 index over a collection of plain objects.
 * @param arr     Array of records to index (an empty/missing array yields an
 *                empty index instead of crashing on Object.keys(arr[0])).
 * @param id_attr Attribute uniquely identifying each record; falls back to
 *                the element's index when missing on a record.
 * @param attrs   Attributes to extract text from; defaults to every key of
 *                the first element.
 * @returns BM25 index ready for .search() / .getSortedTerms().
 */
function analyzeCollection(arr, id_attr, attrs) {
  var tfidf = new BM25();
  if (!arr || arr.length === 0) return tfidf; // nothing to index
  if (!attrs) attrs = Object.keys(arr[0]);
  // For each element add each of the attributes as a doc (use _.each, not
  // _.map — this is pure side effect, no transformed collection is needed).
  _.each(arr, function(d, i) {
    _.each(attrs, function(attr) {
      if (d[attr]) {
        // batch=true: defer the idf recomputation until all docs are added.
        tfidf.addDocument({id: (d[id_attr] ? d[id_attr] : i), body: d[attr]}, true);
      }
    });
  });
  tfidf.updateIdf();
  return tfidf;
}
/**
 * Index every <script> on the current page (src URL or a generated id as the
 * doc id, inline text as the body) and log the resulting BM25 index.
 * Browser-only: reads document.scripts.
 */
function getScripts() {
  // `var` added — the original assignment created an implicit global `js`.
  var js = _.map(document.scripts, d => ({id: (d.src || _.uniqueId()), body: d.innerText}));
  var tfidf = new BM25();
  _.each(js, d => tfidf.addDocument(d, true));
  tfidf.updateIdf();
  console.log(tfidf);
  console.log(tfidf.terms);
}
// Replacement patterns used by _.clean and analyzePage.
// NOTE: the html entries must be real RegExp literals. The previous values
// were STRINGS that merely looked like regex literals ('/&lt;/gi'), and
// String.replace treats a string pattern as literal text — so the HTML
// entities were never actually stripped.
var repl = {
  html: [/&lt;/gi, /&gt;/gi, /&nbsp;/gi],
  js_methods: ['console.log('], // literal-substring replacement (first occurrence only)
  splits: [/"/g, /,/g, /\w\.\w/gi, /\s.{1,3}\s/gi, /\W/g, /\s+/g, /\s[0-9]*\s/g, /\s[0-9]*px\s/g]
};
// Tokens removed outright from cleaned text: residual entity names plus
// JS keywords/builtins (so script analysis isn't dominated by the language).
// "extends*" is kept verbatim from the original list — likely a typo for
// "extends"; verify before relying on it.
var filters = ["nbsp","lt","gt","init","Array","Date","eval","function","hasOwnProperty","Infinity","isFinite","isNaN","isPrototypeOf","length","Math","NaN","name","Number","Object","prototype","String","toString","undefined","valueOf","abstract", "arguments", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "continue", "debugger", "default", "delete", "do", "double", "else", "enum", "eval", "export", "extends*", "false", "final", "finally", "float", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "let", "long", "native", "new", "null", "package", "private", "protected", "public", "return", "short", "static", "super", "switch", "synchronized", "this", "throw", "throws", "transient", "true", "try", "typeof", "var", "void", "volatile", "while", "with", "yield"];
/**
 * Build a unary replacer: replaceWith(rx, val) returns a function that takes
 * text and applies String.replace(rx, val) to it.
 * (Also installed as a lodash mixin elsewhere in this file; this standalone
 * pair is kept for direct use.)
 */
function replaceWith(replace_regex, replace_val) {
  return _.partial(_.curryReplace, replace_regex, replace_val);
}
/**
 * Three-argument replace with the text LAST, so pattern and replacement can
 * be partially applied up front.
 */
_.curryReplace = function (rgxWhat, rplWith, text) {
  var subject = String(text);
  return subject.replace(rgxWhat, rplWith);
};
// Lodash mixins used by the analysis helpers.
_.mixin({
// Fix the replacement VALUE first; the returned fn then takes the pattern.
// The bare `_` argument is lodash's partial-application placeholder.
replaceFlow: (v) => _.partial(_.replaceWith,_,v),
// replaceWith(rx, val) -> fn(text): text.replace(rx, val).
replaceWith: (replace_regex, replace_val) =>{
return _.partial(_.curryReplace,replace_regex, replace_val);
},
// Three-arg replace with text last, so pattern+replacement can be partialed.
curryReplace: (rgxWhat, rplWith, text) =>{
return String(text).replace(rgxWhat, rplWith);
},
isntEmpty:_.negate(_.isEmpty),
// Normalize raw text for tokenization: lowercase, run every pattern in
// `repl` (html entities, js noise, split patterns) as space-replacements,
// repeatedly drop tokens of <=3 chars, then remove `filters` tokens outright.
clean: function(s){
s = _.toLower(s);
var replacements = _.flatten(_.values(repl));
replacements = _.map(replacements,_.replaceFlow(' '));
// _.flow chains all replacers left-to-right over the string.
var res = _.flow(replacements)(s)
// Applied three times because each pass can create new short tokens.
.replace(/\s.{1,3}\s/gi, " ")
.replace(/\s.{1,3}\s/gi, " ")
.replace(/\s.{1,3}\s/gi, " ");
// NOTE(review): _.pullAll mutates the split array in place — fine here
// since the array is local.
res=_.pullAll(res.split(' '),filters).join(' ');
return res;
}
})
/**
 * Analyze the current page: strip tags, entities, and punctuation from
 * document.body.innerHTML, then index the remaining text as one document
 * with id "html".  Browser-only.
 * @returns BM25 index containing the single "html" document.
 */
function analyzePage() {
  var tfidf = new BM25();
  // `var` added — the original assignments created an implicit global `x`.
  // Drop HTML tags first.
  var text = document.body.innerHTML.replace(/(<([^>]+)>)/ig, "");
  text = _.flow(_.replaceFlow(' ')(repl.html))(text)
    // Strip bracket/quote punctuation. The original class was mistyped as
    // [\(\){}[]"'`~] — the inner ] closed the class early, making the
    // replace a near no-op. `+` (not `*`) so empty matches don't inject spaces.
    .replace(/[(){}[\]"'`~]+/g, ' ')
    .replace(/\s+/g, ' ')
    // Repeatedly drop very short (<=3 char) whitespace-delimited tokens;
    // each pass can expose new short tokens, hence three applications.
    .replace(/\s.{1,3}\s/gi, " ")
    .replace(/\s.{1,3}\s/gi, " ")
    .replace(/\s.{1,3}\s/gi, " ");
  tfidf.addDocument({id: "html", body: _.clean(text)}, true);
  tfidf.updateIdf();
  return tfidf;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment