Skip to content

Instantly share code, notes, and snippets.

@cristiandouce
Last active December 16, 2015 12:59
Show Gist options
  • Save cristiandouce/5438728 to your computer and use it in GitHub Desktop.
Save cristiandouce/5438728 to your computer and use it in GitHub Desktop.
Parse page content
// Serialized markup of the page's root element, used as the raw input.
// documentElement is read instead of document.lastChild because the
// document's last child is not always the <html> element (for example, a
// comment placed after </html> becomes the last child), and such nodes have
// no innerHTML, which would leave `text` undefined.
var text = document.documentElement.innerHTML;
// Regular expressions used to reduce an HTML string to plain words.
// All patterns carry the `g` flag so one replace() call handles every match.
var regexs = {
  // enclosing: elements matched together with their content
  script: /<(?:script)[\d\D]*?>[\d\D]*?<\/(?:script)>/gi,
  noscript: /<(?:noscript)[\d\D]*?>[\d\D]*?<\/(?:noscript)>/gi,
  style: /<(?:style)[\d\D]*?>[\d\D]*?<\/(?:style)>/gi,
  iframe: /<(?:iframe)[\d\D]*?>[\d\D]*?<\/(?:iframe)>/gi,
  frame: /<(?:frame)[\d\D]*?>[\d\D]*?<\/(?:frame)>/gi,
  object: /<(?:object)[\d\D]*?>[\d\D]*?<\/(?:object)>/gi,
  // singles: tags without a closing counterpart
  embed: /<embed[\d\D]*?>/gi,
  link: /<link[\d\D]*?>/gi,
  image: /<img[\d\D]*?>/gi,
  // extras
  nonroman: /[^a-zA-Z0-9]/g,
  // JavaScript has no POSIX bracket classes: /[[:^alpha:]]/ parses as a
  // character set followed by a literal "]". Explicit ASCII ranges are used
  // for the two patterns below instead.
  nonalpha: /[^a-zA-Z]/g,
  nonalphanum: /[^a-zA-Z0-9]/g,
  punct: /[\.\:\;\,\"\'\¿\?\¡\!\-\_\(\)\*\^\%\$\#\@\~\`\,\<\>\\\/]/g,
  // [\d\D] rather than "." so that tags broken across several lines are
  // matched too ("." does not match line terminators).
  tagsFree: /<([\d\D]*?)>/g,
  whitespaces: /\s+/g
}
// Keys of `regexs` whose matches are blanked out of `text`, in this order.
var loops = ['script', 'noscript', 'style', 'iframe', 'frame', 'object', 'embed', 'link', 'image'];
loops.forEach(function(tag) {
  var pattern = regexs[tag];
  // Re-check after every pass and keep replacing for as long as the
  // pattern still matches somewhere in the text.
  while (pattern.test(text)) {
    text = text.replace(pattern, ' ');
  }
});
// Tokenise: drop the remaining tags and punctuation, collapse whitespace
// runs to single spaces, then split on those spaces.
var words = text
  .replace(regexs.tagsFree, ' ')
  .replace(regexs.punct, ' ')
  .replace(regexs.whitespaces, ' ')
  .trim()
  .split(' ')
  // ''.split(' ') yields [''], so an empty page would otherwise count one
  // empty "word".
  .filter(function(w) { return w !== ''; });

// Occurrence count per word. The map has no prototype so that words such as
// "constructor" or "toString" are counted like any other word instead of
// colliding with members inherited from Object.prototype.
var results = Object.create(null);
words.forEach(function(w) {
  results[w] = (results[w] || 0) + 1;
});
// TODO: write some logic that picks the minimum
// word length and the occurrence threshold
// so that the number of results ends up
// between 2 and 5 percent of the total
// words parsed.
//
// Also... should I look for the most frequent words,
// or should I care about rare words... let's say,
// those that appear fewer than X times?
//
// Or maybe... cross the results of multiple methods
// and use that for the query.
//
// Print every counted word that is longer than three characters and was
// seen fewer than four times, together with its count.
for (var word in results) {
  if (word.length <= 3) {
    continue;
  }
  if (!(results[word] < 4)) {
    continue;
  }
  console.log(word, results[word]);
}
@cristiandouce
Copy link
Author

I should skip the "link", "script", "style", and "code" tags.

@cristiandouce
Copy link
Author

/**
 * Returns a copy of the string with HTML markup removed according to a
 * whitelist and a blacklist of tag names.
 *
 * Three kinds of matches are deleted:
 *   1. a blacklisted element together with its content,
 *   2. any tag that is not one of the whitelisted tags,
 *   3. a whitespace-led run of characters that ends right before "/" or ">"
 *      (this is what drops attributes from whitelisted tags; the pattern
 *      itself is not limited to text inside a tag).
 *
 * Both lists are inserted into the regular expression verbatim, so they must
 * be valid "|"-separated alternations.
 *
 * NOTE(review): regex-based filtering is not a reliable defence against
 * hostile HTML; do not treat the result as safe for untrusted input.
 *
 * @param {string} [white] Allowed tags; defaults to "b|i|p|br" when falsy.
 * @param {string} [black] Tags removed with their content; defaults to
 *     "script|object|embed" when falsy.
 * @returns {string} The filtered string.
 */
String.prototype.sanitizeHTML = function (white, black) {
   var allowed = white || "b|i|p|br"; // allowed tags
   var removed = black || "script|object|embed"; // completely removed tags
   // [\s\S]*? instead of .* : lazy, so two blacklisted elements do not swallow
   // the text between them, and newline-aware, so multi-line content is
   // removed as well. \2 refers to the blacklisted tag name that was opened.
   var blacklistedElement = "<(" + removed + ")[^>]*>[\\s\\S]*?</\\2>";
   var nonWhitelistedTag = "(?!<[/]?(" + allowed + ")(\\s[^<]*>|[/]>|>))<[^<>]*>";
   var attributeRun = "(?!<[^<>\\s]+)\\s[^</>]+(?=[/>])";
   // Declared locally: the pattern must not leak as an implicit global, which
   // is also a ReferenceError in strict mode.
   var pattern = new RegExp("(" + blacklistedElement + "|" + nonWhitelistedTag + "|" + attributeRun + ")", "gi");
   return this.replace(pattern, "");
};

@cristiandouce
Copy link
Author

Also, I should consider:

// Keep replacing for as long as `regexp` still matches somewhere in `text`,
// so that a match which only appears after an earlier replacement is removed
// as well. NOTE(review): `regexp` and `text` are not defined in this snippet;
// presumably they stand for one of the tag patterns and the page markup.
while(regexp.test(text)) {
  text = text.replace(regexp, ' ')
}

and then trim the whitespace...

This is to avoid being fooled by nested input such as <scr<script>ipt>.

@cristiandouce
Copy link
Author

I should not skip "meta" tags...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment