Created
November 22, 2013 12:32
-
-
Save chiral/7599166 to your computer and use it in GitHub Desktop.
Ad-hoc web site markup analyzer for native advertising.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Module dependencies.
// NOTE(review): jsdom is not referenced anywhere in the visible code; presumably
// it is needed so the jquery package has a DOM under Node — confirm before removing.
var jsdom = require("jsdom");
var $ = require("jquery");
var request = require("request");
// Project-local module; run() reads User-Agent strings from its `table` property.
var ua = require("./ua");
/**
 * Tunable thresholds and tag classifications used by the markup analyzer.
 */
var myConfig = {
  // Fallback User-Agent header, used by run() when ua.table has no entry for the key.
  UA: "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)",
  // filter_nodes() rejects a container with fewer child signatures than this.
  minTitleCount: 3,
  // A container's text must be at least (child count * this) characters long.
  minAvgTitleLen: 10,
  // Minimum fraction of children that must share the most common signature length.
  minArticleFillRate: 0.5,
  // Tags whose children are collected as repeated-item candidates by traverse().
  contentTags: { div: 1, section: 1, ul: 1, tr: 1 },
  // Tags omitted from the structural signature (their children are still visited).
  ignoreTags: { p: 1, span: 1 },
  // Tags whose whole subtree is skipped by traverse().
  skipTags: { script: 1 }
};
/**
 * Builds a selector-like fragment from an attribute value by prefixing every
 * whitespace-separated token, e.g. attr_string('.', 'a b') -> '.a.b'.
 *
 * @param {string} prefix - Text put in front of each token ('.' or '#' in this file).
 * @param {string} str - Raw attribute value; may contain arbitrary whitespace.
 * @returns {string} The concatenated prefixed tokens, or '' when there are none.
 */
function attr_string(prefix, str) {
  if (str == null) return '';
  var tokens = String(str).split(/\s+/);
  var out = '';
  for (var i = 0; i < tokens.length; i++) {
    // Leading/trailing whitespace produces empty tokens; skipping them avoids
    // emitting a bare prefix such as '.' or '#'.
    if (tokens[i]) out += prefix + tokens[i];
  }
  return out;
}
/**
 * Decides whether a container looks like a list of similar items and, if so,
 * returns the children that share the most common signature length.
 *
 * @param {Object} $node - Wrapped container element; only its text() is read.
 * @param {Array} cs - Child nodes, parallel to `ss`.
 * @param {string[]} ss - Structural signature string of each child.
 * @returns {Array|null} Children whose signature length is the most frequent
 *   one, or null when the container fails any threshold in myConfig.
 */
function filter_nodes($node, cs, ss) {
  var total = ss.length;
  if (total < myConfig.minTitleCount) {
    return null;
  }

  var text = $node.text();
  if (!text || text.length < total * myConfig.minAvgTitleLen) {
    return null;
  }

  // Tally children by the LENGTH of their signature (not the signature itself)
  // and remember the size of the largest group.
  var countByLength = {};
  var largest = 0;
  ss.forEach(function (sig) {
    var n = (countByLength[sig.length] || 0) + 1;
    countByLength[sig.length] = n;
    if (n > largest) largest = n;
  });

  if (largest < total * myConfig.minArticleFillRate) {
    return null;
  }

  // Every group that ties for the largest size is kept, in original order.
  var kept = [];
  for (var i = 0; i < total; i++) {
    if (countByLength[ss[i].length] === largest) kept.push(cs[i]);
  }
  return kept;
}
/**
 * Rewrites each node into a template by replacing link targets, image
 * attributes, leaf text and titles with numbered `{{=name}}` placeholders,
 * then returns the distinct resulting HTML strings.
 *
 * NOTE: the nodes are modified in place; nothing is cloned.
 *
 * @param {Function} $ - jQuery-style wrapper applied to raw elements.
 * @param {string} key - Unused; kept so existing callers keep working.
 * @param {Array} nodes - Wrapped elements to convert.
 * @returns {string[]} Unique outerHTML strings (line breaks removed), in
 *   first-seen order.
 */
function convert_to_template($, key, nodes) {
  var TEXT_TAGS = ['span', 'p', 'a'];
  var seen = {};

  for (var i = 0; i < nodes.length; i++) {
    var $item = nodes[i];

    $item.find('a[href]').each(function (n) {
      $(this).attr('href', '{{=href' + (n + 1) + '}}');
    });

    $item.find('img').each(function (n) {
      var $img = $(this);
      $img.attr('src', '{{=pic_src' + (n + 1) + '}}');
      $img.attr('alt', '{{=pic_alt' + (n + 1) + '}}');
    });

    TEXT_TAGS.forEach(function (tag) {
      // Text and title placeholders are numbered independently per tag name.
      var textCount = 0;
      var titleCount = 0;
      $item.find(tag).each(function () {
        var $el = $(this);
        // Only leaf elements with non-empty text get their text replaced.
        if ($el.children().length === 0 && $el.text().length) {
          $el.text('{{=' + tag + (++textCount) + '}}');
        }
        // A title placeholder is set on every match, whether or not one existed.
        $el.attr('title', '{{=' + tag + '_title' + (++titleCount) + '}}');
        var id = $el.attr('id');
        if (id) {
          // Collapse digit runs so ids that differ only by number compare equal.
          $el.attr('id', id.replace(/\d+/g, '*'));
        }
      });
    });

    var html = $item[0].outerHTML.replace(/\n|\r/g, '');
    seen[html] = true;
  }

  return Object.keys(seen);
}
/**
 * Recursively walks an element tree, building a structural signature string
 * (tags decorated with class/id tokens) and recording containers that look
 * like lists of link-bearing items.
 *
 * @param {Function} $ - jQuery-style wrapper applied to raw child elements.
 * @param {Object} $node - Wrapped element to visit; only its first element is used.
 * @param {string} pkey - Key prefix inherited from the nearest ancestor that
 *   had a class or id ('' at the root).
 * @param {Array} result - Output list; matching containers are appended as
 *   {key, val, htmls}.
 * @returns {string} Signature of this subtree; '' for skipped tags and for
 *   nodes that are missing or have no tagName.
 */
function traverse($, $node, pkey, result) {
  var el = $node[0];
  // Empty selections and non-element nodes have no tag to describe.
  if (!el || !el.tagName) return '';
  var tag0 = el.tagName.toLowerCase();

  // Own-property lookup so tag names such as "constructor" do not pick up
  // members inherited from Object.prototype.
  function flagged(table) {
    return Object.prototype.hasOwnProperty.call(table, tag0) && !!table[tag0];
  }

  if (flagged(myConfig.skipTags)) return '';
  var content = flagged(myConfig.contentTags);
  var ignore = flagged(myConfig.ignoreTags);

  var tag = tag0;
  var cls = $node.attr('class');
  if (cls) tag += attr_string('.', cls);
  var id = $node.attr('id');
  if (id) tag += attr_string('#', id);

  // A node with a class or id starts a fresh key; others extend the inherited prefix.
  var anchored = Boolean(cls || id);
  var key = anchored ? tag : pkey + tag;
  var childPrefix = anchored ? tag + ' ' : pkey;
  console.log(key);

  var s = ignore ? '' : '<' + tag + '>';
  var ss = [];
  var cs = [];
  $node.children().each(function () {
    var $child = $(this);
    var s1 = traverse($, $child, childPrefix, result);
    if (s1 && content) {
      ss.push(s1);
      cs.push($child);
    }
    s += s1;
  });

  // Match an <a> signature only ("<a>", "<a.cls>", "<a#id>"), not tags that
  // merely start with "a" such as <article> or <aside>.
  if (/<a[.#>]/.test(s)) {
    var picked = filter_nodes($node, cs, ss);
    if (picked) {
      var htmls = convert_to_template($, key, picked);
      result.push({ key: key, val: ss, htmls: htmls });
    }
  }

  // The closing tag uses the bare tag name, without class/id decoration.
  if (!ignore) s += '</' + tag0 + '>';
  return s;
}
/**
 * Keeps only the entries whose key occurs exactly once in the list and
 * returns them in compact {k, v} form.
 *
 * Side effect: the `htmls` array of EVERY entry (kept or not) is sorted in
 * place by string length, shortest first; `v` is that same array.
 *
 * @param {Array<{key: string, htmls: string[]}>} list - Entries collected by traverse().
 * @returns {Array<{k: string, v: string[]}>} Entries with unique keys, in input order.
 */
function post_process(list) {
  // Prototype-less map so keys such as "constructor" are counted like any other.
  var occurrences = Object.create(null);
  list.forEach(function (entry) {
    occurrences[entry.key] = (occurrences[entry.key] || 0) + 1;
  });

  var unique = [];
  list.forEach(function (entry) {
    entry.htmls.sort(function (a, b) { return a.length - b.length; });
    if (occurrences[entry.key] < 2) {
      unique.push({ k: entry.key, v: entry.htmls });
    }
  });
  return unique;
}
/**
 * Fetches a page, analyzes its markup and hands the findings to a callback.
 *
 * @param {string} url - Page to fetch.
 * @param {string} ua_key - Key into ua.table selecting the User-Agent header;
 *   myConfig.UA is used when the key has no entry.
 * @param {Function} f - Called with the post_process() output on success. If
 *   the request fails or returns no body it is called with an empty list and
 *   the request error (possibly null) as a second argument.
 */
function run(url, ua_key, f) {
  var options = {
    url: url,
    headers: { 'User-Agent': ua.table[ua_key] || myConfig.UA }
  };
  request(options, function (err, response, body) {
    // Without a body there is nothing to parse; report instead of crashing.
    if (err || !body) {
      f([], err);
      return;
    }
    var found = [];
    var $root = $(body);
    // traverse() reads the first element, so skip it for an empty selection.
    if ($root.length) traverse($, $root, '', found);
    f(post_process(found));
  });
}
// Public API of this module: run(url, ua_key, callback).
exports.run = run;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment