Skip to content

Instantly share code, notes, and snippets.

@chiral
Created November 22, 2013 12:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chiral/7599166 to your computer and use it in GitHub Desktop.
Save chiral/7599166 to your computer and use it in GitHub Desktop.
Ad-hoc web site markup analyzer for native advertising.
var jsdom = require("jsdom");
var $ = require("jquery");
var request = require("request");
var ua = require("./ua");
/**
 * Tunables for the markup analyzer.
 */
var myConfig = {
  // Fallback User-Agent header, used by run() when the requested key has
  // no entry in the ua table.
  UA: "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)",

  // filter_nodes() rejects a container with fewer candidate children than this.
  minTitleCount: 3,

  // filter_nodes() requires the container text to be at least
  // (number of candidates * minAvgTitleLen) characters long.
  minAvgTitleLen: 10,

  // filter_nodes() requires the most common structure-string length to cover
  // at least this fraction of the candidates.
  minArticleFillRate: 0.5,

  // Tags whose children are collected as candidates by traverse().
  contentTags: {
    div: 1,
    section: 1,
    ul: 1,
    tr: 1,
  },

  // Tags that traverse() leaves out of the structure string (their children
  // are still visited).
  ignoreTags: {
    p: 1,
    span: 1,
  },

  // Tags whose whole subtree traverse() skips.
  skipTags: {
    script: 1,
  },
};
/**
 * Turn a whitespace-separated attribute value into a selector-like fragment
 * by prefixing every token, e.g. attr_string('.', 'foo bar') -> '.foo.bar'.
 *
 * Leading, trailing and repeated whitespace is ignored, so ' foo ' yields
 * '.foo' and not '..foo.'.
 *
 * @param {string} prefix - Text put in front of each token ('.' or '#').
 * @param {string} str - Raw attribute value (class or id).
 * @returns {string} Concatenation of prefix + token for every non-empty token.
 */
function attr_string(prefix, str) {
  return str
    .split(/\s+/)
    // split() produces '' at the edges when the value starts or ends with
    // whitespace; those are not real tokens.
    .filter(function (token) { return token.length > 0; })
    .map(function (token) { return prefix + token; })
    .join('');
}
/**
 * Decide whether the children of a container look like a repeated list and,
 * if so, return the children that share the dominant shape.
 *
 * "Shape" is approximated by the length of each child's structure string:
 * children whose structure string has the most common length are kept.
 *
 * @param {object} $node - Wrapped container; only its text() is read.
 * @param {Array} cs - Candidate children, parallel to ss.
 * @param {string[]} ss - Structure string of each candidate.
 * @returns {Array|null} The kept entries of cs, or null when the container
 *   fails one of the myConfig thresholds.
 */
function filter_nodes($node, cs, ss) {
  const total = ss.length;
  if (total < myConfig.minTitleCount) {
    return null;
  }

  const text = $node.text();
  if (!text || text.length < total * myConfig.minAvgTitleLen) {
    return null;
  }

  // Histogram of structure-string lengths, tracking the highest count seen.
  const countByLength = {};
  let dominant = 0;
  ss.forEach(function (structure) {
    const len = structure.length;
    const seen = (countByLength[len] || 0) + 1;
    countByLength[len] = seen;
    if (seen > dominant) {
      dominant = seen;
    }
  });

  if (dominant < total * myConfig.minArticleFillRate) {
    return null;
  }

  const kept = [];
  ss.forEach(function (structure, idx) {
    if (countByLength[structure.length] === dominant) {
      kept.push(cs[idx]);
    }
  });
  return kept;
}
/**
 * Rewrite each node in place into a template (concrete links, images, texts
 * and titles are replaced by numbered {{=...}} placeholders) and return the
 * distinct resulting HTML strings.
 *
 * @param {Function} $ - jQuery-style wrapper function.
 * @param {string} key - Not used by this function.
 * @param {Array} nodes - Wrapped nodes; every node is mutated.
 * @returns {string[]} Unique outerHTML strings (newlines removed), in order
 *   of first appearance.
 */
function convert_to_template($, key, nodes) {
  const TEXT_TAGS = ['span', 'p', 'a'];
  const seen = {};

  nodes.forEach(function (node) {
    // Link targets are numbered per node.
    node.find('a[href]').each(function (idx) {
      $(this).attr('href', '{{=href' + (idx + 1) + '}}');
    });

    // Image source and alt text are numbered per node.
    node.find('img').each(function (idx) {
      const $img = $(this);
      $img.attr('src', '{{=pic_src' + (idx + 1) + '}}');
      $img.attr('alt', '{{=pic_alt' + (idx + 1) + '}}');
    });

    TEXT_TAGS.forEach(function (tagName) {
      let textSeq = 0;
      let titleSeq = 0;
      node.find(tagName).each(function () {
        const $el = $(this);

        // Only childless elements that carry text get a text placeholder.
        if ($el.children().length === 0 && $el.text().length) {
          textSeq += 1;
          $el.text('{{=' + tagName + textSeq + '}}');
        }

        // Every matched element gets a title placeholder.
        titleSeq += 1;
        $el.attr('title', '{{=' + tagName + '_title' + titleSeq + '}}');

        // Digit runs in ids are masked so numbered ids compare equal.
        const id = $el.attr('id');
        if (id) {
          $el.attr('id', id.replace(/\d+/g, '*'));
        }
      });
    });

    const html = node[0].outerHTML.replace(/\n|\r/g, '');
    seen[html] = (seen[html] || 0) + 1;
  });

  return Object.keys(seen);
}
/**
 * Report whether a tag is switched on in one of the myConfig tag tables.
 * Only own properties count, so element names that collide with
 * Object.prototype members (e.g. "constructor") are not treated as flagged.
 */
function has_tag_flag(table, name) {
  return Object.prototype.hasOwnProperty.call(table, name) && Boolean(table[name]);
}

/**
 * Walk the element tree depth-first, build a structure string for the
 * subtree, and record containers whose children look like a repeated list.
 *
 * The structure string nests pseudo-tags of the form <tag.class#id>...</tag>.
 * Tags in myConfig.ignoreTags contribute no tag of their own (their children
 * still do); tags in myConfig.skipTags contribute nothing at all.
 *
 * @param {Function} $ - jQuery-style wrapper function.
 * @param {object} $node - Wrapped element to inspect.
 * @param {string} pkey - Key prefix inherited from the nearest ancestor that
 *   has a class or id ('' at the root).
 * @param {Array} result - Output list; receives {key, val, htmls} entries,
 *   where val is the list of candidate structure strings and htmls is the
 *   output of convert_to_template().
 * @returns {string} Structure string of the subtree ('' for skipped tags and
 *   for an empty selection).
 */
function traverse($, $node, pkey, result) {
  var el = $node[0];
  // An empty selection (e.g. an empty document) has nothing to describe.
  if (!el) return '';

  var tag0 = el.tagName;
  if (tag0) tag0 = tag0.toLowerCase();

  if (has_tag_flag(myConfig.skipTags, tag0)) return '';
  var content = has_tag_flag(myConfig.contentTags, tag0);
  var ignore = has_tag_flag(myConfig.ignoreTags, tag0);

  // Declared locally: an undeclared assignment here would create a global
  // and throws a ReferenceError in strict mode / ES modules.
  var tag = tag0;
  var cls = $node.attr('class');
  if (cls) tag += attr_string('.', cls);
  var id = $node.attr('id');
  if (id) tag += attr_string('#', id);

  // An element with a class or id starts a fresh key; otherwise the key is
  // the inherited prefix followed by the bare tag.
  var anchored = Boolean(cls || id);
  var key = anchored ? tag : pkey + tag;
  var childPrefix = anchored ? tag + ' ' : pkey;

  var s = ignore ? '' : '<' + tag + '>';
  // NOTE(review): trace output for every visited element, kept to preserve
  // existing behaviour; consider removing for library use.
  console.log(key);

  var ss = [], cs = [];
  $node.children().each(function () {
    var $child = $(this);
    var s1 = traverse($, $child, childPrefix, result);
    // Children are candidates only when this element is a content tag.
    if (s1 && content) { ss.push(s1); cs.push($child); }
    s += s1;
  });

  // NOTE(review): "<a" also matches other tags starting with "a"
  // (e.g. <article>); presumably meant to detect anchors - confirm.
  if (s.indexOf('<a') >= 0) {
    var picked = filter_nodes($node, cs, ss);
    if (picked) {
      var htmls = convert_to_template($, key, picked);
      result.push({ key: key, val: ss, htmls: htmls });
    }
  }

  if (!ignore) s += '</' + tag0 + '>';
  return s;
}
/**
 * Keep only the entries whose key occurs exactly once in the list and return
 * them as {k, v} pairs, with v being the entry's templates ordered from
 * shortest to longest.
 *
 * The input list and its htmls arrays are left unmodified.
 *
 * @param {Array<{key: string, htmls: string[]}>} list - Entries collected by
 *   traverse().
 * @returns {Array<{k: string, v: string[]}>} Entries with a unique key, in
 *   input order.
 */
function post_process(list) {
  // Prototype-less dictionary: a key such as "constructor" must start from
  // zero instead of picking up a member inherited from Object.prototype.
  var counts = Object.create(null);
  list.forEach(function (entry) {
    counts[entry.key] = (counts[entry.key] || 0) + 1;
  });

  return list
    .filter(function (entry) { return counts[entry.key] < 2; })
    .map(function (entry) {
      // Sort a copy so the caller's array keeps its order.
      var ordered = entry.htmls.slice().sort(function (a, b) {
        return a.length - b.length;
      });
      return { k: entry.key, v: ordered };
    });
}
/**
 * Fetch a page and report the repeated-markup templates found in it.
 *
 * @param {string} url - Page to fetch.
 * @param {string} ua_key - Key into ua.table selecting the User-Agent header;
 *   myConfig.UA is used when the key has no entry.
 * @param {Function} f - Callback invoked as f(results, error). results is the
 *   output of post_process(), or [] when the request failed or returned an
 *   empty body. error is only passed when the request itself failed, so
 *   callbacks that read just the first argument keep working.
 */
function run(url, ua_key, f) {
  var options = {
    url: url,
    headers: { 'User-Agent': ua.table[ua_key] || myConfig.UA }
  };
  request(options, function (err, response, body) {
    // A failed request has no body to analyse; report it instead of trying
    // to parse undefined.
    if (err) {
      f([], err);
      return;
    }
    if (!body) {
      f([]);
      return;
    }
    var found = [];
    traverse($, $(body), '', found);
    f(post_process(found));
  });
}
// Module API: run(url, ua_key, f) is the only export.
exports.run = run;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment