Skip to content

Instantly share code, notes, and snippets.

@yapcheahshen
Created May 5, 2012 02:47
Show Gist options
  • Save yapcheahshen/2599244 to your computer and use it in GitHub Desktop.
Save yapcheahshen/2599244 to your computer and use it in GitHub Desktop.
calculate bigram in a text file
/*
Input text files must not contain surrogate pairs (characters outside the
Basic Multilingual Plane), due to bugs in Node.js.
*/
var fs = require('fs');
// NOTE(review): not referenced anywhere else in this file as shown; looks like a leftover and a candidate for removal.
var total_size=0;
/**
 * Read a whole text file synchronously and split it into lines.
 *
 * Each line break is matched individually as either "\r\n" or "\n", so a
 * file with mixed line endings is still split at every break (choosing one
 * separator for the whole file would leave the other kind embedded in the
 * returned lines).
 *
 * @param {string} filename - Path of the file to read.
 * @param {string} [encoding] - Text encoding; falls back to 'utf8' when falsy.
 * @returns {string[]} The lines without their terminators. A file ending in
 *   a line break yields a final empty string, as String.prototype.split does.
 * @throws {Error} Whatever fs.readFileSync throws (e.g. when the file is missing).
 */
var filetoarraySync = function (filename, encoding) {
  var data = fs.readFileSync(filename, encoding || 'utf8');
  return data.split(/\r?\n/);
};
/**
 * Count adjacent pairs of CJK Unified Ideographs (U+4E00..U+9FFF) in `lines`.
 *
 * Scanning rules, applied per UTF-16 code unit:
 *  - everything from "<" up to and including the next ">" is skipped; the
 *    "inside a tag" flag and the previous character both carry over from one
 *    line to the next, so pairs may span tags and line breaks;
 *  - the character "的" is skipped without breaking the current pair chain;
 *  - any other code unit outside U+4E00..U+9FFF resets the chain.
 *
 * Progress (token count, unique token count) is logged every 1024 lines.
 *
 * @param {string[]} lines - Lines of the input document.
 * @returns {Object} Map head -> { follower -> occurrence count }. Every
 *   counted character has an entry, even when nothing ever followed it.
 */
var countBigrams = function (lines) {
  var bigram = {};
  var intag = false;
  var prev = "";
  var tokencount = 0;
  var uniquetokencount = 0;
  for (var i = 0; i < lines.length; i++) {
    var s = lines[i];
    if (i % 1024 === 0) console.log(tokencount, uniquetokencount);
    for (var j = 0; j < s.length; j++) {
      var c = s[j];
      if (c === "<") intag = true;
      if (c === ">") {
        intag = false;
        continue;
      }
      if (intag) continue;
      if (c === "的") continue;
      var code = s.charCodeAt(j);
      if (code < 0x4e00 || code > 0x9fff) {
        prev = "";
        continue;
      }
      if (!bigram[c]) {
        bigram[c] = {};
        uniquetokencount++;
      }
      tokencount++;
      // bigram[prev] always exists: prev is only ever set to a counted character.
      if (prev) bigram[prev][c] = (bigram[prev][c] || 0) + 1;
      prev = c;
    }
  }
  return bigram;
};

/**
 * Compute summary figures for a bigram map.
 *
 * @param {Object} bigram - Map produced by countBigrams.
 * @returns {{headcount: number, paircount: number, hitcount: number,
 *   averagepair: number, averagehit: number}} Number of head characters,
 *   number of distinct pairs, total pair occurrences, and the two averages
 *   (NaN when the corresponding divisor is 0).
 */
var summarizeBigrams = function (bigram) {
  var stat = {};
  stat.headcount = 0;
  stat.paircount = 0;
  stat.hitcount = 0;
  for (var head in bigram) {
    stat.headcount++;
    for (var next in bigram[head]) {
      stat.paircount++;
      stat.hitcount += bigram[head][next];
    }
  }
  stat.averagepair = stat.paircount / stat.headcount;
  stat.averagehit = stat.hitcount / stat.paircount;
  return stat;
};

/**
 * Keep, for every head character, its most frequent followers.
 *
 * @param {Object} bigram - Map produced by countBigrams.
 * @param {number} maxchild - Maximum number of followers kept per head.
 * @param {number} minfreq - Followers seen fewer times than this are dropped.
 * @returns {Object} Map head -> string of followers, most frequent first.
 *   Heads with no follower reaching `minfreq` are omitted.
 */
var trimBigrams = function (bigram, maxchild, minfreq) {
  var trimmed = {};
  for (var head in bigram) {
    var sortable = [];
    for (var next in bigram[head]) sortable.push([next, bigram[head][next]]);
    sortable.sort(function (a, b) { return b[1] - a[1]; });
    var child = "";
    for (var k = 0; k < maxchild && k < sortable.length; k++) {
      // The list is sorted by descending count, so the rest is below minfreq too.
      if (sortable[k][1] < minfreq) break;
      child += sortable[k][0];
    }
    if (child) trimmed[head] = child;
  }
  return trimmed;
};

/**
 * Command-line entry point.
 *
 * Takes a base name as the first user argument, reads `<name>_acc.xml`,
 * counts character bigrams and writes `<name>_bigram.js`, an AMD module of
 * the form `{payload: {head: "followers", ...}, header: {version: ...}}`.
 *
 * The base name is read from process.argv[2]: argv[0] is the node executable
 * and argv[1] the script path, whether argv[0] is the literal "node" or an
 * absolute path. process.argv is no longer mutated.
 */
var main = function () {
  var fn = process.argv[2];
  if (!fn) {
    console.error("usage: node <script> <name>  (reads <name>_acc.xml, writes <name>_bigram.js)");
    process.exitCode = 1;
    return;
  }
  console.log("loading " + fn);
  var arr = filetoarraySync(fn + '_acc.xml');
  console.log("loaded");
  var output = fs.createWriteStream(fn + '_bigram.js');

  var bigram = countBigrams(arr);

  var maxchild = 15;
  var minfreq = 2;

  // Print some overall figures.
  console.log(summarizeBigrams(bigram));

  // TODO: divide each pair's occurrence count by ln(character frequency) to
  // filter out characters whose word-forming ability is too strong.
  var trimmed = trimBigrams(bigram, maxchild, minfreq);

  output.write("define([],function() {return {payload:{");
  for (var head in trimmed) {
    output.write('"' + head + '":"' + trimmed[head] + '",\n');
  }
  output.write("},header:{version:20120419} }});");
  console.log("end");
  output.end();
  return;
};
// Run the conversion as soon as the script is loaded.
main();
@kamleong
Copy link

kamleong commented May 8, 2012

if (code<0x4e00 || code>0x9fff)

In addition to CJK Unified Ideographs [U+4E00 → U+9FFF], you may also want to include CJK Unified Ideographs Extension A [U+3400 → U+4DBF]. However, this may not apply to CBETA, which seems to always "使用半型中括號括住組字式(組字法說明如下),用以代表缺字" and does not include any characters from the Extension blocks.

if (c=="的") continue;

//TODO 出現次數 除以 ln 字頻 ,去除構詞能力太強的字

Besides that, you may also want to exclude certain 虛字或語氣, e.g. 之、乎、也、焉、乃、而、矣。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment