Skip to content

Instantly share code, notes, and snippets.

@binarymax
Last active May 22, 2016 08:59
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save binarymax/d3691180e65ff7f0dec5 to your computer and use it in GitHub Desktop.
Save binarymax/d3691180e65ff7f0dec5 to your computer and use it in GitHub Desktop.
Parses the news.ycombinator.com comment archive into a word2vec-digestible format.
#!/usr/bin/env node
var linestream = require('line-stream');
var htmldecode = require('htmldec');
// Stream created by the line-stream package. stdin is piped into it at the
// bottom of the script, and each 'data' chunk it emits is parsed as one JSON
// document (presumably one input line per chunk; see the line-stream docs).
var s = linestream();
// Spelled-out digit names, indexed by digit value (numbers[3] is "three").
var numbers = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'];

// Single-character classifiers. Neither is global, so .test() keeps no state.
var renumber = /[0-9]/;
var reletter = /[a-z_]/i;

// Markup: any <...> tag (this also covers closing tags), closing tags only,
// and ftp/http/https URLs up to the next whitespace.
var reopen = /\<[^\>]+\>/g;
var reclose = /\<\/[^\>]+\>/g;
var reurl = /(ftp|http|https):\/\/(\w+:{0,1}\w*@)?(\S+)(:[0-9]+)?(\/|\/([\w#!:.?+=&%@!\-\/]))?/g;

// Apostrophes, and underscores that sit directly after / before a space.
var reapos = /\'/g;
var reund1 = / _/g;
var reund2 = /_ /g;

// One or more consecutive spaces.
var respace = / +/g;
/**
 * Converts one HTML comment body into the plain token stream written to
 * stdout: lowercase ASCII letters and underscores are kept, each digit is
 * spelled out, and every other character becomes a space.
 *
 * @param {string} html - Comment text containing HTML markup and entities.
 * @returns {string} Normalized text with runs of spaces collapsed; it always
 *     ends with a single space so consecutive comments do not run together.
 */
function normalizeComment(html) {
  // Strip markup BEFORE decoding entities. Decoding first would turn an
  // escaped "&lt;" ... "&gt;" pair into real angle brackets, and the tag
  // patterns would then delete the literal text between them.
  var text = html.replace(reopen, ' ').replace(reclose, ' ');
  text = htmldecode(text);

  // URLs are removed after decoding so they are matched in decoded form.
  text = text.replace(reurl, ' ');

  // An apostrophe inside a word is kept as "_" (don't -> don_t). One that
  // touches a space is a quotation mark: swap it for a space rather than for
  // nothing, otherwise "said 'hi' to" collapses into "saidhito".
  text = text.replace(reapos, '_').replace(reund1, ' ').replace(reund2, ' ');

  // Everything that is not an ASCII letter or underscore is either a digit
  // (spelled out, with no separator between consecutive digits) or noise
  // (blanked out).
  var out = text.replace(/[^a-z_]/gi, function (chr) {
    return renumber.test(chr) ? numbers[parseInt(chr, 10)] : ' ';
  });

  return (out.toLowerCase() + ' ').replace(respace, ' ');
}

// Each 'data' chunk is expected to hold one JSON document whose body.text is
// the comment HTML; anything else is skipped without producing output.
s.on('data', function (line) {
  var json;
  try {
    json = JSON.parse(line);
  } catch (ex) {
    // Best effort: chunks that are not valid JSON are ignored.
    return;
  }

  // JSON.parse may return null or a primitive (for example the line "null"),
  // so guard every step instead of assuming an object with a string text.
  var text = json && json.body && json.body.text;
  if (typeof text !== 'string' || text === '') {
    return;
  }

  process.stdout.write(normalizeComment(text));
});

// Feed the archive from stdin through the stream created above.
process.stdin.pipe(s);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment