Skip to content

Instantly share code, notes, and snippets.

@kamikat
Last active August 29, 2015 13:57
Show Gist options
  • Save kamikat/9682070 to your computer and use it in GitHub Desktop.
Save kamikat/9682070 to your computer and use it in GitHub Desktop.
Crawler for kasi-time.com
// Usage: node kasi.js <id>
// Example:
// Get lyric from http://www.kasi-time.com/item-55961.html
// node kasi.js 55961
var http = require('http');
var vm = require('vm');
// Lyric id taken from the first command-line argument (see Usage above);
// it is undefined when the script is started without an argument.
var no = process.argv[2];
/**
 * Render an HTML fragment to plain text.
 *
 * Only layout that matters for plain text is kept: line breaks, tab stops
 * between table cells, and decoded character references. All other markup
 * is dropped.
 *
 * @param {*} html - HTML source; null/undefined is treated as an empty
 *   string and any other non-string value is converted with String().
 * @returns {string} The plain-text rendering.
 */
var render = (function () {
  // Focus on horizontal (tab) and vertical (line break) and escaped characters
  // Void tags: emit a line break where they appear and never enter the stack.
  var br0 = ['br', 'hr'];
  // Elements that emit a line break when they end.
  var br1 = ['li', 'p', 'tr', 'table'];
  // Elements that emit a tab when they end.
  var tab = ['td'];
  // Escape map from entity name to character code (lookup is case-sensitive).
  // 'nbsp' is mapped to a plain space (32) rather than U+00A0.
  var esc = {
    'euro': 8364, 'nbsp': 32, 'quot': 34, 'amp': 38, 'apos': 39, 'lt': 60, 'gt': 62,
    'iexcl': 161, 'cent': 162, 'pound': 163, 'curren': 164, 'yen': 165, 'brvbar': 166,
    'sect': 167, 'uml': 168, 'copy': 169, 'ordf': 170, 'laquo': 171, 'not': 172,
    'shy': 173, 'reg': 174, 'macr': 175, 'deg': 176, 'plusmn': 177, 'sup2': 178,
    'sup3': 179, 'acute': 180, 'micro': 181, 'para': 182, 'middot': 183, 'cedil': 184,
    'sup1': 185, 'ordm': 186, 'raquo': 187, 'frac14': 188, 'frac12': 189, 'frac34': 190,
    'iquest': 191, 'Agrave': 192, 'Aacute': 193, 'Acirc': 194, 'Atilde': 195, 'Auml': 196,
    'Aring': 197, 'AElig': 198, 'Ccedil': 199, 'Egrave': 200, 'Eacute': 201, 'Ecirc': 202,
    'Euml': 203, 'Igrave': 204, 'Iacute': 205, 'Icirc': 206, 'Iuml': 207, 'ETH': 208,
    'Ntilde': 209, 'Ograve': 210, 'Oacute': 211, 'Ocirc': 212, 'Otilde': 213, 'Ouml': 214,
    'times': 215, 'Oslash': 216, 'Ugrave': 217, 'Uacute': 218, 'Ucirc': 219, 'Uuml': 220,
    'Yacute': 221, 'THORN': 222, 'szlig': 223, 'agrave': 224, 'aacute': 225, 'acirc': 226,
    'atilde': 227, 'auml': 228, 'aring': 229, 'aelig': 230, 'ccedil': 231, 'egrave': 232,
    'eacute': 233, 'ecirc': 234, 'euml': 235, 'igrave': 236, 'iacute': 237, 'icirc': 238,
    'iuml': 239, 'eth': 240, 'ntilde': 241, 'ograve': 242, 'oacute': 243, 'ocirc': 244,
    'otilde': 245, 'ouml': 246, 'divide': 247, 'oslash': 248, 'ugrave': 249, 'uacute': 250,
    'ucirc': 251, 'uuml': 252, 'yacute': 253, 'thorn': 254, 'yuml': 255
  };

  // Convert a code point to a string, building a surrogate pair above U+FFFF
  // (String.fromCharCode alone would truncate it to 16 bits).
  var fromCode = function (code) {
    if (code > 0x10FFFF) return '\uFFFD';
    if (code > 0xFFFF) {
      code -= 0x10000;
      return String.fromCharCode(0xD800 + (code >> 10), 0xDC00 + (code & 0x3FF));
    }
    return String.fromCharCode(code);
  };

  // Decode named, decimal and hexadecimal character references in one pass.
  // Anything that is not a well-formed, known reference is left untouched,
  // so ordinary text such as "Tom & Jerry; ok" is not mangled.
  var decodeEntities = function (str) {
    return str.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);/gi, function (match, entity) {
      if (entity.charAt(0) === '#') {
        var isHex = entity.charAt(1).toLowerCase() === 'x';
        return fromCode(isHex ? parseInt(entity.slice(2), 16) : parseInt(entity.slice(1), 10));
      }
      // Own-property check keeps names like "constructor" from resolving
      // to members inherited from Object.prototype.
      if (!Object.prototype.hasOwnProperty.call(esc, entity)) return match;
      return fromCode(esc[entity]);
    });
  };

  // Text emitted when an element with the given tag name ends.
  var separator = function (name) {
    return (~br1.indexOf(name) ? '\n' : '') + (~tab.indexOf(name) ? '\t' : '');
  };

  return function (html) {
    var source = html == null ? '' : String(html);
    var text = '';
    var re = /<([^<>]+)>/g, match, lastIndex = 0;
    // Stack of currently open tag names.
    var nest = [];
    while ((match = re.exec(source))) {
      var raw = match[1].trim();
      var close = raw.charAt(0) === '/', open = !close;
      if (close) raw = raw.slice(1).trim();
      if (raw.slice(-1) === '/') {
        // Self-closing tag: opens and closes at once.
        open = close = true;
        raw = raw.slice(0, -1).trim();
      }
      // Drop attributes and normalise case so <P class="x"> is treated as "p".
      var name = raw.split(/\s+/)[0].toLowerCase();
      text += decodeEntities(source.slice(lastIndex, match.index));
      lastIndex = re.lastIndex;
      if (~br0.indexOf(name)) {
        // Void tags never touch the stack, whatever form they are written in.
        text += '\n';
        continue;
      }
      if (open) {
        // A tag repeated directly inside itself (e.g. <li>a<li>b) implies
        // that an element just ended, so close the entry pushed here.
        if (name === nest[nest.length - 1]) close = true;
        nest.push(name);
      }
      if (close) {
        var at = nest.lastIndexOf(name);
        // A stray closing tag (no matching opener) leaves the stack alone;
        // popping blindly here would never terminate.
        if (at !== -1) {
          while (nest.length > at + 1) text += separator(nest.pop());
          nest.pop();
        }
        text += separator(name);
      }
    }
    text += decodeEntities(source.slice(lastIndex));
    // Elements still open at the end of input are closed implicitly.
    while (nest.length > 0) text += separator(nest.pop());
    return text;
  };
})();
// Download the lyric script for the requested id, run it in a fresh VM
// context whose only global is a stub `document.write`, and print whatever
// HTML the script writes as plain text.
if (!no) {
  // Without an id the request would be sent with "no=undefined".
  console.log('Usage: node kasi.js <id>');
  process.exitCode = 1;
} else {
  http.get('http://www.kasi-time.com/item_js.php?no=' + encodeURIComponent(no), function (res) {
    if (res.statusCode !== 200) {
      // Drain the unused body so the socket is released.
      res.resume();
      return console.log('ERROR - HTTP Status ' + res.statusCode);
    }
    // MAXIMUM body data size is bound to 1MB. Chunks are collected and
    // joined at the end; going over the limit is reported instead of
    // silently truncating the script.
    var maxBytes = 1024 * 1024;
    var chunks = [], received = 0, aborted = false;
    res
      .on('data', function (chunk) {
        if (aborted) return;
        received += chunk.length;
        if (received > maxBytes) {
          aborted = true;
          res.destroy();
          return console.log('ERROR - Response body exceeds ' + maxBytes + ' bytes');
        }
        chunks.push(chunk);
      })
      .on('error', function (err) {
        // Destroying the response above may surface as an error here;
        // only report failures that were not caused by that abort.
        if (!aborted) console.log(err);
      })
      .on('end', function () {
        if (aborted) return;
        // Get JavaScript code
        var code = Buffer.concat(chunks, received).toString();
        // Create a Sandbox (Execution Context Object)
        var sandbox = {
          document: {
            write: function (html) {
              var text = render(html);
              return console.log(text);
            }
          }
        };
        // NOTE(security): `code` is remote, untrusted JavaScript. The vm
        // module is not a security boundary; the timeout below only guards
        // against runaway scripts.
        try {
          // Compile code to Script object
          var script = new vm.Script(code, { filename: 'item.js' });
          // Execute Script in Sandbox
          script.runInNewContext(sandbox, { timeout: 5000 });
        } catch (err) {
          console.log(err);
        }
      });
  }).on('error', function (err) {
    console.log(err);
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment