Created
August 3, 2017 00:13
-
-
Save jclulow/b54d56217ad16a04bf193a5920c4b534 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node | |
/* vim: set ts=8 sts=8 sw=8 noet: */ | |
var mod_assert = require('assert-plus'); | |
var mod_util = require('util'); | |
var mod_parse5 = require('parse5'); | |
var mod_jsprim = require('jsprim'); | |
var mod_verror = require('verror'); | |
var VE = mod_verror.VError; | |
var lib_common = require('./common'); | |
var find_first_attr = lib_common.find_first_attr; | |
var find_first_child = lib_common.find_first_child; | |
var text_node_is_match = lib_common.text_node_is_match; | |
/*
 * Produce a plain, acyclic deep copy of a parsed node (or array of nodes) by
 * round-tripping it through JSON.  Every "parentNode" property is dropped on
 * the way out, which removes the child -> parent back references that would
 * otherwise make JSON.stringify() fail on a cycle.
 *
 * Returns the copied structure; the argument itself is not modified.
 */
function
sanitise(node)
{
	var json = JSON.stringify(node, function strip_parent(k, v) {
		return (k === 'parentNode' ? undefined : v);
	});

	return (JSON.parse(json));
}
/*
 * Reduce an <a> element found inside a log message to the URL it points at.
 *
 * The element must carry a non-empty "href" and contain exactly one text
 * node, and that text is expected to be identical to the href.
 *
 * Returns the href string on success, or a VError describing the problem.
 */
function
process_hyperlink(link)
{
	var target = find_first_attr(link, 'href');
	if (!target || link.childNodes.length !== 1) {
		return (new VE('malformed <a> #1'));
	}

	var label = link.childNodes[0];
	if (label.nodeName !== '#text' || label.childNodes) {
		return (new VE('malformed <a> #2'));
	}

	/*
	 * XXX Double quotes in URLs were not always escaped correctly when
	 * the log was written.  The damage shows up as an extra attribute
	 * whose name contains a double quote and whose value is empty (names
	 * such as '"', ')"', '}"', '},"', '>"', '>>"' and '21"' have been
	 * seen).  When such an attribute is present the href/text equality
	 * check is not enforced.
	 */
	var relax = link.attrs.some(function (attr) {
		return (attr.name.indexOf('"') !== -1 && attr.value === '');
	});

	if (relax || label.value === target) {
		return (target);
	}

	return (new VE('<a> href/text mismatch: %j', sanitise(link)));
}
/*
 * Return the nodeName of the node at l[pos], or null when the line has no
 * node at that position.
 */
function
old_node_name(l, pos)
{
	return (l[pos] ? l[pos].nodeName : null);
}

/*
 * Check the leading text child of a <font> node of class "cls" against the
 * regular expression "pat".  When "allow_more" is false the <font> must have
 * exactly one child; otherwise it must have at least one.
 *
 * Returns the match array on success, or a VError.
 */
function
old_font_text_match(font, cls, pat, allow_more)
{
	var count = font.childNodes.length;
	if (allow_more ? count < 1 : count !== 1) {
		return (new VE('<font> with class "%s" had too %s children',
		    cls, allow_more ? 'few' : 'many'));
	}

	var fch = font.childNodes[0];
	var fm;
	if (fch.nodeName !== '#text' ||
	    (fm = fch.value.match(pat)) === null ||
	    fch.childNodes) {
		return (new VE('<font> with class "%s" had malformed child',
		    cls));
	}

	return (fm);
}

/*
 * Append the textual form of every child of "font" after the first to the
 * array "msg": hyperlinks become their URL, <br> becomes a linefeed and text
 * nodes contribute their value.
 *
 * Returns null on success, or a VError for any other kind of node.
 */
function
old_collect_font_children(font, msg)
{
	for (var ci = 1; ci < font.childNodes.length; ci++) {
		var cic = font.childNodes[ci];

		switch (cic.nodeName) {
		case 'a':
			var res = process_hyperlink(cic);
			if (res instanceof Error) {
				return (res);
			}
			msg.push(res);
			break;

		case 'br':
			msg.push('\n');
			break;

		case '#text':
			if (cic.childNodes) {
				return (new VE('text node with children'));
			}
			msg.push(cic.value);
			break;

		default:
			return (new VE('unexpected node %j', sanitise(cic)));
		}
	}

	return (null);
}

/*
 * Legacy parser for one log line, supplied as the array "l" of sibling nodes
 * that make up the line.
 *
 *	dt		log date string; joined with the time found in the
 *			timestamp anchor as dt + 'T' + time + 'Z'
 *	members		object whose keys are known nicknames; consulted to
 *			attribute "/me" lines, and updated with the names seen
 *	l		array of nodes: a timestamp <a class="ts">, an optional
 *			single-space text node, a <font> whose class selects
 *			the kind of line, then the remainder of the line
 *
 * Returns an object with "time", "type" and type-specific fields, the bare
 * object { type: 'ignore' } for "msm" and "mk" lines, or a VError.
 *
 * Malformed or truncated input is always reported as a returned VError; the
 * function no longer dereferences a missing node or lets an invalid date
 * throw.
 */
function
OLD_proc_html_line(dt, members, l)
{
	var name, href;

	if (!l[0] || l[0].nodeName !== 'a' || l[0].tagName !== 'a' ||
	    find_first_attr(l[0], 'class') !== 'ts' ||
	    (name = find_first_attr(l[0], 'name')) === null ||
	    (href = find_first_attr(l[0], 'href')) === null ||
	    href !== ('#' + name)) {
		return (new VE('invalid timestamp: %j', sanitise(l)));
	}

	/*
	 * toISOString() throws on an invalid date, so check before using it.
	 */
	var when = new Date(dt + 'T' + name + 'Z');
	if (isNaN(when.getTime())) {
		return (new VE('invalid timestamp: %j', sanitise(l)));
	}

	var out = {
		time: when.toISOString()
	};

	var pos = 1;
	if (l[pos] && text_node_is_match(l[pos], ' ')) {
		/*
		 * Skip a single space right after the timestamp.
		 */
		pos++;
	}

	var font = l[pos];
	if (!font) {
		return (new VE('expected <font> node, but the line ended'));
	}
	if (font.nodeName !== 'font') {
		return (new VE('expected <font> node, not "%s"',
		    font.nodeName));
	}

	var fcls = find_first_attr(font, 'class');
	var fch, fm, res;

	switch (fcls) {
	case 'mrcm':
		var valid_values = [
			'Chatroom configuration modified',
			'Chatroom is started',
			'Chatroom is created',
			'Chatroom is stopped',
			'Chatroom is destroyed'
		];

		out.type = 'ignore';

		if (font.childNodes.length !== 1) {
			return (new VE('<font> with class "mrcm" had too ' +
			    'many children'));
		}
		fch = font.childNodes[0];
		if (fch.nodeName !== '#text' ||
		    valid_values.indexOf(fch.value) === -1 ||
		    fch.childNodes) {
			return (new VE('<font> with class "mrcm" had ' +
			    'malformed child'));
		}

		pos++;
		if (old_node_name(l, pos) !== 'br') {
			return (new VE('<font> with class "mrcm" missing ' +
			    '<br>'));
		}
		pos++;
		break;

	case 'ml':
		/*
		 * This font tag represents a user leaving a room.
		 *
		 * XXX Note that technically a user can provide a message when
		 * they leave, but we're just going to drop that on the floor.
		 */
		out.type = 'part';

		fm = old_font_text_match(font, 'ml',
		    /^(.*) leaves the room/, true);
		if (fm instanceof Error) {
			return (fm);
		}
		out.name = fm[1];

		pos++;
		if (old_node_name(l, pos) !== 'br') {
			return (new VE('<font> with class "ml" missing <br>'));
		}
		pos++;

		members[out.name] = true;
		break;

	case 'mj':
		/*
		 * This font tag represents a user joining a room.
		 */
		out.type = 'join';

		fm = old_font_text_match(font, 'mj',
		    /^(.*) joins the room/, false);
		if (fm instanceof Error) {
			return (fm);
		}
		out.name = fm[1];

		pos++;
		if (old_node_name(l, pos) !== 'br') {
			return (new VE('<font> with class "mj" missing <br>'));
		}
		pos++;

		members[out.name] = true;
		break;

	case 'mnc':
		/*
		 * This font tag represents a user changing nicknames.
		 * (Sigh.)
		 */
		out.type = 'rename';

		fm = old_font_text_match(font, 'mnc',
		    /^(.*) is now known as (.*)$/, false);
		if (fm instanceof Error) {
			return (fm);
		}
		out.name = fm[1];
		out.message = fm[2];

		pos++;
		if (old_node_name(l, pos) !== 'br') {
			return (new VE('<font> with class "mnc" missing <br>'));
		}
		pos++;

		members[out.name] = true;
		break;

	case 'msc':
		/*
		 * This font tag represents a user setting the room topic.
		 */
		out.type = 'topic';

		fm = old_font_text_match(font, 'msc',
		    /^(.*) has set the subject to: (.*)$/, true);
		if (fm instanceof Error) {
			return (fm);
		}
		out.name = fm[1];
		out.msg = [ fm[2] ];

		/*
		 * Process remaining child elements...
		 */
		res = old_collect_font_children(font, out.msg);
		if (res !== null) {
			return (res);
		}

		pos++;
		if (old_node_name(l, pos) !== 'br') {
			return (new VE('<font> with class "msc" missing <br>'));
		}

		/*
		 * Remove final linefeed.
		 */
		out.msg = out.msg.join('').replace(/\n$/, '');

		members[out.name] = true;
		pos++;
		break;

	case 'mne':
		/*
		 * XXX These are "/me ..." lines.  Unfortunately there is
		 * no delimiter between the nickname and the actual text,
		 * so we're going to have to be clever and try and track
		 * the room occupancy in order to detect who said this.
		 */
		out.type = 'describe';

		if (font.childNodes.length < 1) {
			return (new VE('<font> with class "mne" had too ' +
			    'few children'));
		}
		fch = font.childNodes[0];
		if (fch.nodeName !== '#text' || fch.childNodes) {
			return (new VE('<font> with class "mne" had ' +
			    'malformed initial text child'));
		}

		/*
		 * Decide whose message this is.  Names are tried in sorted
		 * order and the first one that prefixes the text (followed by
		 * a space) wins.
		 */
		var names = Object.keys(members).sort();
		for (var ni = 0; ni < names.length; ni++) {
			if (mod_jsprim.startsWith(fch.value, names[ni] + ' ')) {
				out.name = names[ni];
				out.msg = [ fch.value.substr(
				    out.name.length + 1) ];
				break;
			}
		}
		if (!out.name) {
			return (new VE('unknown name "%s" on describe line',
			    fch.value));
		}

		/*
		 * Process remaining child elements...
		 */
		res = old_collect_font_children(font, out.msg);
		if (res !== null) {
			return (res);
		}

		pos++;
		if (old_node_name(l, pos) !== 'br') {
			return (new VE('<font> with class "mne" missing <br>'));
		}

		/*
		 * Remove final linefeed.
		 */
		out.msg = out.msg.join('').replace(/\n$/, '');
		pos++;
		break;

	case 'mn':
		/*
		 * This font tag contains a Nickname as part of a regular
		 * chat message.
		 */
		out.type = 'message';

		fm = old_font_text_match(font, 'mn', /^<(.*)>$/, false);
		if (fm instanceof Error) {
			return (fm);
		}
		out.name = fm[1].trim();
		out.msg = [];
		pos++;

		/*
		 * Consume nodes up to, but not including, the bare linefeed
		 * text node that ends the line.  If the line runs out first
		 * the loop simply stops, and the newline check at the bottom
		 * of this function reports the problem.
		 */
		var at_eol = false;
		while (!at_eol && pos < l.length) {
			var node = l[pos];

			switch (node.nodeName) {
			case 'a':
				res = process_hyperlink(node);
				if (res instanceof Error) {
					return (res);
				}
				out.msg.push(res);
				pos++;
				break;

			case 'br':
				out.msg.push('\n');
				pos++;
				break;

			case '#text':
				if (node.childNodes) {
					return (new VE('text node with ' +
					    'children'));
				}
				if (node.value === '\n') {
					/*
					 * This should be the end of a line.
					 */
					at_eol = true;
					break;
				}
				out.msg.push(node.value);
				pos++;
				break;

			default:
				return (new VE('unexpected node'));
			}
		}

		/*
		 * Remove final linefeed.
		 */
		out.msg = out.msg.join('').replace(/\n$/, '');

		/*
		 * Remove initial single space separating log line from
		 * timestamp.
		 */
		out.msg = out.msg.replace(/^ /, '');

		members[out.name] = true;
		break;

	case 'msm':
		/*
		 * XXX WTF
		 */
		return ({ type: 'ignore' });

	case 'mk':
		/*
		 * e.g., "cody has been kicked: shoo"
		 * XXX Bah.
		 */
		return ({ type: 'ignore' });

	default:
		return (new VE('unexpected <font> class "%s"', fcls));
	}

	/*
	 * Because of the way ejabberd writes these log files, every line
	 * should terminate with a single line feed.  If not, we have probably
	 * not correctly identified the bounds of this line.
	 */
	if (!l[pos] || !text_node_is_match(l[pos], '\n')) {
		return (new VE('expected newline @ %d', pos));
	}

	return (out);
}
/*
 * Child-node lists of parsed document fragments that carry no log content.
 * Each entry is compared, wrapped in a '#document-fragment' node, against the
 * sanitised parse of a line; a line that is deeply equal to one of them is
 * skipped.  The final, empty entry matches a fragment with no children.
 *
 * Note that the first stylesheet href really does begin "http://http://"; it
 * is listed alongside the well-formed variant.
 */
var SKIPS = (function build_skips() {
	var ns = 'http://www.w3.org/1999/xhtml';

	var attr = function (name, value) {
		return ({ name: name, value: value });
	};

	/* A fragment holding exactly one childless element. */
	var element = function (tag, attrs) {
		return ([ {
			nodeName: tag,
			tagName: tag,
			attrs: attrs,
			namespaceURI: ns,
			childNodes: []
		} ]);
	};

	/* A fragment holding exactly one text node. */
	var text = function (value) {
		return ([ { nodeName: '#text', value: value } ]);
	};

	var stylesheet = function (href) {
		return (element('link', [
			attr('rel', 'stylesheet'),
			attr('type', 'text/css'),
			attr('href', href),
			attr('media', 'all')
		]));
	};

	return ([
		stylesheet('http://http://joyent.com/assets/css/style.css'),
		stylesheet('http://joyent.com/assets/css/style.css'),
		element('meta', [
			attr('http-equiv', 'Content-Type'),
			attr('content', 'text/html; charset=utf-8')
		]),
		element('script', [ attr('type', 'text/javascript') ]),
		text('function sh(e) // Show/Hide an element'),
		text('{if(document.getElementById(e).style.display==\'none\')'),
		text('{document.getElementById(e).style.display=\'block\';}'),
		text('else {document.getElementById(e).style.display=\'none\';}}'),
		element('div', [ attr('class', 'roomtitle') ]),
		element('span', [ attr('class', 'w3c') ]),
		element('style', [ attr('type', 'text/css') ]),
		[]
	]);
})();
/*
 * If "frag" is a document fragment consisting solely of an attribute-less
 * <title> element wrapping a single text node, return that text.  Any other
 * shape yields null.
 */
function
extract_title(frag)
{
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 1) {
		return (null);
	}

	var title = frag.childNodes[0];
	if (title.nodeName !== 'title' ||
	    title.attrs.length !== 0 ||
	    title.childNodes.length !== 1) {
		return (null);
	}

	var text = title.childNodes[0];
	if (text.nodeName !== '#text') {
		return (null);
	}

	return (text.value);
}
/*
 * Decide whether "frag" is a lone <div> that carries no log content and can
 * be passed over.  "last_rc_heading" is the most recently seen room
 * configuration heading (or null).
 *
 * Returns true if the fragment should be skipped, false otherwise.
 */
function
skip_div(frag, last_rc_heading)
{
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 1) {
		return (false);
	}

	var div = frag.childNodes[0];
	if (div.nodeName !== 'div') {
		return (false);
	}

	var cls = find_first_attr(div, 'class');

	/*
	 * The "rc" <div> technically encloses the room configuration block
	 * that follows it, but because every line is parsed as a separate
	 * fragment that structural relationship is interrupted by the line
	 * feed.  It is dropped along with the "roomtitle" and "legend" divs.
	 */
	if ([ 'rc', 'roomtitle', 'legend' ].indexOf(cls) !== -1) {
		return (true);
	}

	/*
	 * The key-value listing that describes how the room is presently
	 * configured is of no interest either.
	 */
	if (cls === 'rcos' && last_rc_heading === 'Room Configuration') {
		return (true);
	}

	/*
	 * From here on only a class-less <div> with exactly one child is a
	 * candidate.
	 */
	if (cls || div.childNodes.length !== 1) {
		return (false);
	}

	var only = div.childNodes[0];
	if (only.nodeName !== 'a') {
		return (false);
	}

	/*
	 * A single link back to the site root, or to one of the known log
	 * index URLs, is safe to skip.
	 */
	var href = find_first_attr(only, 'href');
	var log_roots = [
		'https://connector.joyent.com/logs/',
		'https://jabber.joyent.com/logs/'
	];

	return (href === '/' || log_roots.indexOf(href) !== -1);
}
/*
 * Pull the list of room occupants out of the block that follows a
 * "Room Occupants" heading.  The expected shape is
 *
 *	<div class="rcos"><br><div class="rcot"> ...text and <br>... </div></div>
 *
 * Each text node inside the inner <div> names one occupant, optionally
 * prefixed with "Moderator: " or "Participant: ".
 *
 * Returns an object keyed by occupant name (every value is true), or null if
 * the last heading was something else or the fragment has a different shape.
 */
function
extract_occupants(frag, last_rc_heading)
{
	if (last_rc_heading !== 'Room Occupants') {
		return (null);
	}

	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 1) {
		return (null);
	}

	var outer = frag.childNodes[0];
	if (outer.nodeName !== 'div' ||
	    find_first_attr(outer, 'class') !== 'rcos' ||
	    outer.childNodes.length !== 2 ||
	    outer.childNodes[0].nodeName !== 'br' ||
	    outer.childNodes[1].nodeName !== 'div') {
		return (null);
	}

	var listing = outer.childNodes[1];
	if (find_first_attr(listing, 'class') !== 'rcot') {
		return (null);
	}

	var members = {};
	listing.childNodes.forEach(function (entry) {
		/* Only text nodes name an occupant; <br> and the rest don't. */
		if (entry.nodeName !== '#text') {
			return;
		}

		var who = entry.value.
		    replace(/^Moderator: /, '').
		    replace(/^Participant: /, '');
		members[who] = true;
	});

	return (members);
}
/*
 * Recognise the timezone header line, which parses as exactly
 *
 *	<br><a class="ts">TEXT</a><br>
 *
 * Returns TEXT, or null if the fragment has any other shape.
 */
function
extract_tz(frag)
{
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 3) {
		return (null);
	}

	var before = frag.childNodes[0];
	var anchor = frag.childNodes[1];
	var after = frag.childNodes[2];

	if (before.nodeName !== 'br' ||
	    anchor.nodeName !== 'a' ||
	    after.nodeName !== 'br') {
		return (null);
	}

	if (find_first_attr(anchor, 'class') !== 'ts' ||
	    anchor.childNodes.length !== 1) {
		return (null);
	}

	var label = anchor.childNodes[0];
	return (label.nodeName === '#text' ? label.value : null);
}
/*
 * Recognise a room configuration heading: a fragment holding only
 * <div class="rct">TEXT</div>.
 *
 * Returns TEXT, or null if the fragment has any other shape.
 */
function
extract_rc_heading(frag)
{
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 1) {
		return (null);
	}

	var heading = frag.childNodes[0];
	if (heading.nodeName !== 'div' ||
	    find_first_attr(heading, 'class') !== 'rct' ||
	    heading.childNodes.length !== 1) {
		return (null);
	}

	var label = heading.childNodes[0];
	if (label.nodeName !== '#text') {
		return (null);
	}

	return (label.value);
}
/*
 * Recognise the room identifier line: a fragment holding only
 * <a class="roomjid">TEXT</a>.
 *
 * Returns TEXT, or null if the fragment has any other shape.
 */
function
extract_room_jid(frag)
{
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 1) {
		return (null);
	}

	var anchor = frag.childNodes[0];
	if (anchor.nodeName !== 'a' ||
	    find_first_attr(anchor, 'class') !== 'roomjid' ||
	    anchor.childNodes.length !== 1) {
		return (null);
	}

	var label = anchor.childNodes[0];
	if (label.nodeName !== '#text') {
		return (null);
	}

	return (label.value);
}
/*
 * Recognise the log date line: a fragment holding only a
 * <div class="logdate"> whose two children are a text node followed by a
 * <span>.
 *
 * Returns the value of the text node, or null if the fragment has any other
 * shape.
 */
function
extract_date(frag)
{
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 1) {
		return (null);
	}

	var holder = frag.childNodes[0];
	if (holder.nodeName !== 'div' ||
	    find_first_attr(holder, 'class') !== 'logdate' ||
	    holder.childNodes.length !== 2) {
		return (null);
	}

	var stamp = holder.childNodes[0];
	var trailer = holder.childNodes[1];
	if (stamp.nodeName !== '#text' || trailer.nodeName !== 'span') {
		return (null);
	}

	return (stamp.value);
}
/*
 * Recognise the room subject line: a fragment holding only
 * <div class="roomsubject">TEXT</div>.
 *
 * Returns TEXT, or null if the fragment has any other shape.
 */
function
extract_room_subject(frag)
{
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 1) {
		return (null);
	}

	var holder = frag.childNodes[0];
	if (holder.nodeName !== 'div' ||
	    find_first_attr(holder, 'class') !== 'roomsubject' ||
	    holder.childNodes.length !== 1) {
		return (null);
	}

	var label = holder.childNodes[0];
	if (label.nodeName !== '#text') {
		return (null);
	}

	return (label.value);
}
/*
 * Parse the part shared by every timestamped log line: the leading
 * <a class="ts" name="TIME" href="..."> anchor and the optional single space
 * that follows it.
 *
 *	frag	parsed document fragment for one line
 *	dt	log date string; combined with the anchor's "name" attribute
 *		as dt + 'T' + TIME + 'Z'
 *
 * Returns:
 *	null	if the fragment does not begin with a timestamp anchor
 *	VError	if the anchor is present but date and time do not form a valid
 *		timestamp (for example, no log date has been seen yet); this
 *		used to surface as a RangeError thrown by toISOString()
 *	object	{ time, pos } where "time" is the ISO 8601 timestamp and "pos"
 *		is the index of the first child node after the timestamp (and
 *		after the separating space, if there was one)
 */
function
extract_message_common(frag, dt)
{
	/*
	 * These lines must start with an <a> tag containing a timestamp.
	 */
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length < 1 ||
	    frag.childNodes[0].nodeName !== 'a') {
		return (null);
	}

	var ts = frag.childNodes[0];
	var name;

	/*
	 * The anchor must also carry an "href", although its value is not
	 * otherwise used here.
	 */
	if (find_first_attr(ts, 'class') !== 'ts' ||
	    (name = find_first_attr(ts, 'name')) === null ||
	    find_first_attr(ts, 'href') === null) {
		return (null);
	}

	var when = new Date(dt + 'T' + name + 'Z');
	if (isNaN(when.getTime())) {
		return (new VE('invalid timestamp "%s" for log date "%s"',
		    name, String(dt)));
	}

	var out = {
		time: when.toISOString(),
		pos: 1
	};

	if (out.pos < frag.childNodes.length &&
	    text_node_is_match(frag.childNodes[out.pos], ' ')) {
		/*
		 * Skip a single space right after the timestamp.
		 */
		out.pos++;
	}

	return (out);
}
/*
 * Collect the remainder of a line into the message text.
 *
 * On entry "out.pos" is the index, within frag.childNodes, of the last node
 * already consumed and "out.msg" is an array of string pieces gathered so
 * far.  Every later child is appended: hyperlinks contribute their URL, <br>
 * contributes a linefeed and text nodes contribute their value.
 *
 * On success "out.msg" is replaced by the joined string with one trailing
 * linefeed and one leading space removed, "out.pos" is deleted, and "out" is
 * returned.  A VError is returned for a bad hyperlink, a text node with
 * children, a bare linefeed text node, or any other kind of node.  The "dt"
 * argument is not used.
 */
function
extract_message_tail(frag, dt, out)
{
	for (out.pos++; out.pos < frag.childNodes.length; out.pos++) {
		var node = frag.childNodes[out.pos];

		if (node.nodeName === 'a') {
			var url = process_hyperlink(node);
			if (url instanceof Error) {
				return (url);
			}
			out.msg.push(url);
		} else if (node.nodeName === 'br') {
			out.msg.push('\n');
		} else if (node.nodeName === '#text') {
			if (node.childNodes) {
				return (new VE('text node with children'));
			}
			if (node.value === '\n') {
				return (new VE('unexpected raw newline'));
			}
			out.msg.push(node.value);
		} else {
			return (new VE('unexpected node: %j', node));
		}
	}

	/*
	 * Drop the final linefeed, then the single space that separated the
	 * text from whatever preceded it.
	 */
	out.msg = out.msg.join('').replace(/\n$/, '').replace(/^ /, '');

	delete (out.pos);
	return (out);
}
/*
 * Parse a regular chat message.  The node at frag.childNodes[out.pos] must be
 * <font class="mn"> containing a single text node of the form "<NICK>".
 *
 * Sets out.type to 'message', out.name to NICK and out.msg to an empty array,
 * then hands the rest of the line to extract_message_tail() and returns its
 * result.  Returns a VError if the <font> node is not of the expected form.
 */
function
extract_message(frag, dt, out)
{
	var font = frag.childNodes[out.pos];

	var well_formed = font.nodeName === 'font' &&
	    find_first_attr(font, 'class') === 'mn' &&
	    font.childNodes.length === 1 &&
	    font.childNodes[0].nodeName === '#text';

	var nick = well_formed ?
	    font.childNodes[0].value.match(/^<(.*)>$/) : null;

	if (nick === null) {
		return (new VE('malformed message: %j', frag));
	}

	out.type = 'message';
	out.name = nick[1];
	out.msg = [];

	return (extract_message_tail(frag, dt, out));
}
/*
 * Parse a "/me ..." line.  The node at frag.childNodes[out.pos] must be
 * <font class="mne"> whose first child is a text node beginning with the
 * speaker's nickname, and the <font> must be followed by a <br>.
 *
 * There is no delimiter between the nickname and the text, so the speaker is
 * identified by trying each key of "members" in sorted order; the first name
 * that prefixes the text (followed by a space) wins.
 *
 * Sets out.type to 'describe', out.name to the matched nickname and out.msg
 * to the remaining text, then lets extract_message_tail() append the rest of
 * the <font> node's children and returns its result.
 *
 * Returns a VError if the <font> node is missing or malformed, if no known
 * nickname matches, or if the <font> is not followed by a <br> (including
 * when it is the last node on the line, which used to throw a TypeError).
 */
function
extract_describe(frag, dt, out, members)
{
	var mne = frag.childNodes[out.pos];

	if (!mne || mne.nodeName !== 'font' ||
	    find_first_attr(mne, 'class') !== 'mne' ||
	    mne.childNodes.length < 1 ||
	    mne.childNodes[0].nodeName !== '#text') {
		return (new VE('malformed message: %j', frag));
	}

	var text = mne.childNodes[0].value;

	out.type = 'describe';

	/*
	 * Decide whose message this is...
	 */
	var names = Object.keys(members).sort();
	for (var ni = 0; ni < names.length; ni++) {
		if (mod_jsprim.startsWith(text, names[ni] + ' ')) {
			out.name = names[ni];
			out.msg = [ text.substr(out.name.length + 1) ];
			break;
		}
	}
	if (!out.name) {
		return (new VE('unknown name on describe line: "%s"', text));
	}

	var next = frag.childNodes[out.pos + 1];
	if (!next || next.nodeName !== 'br') {
		return (new VE('describe should end with a <br>: %j', frag));
	}

	/*
	 * XXX The tail is read from the <font> node's own children rather
	 * than from the fragment.  Position 0 (the text handled above) is the
	 * last consumed node, so extract_message_tail() resumes at child 1.
	 */
	out.pos = 0;
	return (extract_message_tail(mne, dt, out));
}
/*
 * Parse one timestamped log line into a message object.
 *
 *	frag	parsed document fragment for the line
 *	dt	log date string, passed on to extract_message_common()
 *	members	object keyed by known nickname; passed to extract_describe()
 *
 * The class of the <font> node that follows the timestamp selects the kind
 * of line:
 *
 *	msm, mrcm	{ type: 'ignore' } (without any other field)
 *	ml		type 'part';   name = who left, msg = null
 *	mj		type 'join';   name = who joined, msg = null
 *	mnc		type 'rename'; name = old nickname, msg = new nickname
 *	msc		type 'topic';  name = who set it, msg = topic text
 *	mn		handled by extract_message()
 *	mne		handled by extract_describe()
 *
 * Returns the message object, null if the line is not a timestamped line or
 * the <font> class is not recognised, or a VError if the line is malformed.
 * A line that ends straight after the timestamp is reported as a VError (it
 * used to throw a TypeError).
 */
function
extract_metamsg(frag, dt, members)
{
	var out = extract_message_common(frag, dt);
	if (out === null || out instanceof Error) {
		return (out);
	}

	var fch = frag.childNodes[out.pos];
	if (!fch) {
		return (new VE('expected <font> node after timestamp, but ' +
		    'the line ended'));
	}
	if (fch.nodeName !== 'font') {
		return (new VE('expected <font> node, not "%s"',
		    fch.nodeName));
	}

	var fcls = find_first_attr(fch, 'class');
	var pat;

	switch (fcls) {
	case 'msm':
		/*
		 * XXX Some kind of system message?
		 */
		return ({ type: 'ignore' });

	case 'ml':
		/*
		 * This font tag represents a user leaving a room.
		 *
		 * XXX Note that technically a user can provide a
		 * message when they leave, but we're just going to
		 * drop that on the floor.
		 */
		out.type = 'part';
		pat = /^(.*) leaves the room/;
		break;

	case 'mj':
		out.type = 'join';
		pat = /^(.*) joins the room/;
		break;

	case 'mnc':
		out.type = 'rename';
		pat = /^(.*) is now known as (.*)$/;
		break;

	case 'msc':
		out.type = 'topic';
		pat = /^(.*) has set the subject to: (.*)$/;
		break;

	case 'mn':
		return (extract_message(frag, dt, out));

	case 'mne':
		return (extract_describe(frag, dt, out, members));

	case 'mrcm':
		var valid_values = [
			'Chatroom configuration modified',
			'Chatroom is started',
			'Chatroom is created',
			'Chatroom is stopped',
			'Chatroom is destroyed'
		];

		if (fch.childNodes.length !== 1) {
			return (new VE('<font> with class "mrcm" had too ' +
			    'many children'));
		}
		if (fch.childNodes[0].nodeName !== '#text' ||
		    valid_values.indexOf(fch.childNodes[0].value) === -1) {
			return (new VE('<font> with class "mrcm" had ' +
			    'malformed child'));
		}

		/*
		 * We don't care about these messages at all.
		 */
		return ({ type: 'ignore' });

	default:
		return (null);
	}

	/*
	 * Only the pattern-driven kinds (part, join, rename, topic) reach
	 * this point.  Their <font> node must wrap exactly one text node.
	 */
	if (fch.childNodes.length !== 1 ||
	    fch.childNodes[0].nodeName !== '#text') {
		return (new VE('malformed "%s" <font> child',
		    out.type));
	}

	var m = fch.childNodes[0].value.match(pat);
	if (!m) {
		return (new VE('unrecognised "%s" text: %j', out.type, fch));
	}

	out.name = m[1];

	if (fcls === 'msc') {
		/*
		 * The topic text may continue in the nodes that follow the
		 * <font> node.
		 */
		out.msg = [ m[2] ];
		return (extract_message_tail(frag, dt, out));
	}

	/*
	 * Only the rename pattern has a second capture group.
	 */
	out.msg = m[2] ? m[2] : null;

	delete (out.pos);
	return (out);
}
/*
 * Convert the HTML text of one day's room log into an array of message
 * objects.
 *
 *	room		expected room identifier; a "roomjid" line naming any
 *			other room aborts processing
 *	data		the complete log document, as a string
 *	just_dump	accepted but not used by this function
 *
 * Returns the array of messages (each tagged with "room"), or a VError if
 * the log names a different room, carries an unparseable date, or ends
 * inside a comment.  A malformed message line, or a line that nothing
 * recognises, is reported on stderr and terminates the process with exit
 * status 1.  Progress information is written to stdout; setting V=1 in the
 * environment additionally prints every message.
 */
function
proc_data(room, data, just_dump)
{
	/*
	 * Though the documents we are processing are technically HTML,
	 * they are really constructed iteratively by the Jabber server.
	 * Most lines essentially stand alone as a document fragment.
	 * In order to handle times when the document is not a well-formed
	 * selection of fragments, we process line by line.
	 */
	var lines = data.split('\n');
	var out = [];
	var last_rc_heading = null;
	var log_date;

	/*
	 * Track room membership.
	 * XXX We probably need to do this in two passes; gather members
	 * first, and then later resolve "describe" lines.
	 */
	var members = {
		james: true,
		aconbere: true,
		pedro: true,
		'scott.mcwhirter': true,
		ryan: true,
		marsell: true,
		bot: true,
		jclulow: true,
		alexwilson: true,
		pmooney: true,
		jperkin: true
	};

	/*
	 * Link targets that identify a back matter line.
	 */
	var powers = [
		'http://www.ejabberd.im',
		'http://www.erlang.org/',
		'http://validator.w3.org/check?uri=referer',
		'http://jigsaw.w3.org/css-validator/'
	];

	var in_comment = false;

	for (var i = 0; i < lines.length; i++) {
		var l = lines[i];
		var v;

		if (!l) {
			/*
			 * Skip blank lines.
			 */
			continue;
		}

		/*
		 * Everything from a "<!--" line up to and including the
		 * matching "//-->" line is discarded.
		 */
		if (l === '<!--') {
			in_comment = true;
			continue;
		}
		if (in_comment) {
			if (l === '//-->') {
				in_comment = false;
			}
			continue;
		}

		var frag = sanitise(mod_parse5.parseFragment(l));

		var skip = false;
		for (var j = 0; j < SKIPS.length; j++) {
			var t = { nodeName: '#document-fragment',
			    childNodes: SKIPS[j] };

			if (mod_jsprim.deepEqual(frag, t)) {
				skip = true;
				break;
			}
		}
		if (skip) {
			continue;
		}

		if (skip_div(frag, last_rc_heading)) {
			continue;
		}

		if (frag.childNodes.length === 2 &&
		    text_node_is_match(frag.childNodes[0], ' ') &&
		    frag.childNodes[1].nodeName === 'a') {
			var href = find_first_attr(frag.childNodes[1],
			    'href');

			if (powers.indexOf(href) !== -1) {
				console.log('BACK MATTER URL: %s', href);
				continue;
			}
		}

		var title = extract_title(frag);
		if (title !== null) {
			console.log('TITLE: %s', title);
			continue;
		}

		if ((v = extract_room_jid(frag)) !== null) {
			if (room !== v) {
				return (new VE('unexpected room "%s"',
				    v));
			}
			continue;
		}

		if ((v = extract_date(frag)) !== null) {
			/*
			 * toISOString() throws on an invalid date, so report
			 * an unparseable header as an error instead.
			 *
			 * NOTE(review): how "v" is interpreted (local time
			 * vs. UTC) depends on its format, so the calendar
			 * day taken from the UTC rendering may depend on the
			 * process timezone -- confirm against real logs.
			 */
			var parsed = new Date(v);
			if (isNaN(parsed.getTime())) {
				return (new VE('invalid log date "%s"', v));
			}
			log_date = parsed.toISOString().
			    replace(/T.*/, '');
			console.log('LOG DATE: %s', log_date);
			continue;
		}

		var room_subject = extract_room_subject(frag);
		if (room_subject !== null) {
			/*
			 * XXX need to grab the username out.  sigh.
			 */
			console.log('ROOM SUBJECT: %s', room_subject);
			continue;
		}

		if ((v = extract_occupants(frag, last_rc_heading)) !== null) {
			var v_members = Object.keys(v);
			console.log('MEMBERS: %s', v_members.join(', '));
			v_members.forEach(function (m) {
				members[m] = true;
			});
			continue;
		}

		var tz = extract_tz(frag);
		if (tz !== null) {
			console.log('LOG TIMEZONE: %s', tz);
			continue;
		}

		/*
		 * XXX NB: this must be after the functions that use
		 * "last_rc_heading".
		 */
		if ((last_rc_heading = extract_rc_heading(frag)) !== null) {
			console.log('ROOM CONFIGURATION: %s',
			    last_rc_heading);
			continue;
		}

		var msg = extract_metamsg(frag, log_date, members);
		if (msg instanceof Error) {
			console.error('ERROR: %s', VE.fullStack(msg));
			process.exit(1);
		} else if (msg !== null) {
			if (msg.type === 'ignore') {
				continue;
			}

			msg.room = room;

			if (process.env.V === '1') {
				console.log('MESSAGE: %s', JSON.stringify(msg,
				    null, 4));
			}

			if (msg.name) {
				members[msg.name] = true;
			}

			/*
			 * For a rename, "msg" holds the new nickname.  Track
			 * it as well, so that a later "/me" line written
			 * under the new name can be attributed.
			 */
			if (msg.type === 'rename' && msg.msg) {
				members[msg.msg] = true;
			}

			out.push(msg);
			continue;
		}

		console.error('unknown fragment: %s', mod_util.inspect(
		    frag, false, 32, true));
		process.exit(1);
	}

	if (in_comment) {
		return (new VE('document terminated while inside a comment'));
	}

	return (out);
}
function | |
proc_data_old(data, just_dump) | |
{ | |
var doc = mod_parse5.parse(data); | |
if (just_dump) { | |
return (sanitise(doc)); | |
} | |
/* | |
* Track room membership. | |
* XXX We probably need to do this in two passes; gather members | |
* first, and then later resolve "describe" lines. | |
*/ | |
var members = { | |
james: true, | |
aconbere: true, | |
pedro: true, | |
'scott.mcwhirter': true, | |
ryan: true, | |
marsell: true, | |
bot: true, | |
jclulow: true, | |
alexwilson: true, | |
pmooney: true, | |
jperkin: true, | |
}; | |
var body = find_first_child(find_first_child(doc, 'html'), 'body'); | |
var i = 0; | |
var c; | |
var dt; | |
while (i < body.childNodes.length) { | |
c = body.childNodes[i++]; | |
if (c.nodeName !== 'div' || | |
find_first_attr(c, 'class') !== 'logdate' || | |
c.childNodes.length < 2) { | |
continue; | |
} | |
// console.error('DATE: %s', c.childNodes[0].value); | |
dt = (new Date(c.childNodes[0].value)). | |
toISOString().replace(/T.*/, ''); | |
// console.error('DATE: %s', dt); | |
break; | |
} | |
if (dt === undefined) { | |
return (new VE('could not find date header')); | |
} | |
var before_occup_check = i; | |
var occupants_found = false; | |
while (i < body.childNodes.length) { | |
c = body.childNodes[i++]; | |
/* | |
* Is this a "room configuration" style <DIV>? | |
*/ | |
if (c.nodeName !== 'div' || | |
find_first_attr(c, 'class') !== 'rc' || | |
c.childNodes.length !== 5) { | |
continue; | |
} | |
/* | |
* Make sure this is the "Room Occupants" block: | |
*/ | |
var rct = c.childNodes[1]; | |
if (rct.nodeName !== 'div' || | |
find_first_attr(rct, 'class') !== 'rct' || | |
rct.childNodes[0].nodeName !== '#text' || | |
rct.childNodes[0].value !== 'Room Occupants') { | |
continue; | |
} | |
var rcos = c.childNodes[3]; | |
if (rcos.nodeName !== 'div' || | |
find_first_attr(rcos, 'class') !== 'rcos' || | |
rcos.childNodes.length !== 2) { | |
continue; | |
} | |
var rcot = rcos.childNodes[1]; | |
if (rcot.nodeName !== 'div' || | |
find_first_attr(rcot, 'class') !== 'rcot') { | |
continue; | |
} | |
for (var q = 0; q < rcot.childNodes.length; q++) { | |
var cc = rcot.childNodes[q]; | |
if (cc.nodeName === 'br') { | |
continue; | |
} | |
if (cc.nodeName === '#text') { | |
var name = cc.value. | |
replace(/^Moderator: /, ''). | |
replace(/^Participant: /, ''); | |
// console.error('\tINITIAL MEMBER %j', name); | |
members[name] = true; | |
} | |
} | |
occupants_found = true; | |
break; | |
} | |
if (!occupants_found) { | |
i = before_occup_check; /* XXX */ | |
// return (new VE('could not find initial room occupants')); | |
} | |
var top_ts_found = false; | |
while (i < body.childNodes.length) { | |
c = body.childNodes[i++]; | |
if (c.nodeName !== 'a') { | |
continue; | |
} | |
if (find_first_attr(c, 'class') !== 'ts') { | |
continue; | |
} | |
if (c.childNodes.length !== 1) { | |
continue; | |
} | |
var t = find_first_child(c, '#text'); | |
if (!t || t.value !== 'GMT+0') { | |
continue; | |
} | |
//console.error('FOUND GMT+0 STAMP @ IDX %d', i - 1); | |
top_ts_found = true; | |
break; | |
} | |
if (!top_ts_found) { | |
return (new VE('could not find GMT+0 stamp')); | |
} | |
/* | |
* A line in the log begins with an <A> tag with the class "ts", | |
* as well as an "id" and "name" field which contain a time stamp. | |
* There is a #text node within the <A> node which contains the | |
* formatted timestamp. | |
* | |
* Now that we've found the point at which the GMT+0 timestamp | |
* header appears, we discard nodes until we locate one of these | |
* timestamp anchors. | |
*/ | |
var accum = null; | |
var out = []; | |
var err_return = null; | |
var wait_for_ts = true; | |
var commit = function () { | |
if (accum === null || accum.length === 0) { | |
/* | |
* Nothing to commit. | |
*/ | |
return (true); | |
} | |
var res = proc_html_line(dt, members, accum); | |
if (res instanceof Error) { | |
err_return = VE({ cause: res, | |
info: { obj: sanitise(accum) }}, | |
'proc_html_line'); | |
return (false); | |
} | |
if (res.type !== 'ignore') { | |
// console.log('%j', res); /* XXX */ | |
out.push(res); | |
} | |
accum = []; | |
return (true); | |
}; | |
while (i < body.childNodes.length) { | |
var ts; | |
c = body.childNodes[i++]; | |
if (c.nodeName === 'div') { | |
var dcls = find_first_attr(c, 'class'); | |
switch (dcls) { | |
case 'rc': | |
/* | |
* Sadly, it seems that "Room Configuration" | |
* blocks can appear basically anywhere. | |
* Skip them if they are amongst the regular | |
* log line population. | |
*/ | |
if (!commit()) { | |
return (err_return); | |
} | |
wait_for_ts = true; | |
accum = null; | |
continue; | |
case 'legend': | |
/* | |
* At the end of the messages section in | |
* the log, there is a <div> with class | |
* "legend". We use this to detect the | |
* well-formed end of a log file. | |
*/ | |
if (!commit()) { | |
return (err_return); | |
} | |
return (out); | |
default: | |
//unexpected(c); | |
return (new VE('unexpected <div> class "%s"', | |
dcls)); | |
} | |
} | |
if (c.nodeName !== 'a' || | |
find_first_attr(c, 'class') !== 'ts' || | |
(name = find_first_attr(c, 'name')) === null || | |
c.childNodes.length !== 1 || | |
(ts = find_first_child(c, '#text')) === null) { | |
/* | |
* This is not the anchor node we are looking for. | |
*/ | |
if (!wait_for_ts) { | |
accum.push(c); | |
} | |
continue; | |
} | |
if (wait_for_ts) { | |
/* | |
* We were waiting for a time stamp to resync | |
* with the log stream, and we've found one. | |
*/ | |
accum = [ c ]; | |
wait_for_ts = false; | |
} else { | |
/* | |
* We were accumulating an existing log message | |
* and have found another time stamp. Commit | |
* the existing log message, then start a new | |
* line. | |
*/ | |
if (!commit()) { | |
return (err_return); | |
} | |
accum.push(c); | |
} | |
} | |
/* | |
* Special case for when the Jabber server apparently just stopped | |
* writing to the log file before the end... | |
*/ | |
/* | |
var last = body.childNodes[body.childNodes.length - 1]; | |
var lastbutone = body.childNodes[body.childNodes.length - 2]; | |
if (text_node_is_match(last, '\n') && | |
lastbutone.nodeName === 'br') { | |
console.error('TRUNCATED FILE?'); | |
if (!commit()) { | |
return (err_return); | |
} | |
return (out); | |
}*/ | |
return (new VE({ | |
info: { obj: sanitise(accum) }, | |
}, 'did not find closing <div> tag before EOF')); | |
} | |
/*
 * Public interface of this module.  The only symbol made available to
 * callers of require() is "proc_data".
 */
module.exports = { proc_data: proc_data };
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment