Skip to content

Instantly share code, notes, and snippets.

@jclulow
Created August 3, 2017 00:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jclulow/b54d56217ad16a04bf193a5920c4b534 to your computer and use it in GitHub Desktop.
Save jclulow/b54d56217ad16a04bf193a5920c4b534 to your computer and use it in GitHub Desktop.
#!/usr/bin/env node
/* vim: set ts=8 sts=8 sw=8 noet: */
var mod_assert = require('assert-plus');
var mod_util = require('util');
var mod_parse5 = require('parse5');
var mod_jsprim = require('jsprim');
var mod_verror = require('verror');
var VE = mod_verror.VError;
var lib_common = require('./common');
var find_first_attr = lib_common.find_first_attr;
var find_first_child = lib_common.find_first_child;
var text_node_is_match = lib_common.text_node_is_match;
/*
 * Produce a deep, JSON-safe copy of a parsed node (or array of nodes) in
 * which every "parentNode" property has been dropped.  The copy is made by
 * a round trip through JSON, so the result contains only plain objects,
 * arrays and primitives.
 */
function
sanitise(node)
{
	function drop_parent_links(prop, val) {
		return (prop === 'parentNode' ? undefined : val);
	}

	var encoded = JSON.stringify(node, drop_parent_links);

	return (JSON.parse(encoded));
}
/*
 * Validate an <a> element found inside a message and reduce it to its URL.
 *
 * A well-formed link has an "href" attribute and exactly one child, a text
 * node with no children of its own, whose text is identical to the href.
 *
 * Returns the href string, or a VError describing what was wrong.
 */
function
process_hyperlink(link)
{
	var target = find_first_attr(link, 'href');
	if (!target || link.childNodes.length !== 1) {
		return (new VE('malformed <a> #1'));
	}

	var label = link.childNodes[0];
	if (label.nodeName !== '#text' || label.childNodes) {
		return (new VE('malformed <a> #2'));
	}

	/*
	 * XXX Double quotes inside URLs were not always escaped correctly
	 * when the log was written.  The symptom is a stray, empty-valued
	 * attribute whose name contains a double quote.  When one is
	 * present the text/URL comparison below cannot be trusted, so it
	 * is waived.
	 */
	var has_stray_quote = link.attrs.some(function (attr) {
		return (attr.name.indexOf('"') !== -1 && attr.value === '');
	});

	if (has_stray_quote || label.value === target) {
		return (target);
	}

	return (new VE('<a> href/text mismatch: %j',
	    sanitise(link)));
}
/*
 * OLD_proc_html_line(dt, members, l): legacy parser for a single log line.
 *
 * NOTE(review): nothing in the visible file calls this function by this
 * name.  proc_data_old() calls "proc_html_line" with the same three
 * arguments, which is presumably what this function used to be called --
 * confirm before relying on either.
 *
 * dt       date string; combined with the timestamp anchor's "name"
 *          attribute as dt + 'T' + name + 'Z' to form the line's time
 * members  object used as a set of nicknames; read to attribute "mne"
 *          (describe) lines, and updated as nicknames are seen
 * l        array of nodes making up one line, beginning with the
 *          <a class="ts"> timestamp anchor
 *
 * Returns an object describing the line ("time", "type", and type-specific
 * fields such as "name", "msg" or "message"), a bare { type: 'ignore' }
 * object, or a VError if the line is malformed.
 */
function
OLD_proc_html_line(dt, members, l)
{
//console.error('%s', mod_util.inspect(l, false, 10, true));
//console.error('-----------------------');
var name, href;
/*
* The first node must be the timestamp anchor: class "ts", with a "name"
* attribute and an "href" that is "#" followed by that same name.
*/
if (l[0].nodeName !== 'a' || l[0].tagName !== 'a' ||
find_first_attr(l[0], 'class') !== 'ts' ||
(name = find_first_attr(l[0], 'name')) === null ||
(href = find_first_attr(l[0], 'href')) === null ||
//id !== name ||
href !== ('#' + name)) {
return (new VE('invalid timestamp: %j', sanitise(l)));
}
var out = {
time: new Date(dt + 'T' + name + 'Z').toISOString()
};
var pos = 1;
/*
* NOTE(review): a line with nothing after the timestamp is only logged
* here; execution continues, so the l[pos] accesses below will still be
* made against an undefined node.
*/
if (!l[pos]) {
console.log('%j', sanitise(l));
}
if (text_node_is_match(l[pos], ' ')) {
/*
* Skip a single space right after the timestamp.
*/
pos++;
}
if (l[pos].nodeName !== 'font') {
return (new VE('expected <font> node, not "%s"',
l[pos].nodeName));
}
/*
* The class of the <font> element selects the kind of log line.
*/
var fcls = find_first_attr(l[pos], 'class');
var fch, fm;
var ci, cic, res;
switch (fcls) {
case 'mrcm':
/*
* Room lifecycle/configuration notices.  These are validated against
* a fixed list of texts and then marked to be ignored.
*/
var valid_values = [
'Chatroom configuration modified',
'Chatroom is started',
'Chatroom is created',
'Chatroom is stopped',
'Chatroom is destroyed',
];
out.type = 'ignore';
if (l[pos].childNodes.length !== 1) {
return (new VE('<font> with class "mrcm" had too ' +
'many children'));
}
fch = l[pos].childNodes[0];
if (fch.nodeName !== '#text' ||
valid_values.indexOf(fch.value) === -1 ||
fch.childNodes) {
return (new VE('<font> with class "mrcm" had ' +
'malformed child'));
}
pos++;
if (l[pos].nodeName !== 'br') {
return (new VE('<font> with class "mrcm" missing ' +
'<br>'));
}
pos++;
break;
case 'ml':
/*
* This font tag represents a user leaving a room.
*
* XXX Note that technically a user can provide a message when
* they leave, but we're just going to drop that on the floor.
*/
out.type = 'part';
if (l[pos].childNodes.length < 1) {
return (new VE('<font> with class "ml" had too ' +
'few children'));
}
fch = l[pos].childNodes[0];
if (fch.nodeName !== '#text' ||
(fm = fch.value.match(/^(.*) leaves the room/)) === null ||
fch.childNodes) {
return (new VE('<font> with class "ml" had ' +
'malformed child'));
}
out.name = fm[1];
pos++;
if (l[pos].nodeName !== 'br') {
return (new VE('<font> with class "ml" missing <br>'));
}
pos++;
//if (!members[out.name]) {
//console.error('USER "%s" LEAVING, BUT NOT PRESENT',
//out.name);
//process.exit(1);
//}
/*
* Note that the nickname is added to the set, not removed, even
* though this line records the user leaving.
*/
members[out.name] = true;
break;
case 'mj':
/*
* This font tag represents a user joining a room.
*/
out.type = 'join';
if (l[pos].childNodes.length !== 1) {
return (new VE('<font> with class "mj" had too ' +
'many children'));
}
fch = l[pos].childNodes[0];
if (fch.nodeName !== '#text' ||
(fm = fch.value.match(/^(.*) joins the room/)) === null ||
fch.childNodes) {
return (new VE('<font> with class "mj" had ' +
'malformed child'));
}
out.name = fm[1];
pos++;
if (l[pos].nodeName !== 'br') {
return (new VE('<font> with class "mj" missing <br>'));
}
pos++;
//if (!members[out.name]) {
//console.error('USER "%s" JOINING, BUT NOT PRESENT',
//out.name);
//process.exit(1);
//}
members[out.name] = true;
break;
case 'mnc':
/*
* This font tag represents a user changing nicknames.
* (Sigh.)
*/
out.type = 'rename';
if (l[pos].childNodes.length !== 1) {
return (new VE('<font> with class "mnc" had too ' +
'many children'));
}
fch = l[pos].childNodes[0];
if (fch.nodeName !== '#text' ||
(fm = fch.value.match(/^(.*) is now known as (.*)$/)) ===
null ||
fch.childNodes) {
return (new VE('<font> with class "mnc" had ' +
'malformed child'));
}
/*
* "name" is the old nickname and "message" the new one.  Only the
* old nickname is recorded in the members set here.
*/
out.name = fm[1];
out.message = fm[2];
pos++;
if (l[pos].nodeName !== 'br') {
return (new VE('<font> with class "mnc" missing <br>'));
}
pos++;
//if (!members[out.name]) {
//console.error('USER "%s" JOINING, BUT NOT PRESENT',
//out.name);
//process.exit(1);
//}
members[out.name] = true;
break;
case 'msc':
/*
* This font tag represents a user setting the room topic.
*/
out.type = 'topic';
if (l[pos].childNodes.length < 1) {
return (new VE('<font> with class "msc" had too ' +
'few children'));
}
fch = l[pos].childNodes[0];
if (fch.nodeName !== '#text' ||
(fm = fch.value.match(
/^(.*) has set the subject to: (.*)$/)) === null ||
fch.childNodes) {
return (new VE('<font> with class "msc" had ' +
'malformed child'));
}
out.name = fm[1];
out.msg = [ fm[2] ];
/*
* Process remaining child elements...
*/
for (ci = 1; ci < l[pos].childNodes.length; ci++) {
cic = l[pos].childNodes[ci];
switch (cic.nodeName) {
case 'a':
res = process_hyperlink(cic);
if (res instanceof Error) {
return (res);
}
out.msg.push(res);
continue;
case 'br':
out.msg.push('\n');
continue;
case '#text':
if (cic.childNodes) {
return (new VE('text node with ' +
'children'));
}
out.msg.push(cic.value);
continue;
default:
return (new VE('unexpected node %j',
sanitise(cic)));
}
}
pos++;
if (l[pos].nodeName !== 'br') {
return (new VE('<font> with class "msc" missing <br>'));
}
/*
* Remove final linefeed.
*/
out.msg = out.msg.join('').replace(/\n$/, '');
//if (!members[out.name]) {
//console.error('USER "%s" JOINING, BUT NOT PRESENT',
//out.name);
//process.exit(1);
//}
members[out.name] = true;
pos++;
break;
case 'mne':
/*
* XXX These are "/me ..." lines. Unfortunately there is
* no delimiter between the nickname and the actual text,
* so we're going to have to be clever and try and track
* the room occupancy in order to detect who said this.
*/
out.type = 'describe';
if (l[pos].childNodes.length < 1) {
return (new VE('<font> with class "mne" had too ' +
'few children'));
}
fch = l[pos].childNodes[0];
if (fch.nodeName !== '#text' || fch.childNodes) {
return (new VE('<font> with class "mne" had ' +
'malformed initial text child'));
}
/*
* Decide whose message this is...
*
* The first known nickname (in sorted order) that prefixes the text,
* followed by a space, wins.  Because the loop breaks on that first
* match, the "amgiguous name" check inside it cannot fire unless
* out.name was already set before the loop, which nothing above does.
*/
var names = Object.keys(members).sort();
for (var ni = 0; ni < names.length; ni++) {
if (mod_jsprim.startsWith(fch.value, names[ni] + ' ')) {
if (out.name) {
return (new VE('amgiguous name: ' +
'"%s" and "%s"', out.name,
names[ni]));
}
out.name = names[ni];
out.msg = [ fch.value.substr(
out.name.length + 1) ];
break;
}
}
if (!out.name) {
return (new VE('unknown name "%s" on describe line',
fch.value));
}
/*
* Process remaining child elements...
*/
for (ci = 1; ci < l[pos].childNodes.length; ci++) {
cic = l[pos].childNodes[ci];
switch (cic.nodeName) {
case 'a':
res = process_hyperlink(cic);
if (res instanceof Error) {
return (res);
}
out.msg.push(res);
continue;
case 'br':
out.msg.push('\n');
continue;
case '#text':
if (cic.childNodes) {
return (new VE('text node with ' +
'children'));
}
out.msg.push(cic.value);
continue;
default:
return (new VE('unexpected node %j',
sanitise(cic)));
}
}
pos++;
if (l[pos].nodeName !== 'br') {
return (new VE('<font> with class "mne" missing <br>'));
}
/*
* Remove final linefeed.
*/
out.msg = out.msg.join('').replace(/\n$/, '');
pos++;
break;
case 'mn':
/*
* This font tag contains a Nickname as part of a regular
* chat message.
*/
out.type = 'message';
if (l[pos].childNodes.length !== 1) {
return (new VE('<font> with class "mn" had too ' +
'many children'));
}
fch = l[pos].childNodes[0];
if (fch.nodeName !== '#text' ||
(fm = fch.value.match(/^<(.*)>$/)) === null ||
fch.childNodes) {
return (new VE('<font> with class "mn" had ' +
'malformed child'));
}
out.name = fm[1].trim();
out.msg = [];
pos++;
/*
* Unlike the arms above, the message body is made up of the nodes
* that follow the <font> in the line itself, not of its children.
* Consume them until a bare "\n" text node is reached; "pos" is left
* pointing at that node for the end-of-line check after the switch.
*/
for (;;) {
switch (l[pos].nodeName) {
case 'a':
res = process_hyperlink(l[pos]);
if (res instanceof Error) {
return (res);
}
out.msg.push(res);
pos++;
continue;
case 'br':
out.msg.push('\n');
pos++;
continue;
case '#text':
if (l[pos].childNodes) {
return (new VE('text node with ' +
'children'));
}
if (l[pos].value === '\n') {
/*
* This should be the end of a line.
*/
break;
}
out.msg.push(l[pos].value);
pos++;
continue;
default:
return (new VE('unexpected node'));
}
/*
* If the case arm does not terminate with "continue",
* this line is finished.
*/
break;
}
/*
* Remove final linefeed.
*/
out.msg = out.msg.join('').replace(/\n$/, '');
/*
* Remove initial single space separating log line from
* timestamp.
*/
out.msg = out.msg.replace(/^ /, '');
members[out.name] = true;
break;
case 'msm':
/*
* XXX WTF
*
* Returned immediately: neither the timestamp nor the end-of-line
* check below applies to these lines.
*/
return ({ type: 'ignore' });
case 'mk':
/*
* e.g., "cody has been kicked: shoo"
* XXX Bah.
*/
return ({ type: 'ignore' });
default:
return (new VE('unexpected <font> class "%s"', fcls));
}
/*
* Because of the way ejabberd writes these log files, every line
* should terminate with a single line feed. If not, we have probably
* not correctly identified the bounds of this line.
*/
if (!text_node_is_match(l[pos], '\n')) {
return (new VE('expected newline @ %d', pos));
}
//console.error('%s', mod_util.inspect(out, false, 10, true));
//console.log('%j', out);
return (out);
//console.error();
//console.error();
}
/*
 * Templates for log lines that carry no information and are skipped.
 *
 * Each entry is the complete "childNodes" array that a line is expected to
 * have once it has been parsed as a fragment and passed through sanitise().
 * proc_data() wraps each entry in a '#document-fragment' node and compares
 * it to the parsed line with a deep equality check, so an entry only matches
 * when every property, attribute and child is identical.
 */
var SKIPS = [
/*
* NOTE(review): the doubled "http://http://" below looks deliberate (the
* next entry carries the single-scheme form of the same URL) --
* presumably some logs contain the malformed link; confirm before
* "fixing" it.
*/
[ { nodeName: 'link', tagName: 'link', attrs: [
{ name: 'rel', value: 'stylesheet' },
{ name: 'type', value: 'text/css' },
{ name: 'href', value: 'http://http://joyent.com/assets/css/style.css' },
{ name: 'media', value: 'all' } ],
namespaceURI: 'http://www.w3.org/1999/xhtml',
childNodes: [] } ],
[ { nodeName: 'link', tagName: 'link', attrs: [
{ name: 'rel', value: 'stylesheet' },
{ name: 'type', value: 'text/css' },
{ name: 'href', value: 'http://joyent.com/assets/css/style.css' },
{ name: 'media', value: 'all' } ],
namespaceURI: 'http://www.w3.org/1999/xhtml',
childNodes: [] } ],
[ { nodeName: 'meta', tagName: 'meta', attrs: [
{ name: 'http-equiv', value: 'Content-Type' },
{ name: 'content', value: 'text/html; charset=utf-8' } ],
namespaceURI: 'http://www.w3.org/1999/xhtml',
childNodes: [] } ],
[ { nodeName: 'script', tagName: 'script', attrs: [
{ name: 'type', value: 'text/javascript' } ],
namespaceURI: 'http://www.w3.org/1999/xhtml',
childNodes: [] } ],
/*
* The four text entries that follow are the individual lines of the
* show/hide helper script, each of which parses as a lone text node.
*/
[ { nodeName: '#text',
value: 'function sh(e) // Show/Hide an element' } ],
[ { nodeName: '#text',
value: '{if(document.getElementById(e).style.display==\'none\')' } ],
[ { nodeName: '#text',
value: '{document.getElementById(e).style.display=\'block\';}' } ],
[ { nodeName: '#text',
value: 'else {document.getElementById(e).style.display=\'none\';}}' } ],
[ { nodeName: 'div', tagName: 'div', attrs: [
{ name: 'class', value: 'roomtitle' } ],
namespaceURI: 'http://www.w3.org/1999/xhtml',
childNodes: [] } ],
[ { nodeName: 'span', tagName: 'span', attrs: [
{ name: 'class', value: 'w3c' } ],
namespaceURI: 'http://www.w3.org/1999/xhtml',
childNodes: [] } ],
[ { nodeName: 'style', tagName: 'style', attrs: [
{ name: 'type', value: 'text/css' } ],
namespaceURI: 'http://www.w3.org/1999/xhtml',
childNodes: [] } ],
/*
* A fragment with no child nodes at all.
*/
[],
];
/*
 * Recognise a document title line: a fragment consisting solely of an
 * attribute-free <title> element that wraps exactly one text node.
 *
 * Returns the title text, or null if the fragment has any other shape.
 */
function
extract_title(frag)
{
	if (frag.nodeName !== '#document-fragment') {
		return (null);
	}
	if (frag.childNodes.length !== 1) {
		return (null);
	}

	var title = frag.childNodes[0];
	if (title.nodeName !== 'title' || title.attrs.length !== 0) {
		return (null);
	}
	if (title.childNodes.length !== 1) {
		return (null);
	}

	var text = title.childNodes[0];

	return (text.nodeName === '#text' ? text.value : null);
}
/*
 * Decide whether a line consisting of a single <div> can be discarded.
 *
 * frag             parsed (and sanitised) fragment for the line
 * last_rc_heading  text of the most recent room configuration heading, or
 *                  null if there has not been one
 *
 * Returns true if the line should be skipped and false otherwise.
 */
function
skip_div(frag, last_rc_heading)
{
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 1) {
		return (false);
	}

	var div = frag.childNodes[0];
	if (div.nodeName !== 'div') {
		return (false);
	}

	var cls = find_first_attr(div, 'class');

	/*
	 * These classes are always uninteresting.  The "rc" <div>
	 * technically encloses the room configuration block that follows
	 * it, but because each line is parsed as its own fragment the line
	 * feed interrupts that structural relationship.
	 */
	var always_skipped = [ 'rc', 'roomtitle', 'legend' ];
	if (always_skipped.indexOf(cls) !== -1) {
		return (true);
	}

	/*
	 * The body of the "Room Configuration" block is a set of key-value
	 * pairs, of sorts, describing how the room is presently set up.
	 * We do not care about it.
	 */
	if (cls === 'rcos' && last_rc_heading === 'Room Configuration') {
		return (true);
	}

	/*
	 * Beyond this point only a class-less <div> with exactly one child
	 * is a candidate for skipping.
	 */
	if (cls || div.childNodes.length !== 1) {
		return (false);
	}

	var only = div.childNodes[0];
	if (only.nodeName !== 'a') {
		return (false);
	}

	/*
	 * A lone link back to the root of the site, or to one of the known
	 * log index pages, is safe to skip.
	 */
	var skippable_targets = [
		'/',
		'https://connector.joyent.com/logs/',
		'https://jabber.joyent.com/logs/'
	];

	return (skippable_targets.indexOf(
	    find_first_attr(only, 'href')) !== -1);
}
/*
 * Pull the list of room occupants out of the <div class="rcos"> that
 * follows a "Room Occupants" heading.
 *
 * frag             parsed (and sanitised) fragment for the line
 * last_rc_heading  text of the most recent room configuration heading
 *
 * Returns an object with one true-valued property per occupant nickname,
 * or null if this line is not an occupant list.
 */
function
extract_occupants(frag, last_rc_heading)
{
	if (last_rc_heading !== 'Room Occupants') {
		return (null);
	}

	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 1) {
		return (null);
	}

	var outer = frag.childNodes[0];
	if (outer.nodeName !== 'div' ||
	    find_first_attr(outer, 'class') !== 'rcos') {
		return (null);
	}

	/*
	 * The outer <div> must hold exactly a <br> followed by the
	 * <div class="rcot"> that lists the occupants.
	 */
	var kids = outer.childNodes;
	if (kids.length !== 2 || kids[0].nodeName !== 'br' ||
	    kids[1].nodeName !== 'div') {
		return (null);
	}

	var listing = kids[1];
	if (find_first_attr(listing, 'class') !== 'rcot') {
		return (null);
	}

	/*
	 * Every text node names one occupant, optionally prefixed with
	 * their role.  Anything else (e.g., the <br> separators) is
	 * ignored.
	 */
	var occupants = {};
	listing.childNodes.forEach(function (entry) {
		if (entry.nodeName !== '#text') {
			return;
		}

		var who = entry.value.replace(/^Moderator: /, '');
		who = who.replace(/^Participant: /, '');
		occupants[who] = true;
	});

	return (occupants);
}
/*
 * Recognise the timezone marker line: <br>, then an <a class="ts"> that
 * wraps a single text node, then another <br>.
 *
 * Returns the text inside the anchor, or null if the fragment has any
 * other shape.
 */
function
extract_tz(frag)
{
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 3) {
		return (null);
	}

	var kids = frag.childNodes;
	var shape_ok = [ 'br', 'a', 'br' ].every(function (want, idx) {
		return (kids[idx].nodeName === want);
	});
	if (!shape_ok) {
		return (null);
	}

	var anchor = kids[1];
	if (find_first_attr(anchor, 'class') !== 'ts' ||
	    anchor.childNodes.length !== 1) {
		return (null);
	}

	var label = anchor.childNodes[0];

	return (label.nodeName === '#text' ? label.value : null);
}
/*
 * Recognise a room configuration heading line: a lone <div class="rct">
 * that wraps a single text node.
 *
 * Returns the heading text, or null if the fragment has any other shape.
 */
function
extract_rc_heading(frag)
{
	if (frag.nodeName !== '#document-fragment') {
		return (null);
	}
	if (frag.childNodes.length !== 1) {
		return (null);
	}

	var heading = frag.childNodes[0];
	if (heading.nodeName !== 'div') {
		return (null);
	}
	if (find_first_attr(heading, 'class') !== 'rct') {
		return (null);
	}
	if (heading.childNodes.length !== 1) {
		return (null);
	}

	var label = heading.childNodes[0];

	return (label.nodeName === '#text' ? label.value : null);
}
/*
 * Recognise the line that names the room: a lone <a class="roomjid"> that
 * wraps a single text node.
 *
 * Returns the text inside the anchor, or null if the fragment has any
 * other shape.
 */
function
extract_room_jid(frag)
{
	var kids = frag.childNodes;
	if (frag.nodeName !== '#document-fragment' || kids.length !== 1) {
		return (null);
	}

	var anchor = kids[0];
	var is_jid_anchor = anchor.nodeName === 'a' &&
	    find_first_attr(anchor, 'class') === 'roomjid' &&
	    anchor.childNodes.length === 1 &&
	    anchor.childNodes[0].nodeName === '#text';

	if (!is_jid_anchor) {
		return (null);
	}

	return (anchor.childNodes[0].value);
}
/*
 * Recognise the line carrying the date of the log: a lone
 * <div class="logdate"> whose two children are a text node followed by a
 * <span>.
 *
 * Returns the text of the first child exactly as it appears in the log, or
 * null if the fragment has any other shape.
 */
function
extract_date(frag)
{
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length !== 1) {
		return (null);
	}

	var holder = frag.childNodes[0];
	if (holder.nodeName !== 'div') {
		return (null);
	}
	if (find_first_attr(holder, 'class') !== 'logdate') {
		return (null);
	}

	var parts = holder.childNodes;
	if (parts.length !== 2) {
		return (null);
	}
	if (parts[0].nodeName !== '#text' || parts[1].nodeName !== 'span') {
		return (null);
	}

	return (parts[0].value);
}
/*
 * Recognise the line carrying the room subject: a lone
 * <div class="roomsubject"> that wraps a single text node.
 *
 * Returns the subject text, or null if the fragment has any other shape.
 */
function
extract_room_subject(frag)
{
	var is_single = frag.nodeName === '#document-fragment' &&
	    frag.childNodes.length === 1;
	if (!is_single || frag.childNodes[0].nodeName !== 'div') {
		return (null);
	}

	var subject = frag.childNodes[0];
	if (find_first_attr(subject, 'class') !== 'roomsubject') {
		return (null);
	}

	var body = subject.childNodes;
	if (body.length !== 1 || body[0].nodeName !== '#text') {
		return (null);
	}

	return (body[0].value);
}
/*
 * Parse the part that every message-like log line has in common: the
 * leading <a class="ts"> timestamp anchor.
 *
 * frag  parsed (and sanitised) fragment for the line
 * dt    date of the log as a string; it is joined with the anchor's "name"
 *       attribute as dt + 'T' + name + 'Z' to form the full timestamp
 *
 * Returns:
 *
 *   null     if the line does not begin with a timestamp anchor carrying
 *            "class", "name" and "href" attributes
 *
 *   VError   if the anchor is present but the resulting timestamp is not a
 *            valid date (e.g., no log date has been seen yet); callers
 *            already treat an Error return as a parse failure
 *
 *   object   otherwise, with "time" (ISO 8601 string) and "pos" (index of
 *            the first child node after the timestamp and the optional
 *            single space that follows it)
 */
function
extract_message_common(frag, dt)
{
	/*
	 * These lines must start with an <a> tag containing a timestamp.
	 */
	if (frag.nodeName !== '#document-fragment' ||
	    frag.childNodes.length < 1 ||
	    frag.childNodes[0].nodeName !== 'a') {
		return (null);
	}

	var ts = frag.childNodes[0];
	var name = find_first_attr(ts, 'name');
	if (find_first_attr(ts, 'class') !== 'ts' ||
	    name === null ||
	    find_first_attr(ts, 'href') === null) {
		return (null);
	}

	/*
	 * Date.prototype.toISOString() throws a RangeError for an invalid
	 * date, so check first and report the problem as a regular parse
	 * error instead.
	 */
	var when = new Date(dt + 'T' + name + 'Z');
	if (Number.isNaN(when.getTime())) {
		return (new VE('invalid timestamp "%s" for log date "%s"',
		    name, String(dt)));
	}

	var out = {
		time: when.toISOString(),
		pos: 1
	};

	if (text_node_is_match(frag.childNodes[out.pos], ' ')) {
		/*
		 * Skip a single space right after the timestamp.
		 */
		out.pos++;
	}

	return (out);
}
/*
 * Gather the body of a message from the children of "frag" that follow
 * index out.pos, appending to the array already present in out.msg.
 *
 * frag  node whose childNodes hold the message body
 * dt    accepted for symmetry with the other extractors; not used here
 * out   partially built message; "pos" and "msg" (an array) must be set
 *
 * Links contribute their URL, <br> contributes a line feed, and text nodes
 * contribute their text.  On success out.msg is replaced with the joined
 * string (less one trailing line feed and one leading space), out.pos is
 * removed, and "out" is returned.  Any other kind of node, or a text node
 * that is exactly a line feed, results in a VError.
 */
function
extract_message_tail(frag, dt, out)
{
	/*
	 * Start with the node after the one "pos" currently refers to.
	 */
	for (out.pos++; out.pos < frag.childNodes.length; out.pos++) {
		var node = frag.childNodes[out.pos];
		var piece;

		if (node.nodeName === 'a') {
			piece = process_hyperlink(node);
			if (piece instanceof Error) {
				return (piece);
			}
		} else if (node.nodeName === 'br') {
			piece = '\n';
		} else if (node.nodeName === '#text') {
			if (node.childNodes) {
				return (new VE('text node with ' +
				    'children'));
			}
			if (node.value === '\n') {
				return (new VE('unexpected raw newline'));
			}
			piece = node.value;
		} else {
			return (new VE('unexpected node: %j', node));
		}

		out.msg.push(piece);
	}

	/*
	 * Drop the final line feed, and then the single space that
	 * separates the body from what came before it on the line.
	 */
	out.msg = out.msg.join('').replace(/\n$/, '').replace(/^ /, '');

	delete (out.pos);

	return (out);
}
/*
 * Parse a regular chat message.  The node at out.pos must be a
 * <font class="mn"> whose only child is a text node of the form "<nick>".
 *
 * frag  parsed (and sanitised) fragment for the line
 * dt    passed through to extract_message_tail()
 * out   partially built message, as produced by extract_message_common()
 *
 * On success, marks "out" as a 'message' from that nickname and returns
 * whatever extract_message_tail() makes of the rest of the line.
 * Otherwise returns a VError.
 */
function
extract_message(frag, dt, out)
{
	var nick_font = frag.childNodes[out.pos];

	var well_formed = nick_font.nodeName === 'font' &&
	    find_first_attr(nick_font, 'class') === 'mn' &&
	    nick_font.childNodes.length === 1 &&
	    nick_font.childNodes[0].nodeName === '#text';

	var nick = null;
	if (well_formed) {
		nick = nick_font.childNodes[0].value.match(/^<(.*)>$/);
	}

	if (nick === null) {
		return (new VE('malformed message: %j', frag));
	}

	out.type = 'message';
	out.name = nick[1];
	out.msg = [];

	return (extract_message_tail(frag, dt, out));
}
/*
 * Parse a "/me ..." (describe) line.  The node at out.pos must be a
 * <font class="mne"> whose first child is a text node, and it must be
 * followed in the line by a <br>.
 *
 * frag     parsed (and sanitised) fragment for the line
 * dt       passed through to extract_message_tail()
 * out      partially built message, as produced by extract_message_common()
 * members  object whose keys are the nicknames seen so far
 *
 * The log does not delimit the nickname from the text, so the speaker is
 * taken to be the first known nickname, in sorted order, for which the
 * text begins with that nickname followed by a space.  Note that this means
 * a nickname which is itself a prefix of a longer, space-containing
 * nickname will shadow the longer one.
 *
 * Returns the completed message from extract_message_tail(), or a VError
 * if the line is malformed or no known nickname matches.
 */
function
extract_describe(frag, dt, out, members)
{
	var mne = frag.childNodes[out.pos];

	/*
	 * The node may be missing altogether if the line ends early;
	 * report that as a malformed line rather than failing on the
	 * property access.
	 */
	if (!mne || mne.nodeName !== 'font' ||
	    find_first_attr(mne, 'class') !== 'mne' ||
	    mne.childNodes.length < 1 ||
	    mne.childNodes[0].nodeName !== '#text') {
		return (new VE('malformed message: %j', frag));
	}

	var text = mne.childNodes[0].value;
	out.type = 'describe';

	/*
	 * Decide whose message this is...
	 */
	var names = Object.keys(members).sort();
	for (var ni = 0; ni < names.length; ni++) {
		if (mod_jsprim.startsWith(text, names[ni] + ' ')) {
			out.name = names[ni];
			out.msg = [ text.slice(out.name.length + 1) ];
			break;
		}
	}

	if (!out.name) {
		return (new VE('unknown name on describe line: "%s"', text));
	}

	/*
	 * A line that stops right after the <font> has no following node
	 * to inspect; that is the same error as a following node that is
	 * not a <br>.
	 */
	var after = frag.childNodes[out.pos + 1];
	if (!after || after.nodeName !== 'br') {
		return (new VE('describe should end with a <br>: %j', frag));
	}

	/*
	 * XXX The rest of the text lives inside the <font> element, not
	 * beside it.  Hand the element itself to extract_message_tail()
	 * with "pos" reset, so that it resumes from the second child; the
	 * first was consumed above.
	 */
	out.pos = 0;

	return (extract_message_tail(mne, dt, out));
}
/*
 * Parse any timestamped log line: chat messages, "/me" lines, joins,
 * parts, nickname changes, subject changes and room notices.
 *
 * frag     parsed (and sanitised) fragment for the line
 * dt       date of the log, passed to extract_message_common()
 * members  object whose keys are the nicknames seen so far; used only to
 *          attribute "/me" lines
 *
 * Returns:
 *
 *   null     if the line does not start with a timestamp, or the <font>
 *            that follows it has a class that is not handled here
 *
 *   VError   if the line is recognised but malformed
 *
 *   object   { type: 'ignore' } for lines of no interest; otherwise a
 *            message with "time", "type" ('message', 'describe', 'join',
 *            'part', 'rename' or 'topic'), "name" and "msg".  For a
 *            'rename', "name" is the old nickname and "msg" the new one;
 *            for 'join' and 'part', "msg" is null.
 */
function
extract_metamsg(frag, dt, members)
{
	var out = extract_message_common(frag, dt);
	if (out === null || out instanceof Error) {
		return (out);
	}

	/*
	 * A line may consist of nothing but the timestamp, in which case
	 * there is no node here at all.  Report that the same way as a node
	 * of the wrong kind rather than failing on the property access.
	 */
	var fch = frag.childNodes[out.pos];
	if (!fch || fch.nodeName !== 'font') {
		return (new VE('expected <font> node, not "%s"',
		    fch ? fch.nodeName : 'end of line'));
	}

	var fcls = find_first_attr(fch, 'class');
	var pat;

	switch (fcls) {
	case 'msm':
		/*
		 * XXX Some kind of system message?
		 */
		return ({ type: 'ignore' });

	case 'ml':
		/*
		 * This font tag represents a user leaving a room.
		 *
		 * XXX Note that technically a user can provide a
		 * message when they leave, but we're just going to
		 * drop that on the floor.
		 */
		out.type = 'part';
		pat = /^(.*) leaves the room/;
		break;

	case 'mj':
		out.type = 'join';
		pat = /^(.*) joins the room/;
		break;

	case 'mnc':
		out.type = 'rename';
		pat = /^(.*) is now known as (.*)$/;
		break;

	case 'msc':
		out.type = 'topic';
		pat = /^(.*) has set the subject to: (.*)$/;
		break;

	case 'mn':
		return (extract_message(frag, dt, out));

	case 'mne':
		return (extract_describe(frag, dt, out, members));

	case 'mrcm':
		var valid_values = [
			'Chatroom configuration modified',
			'Chatroom is started',
			'Chatroom is created',
			'Chatroom is stopped',
			'Chatroom is destroyed'
		];

		if (fch.childNodes.length !== 1) {
			return (new VE('<font> with class "mrcm" had too ' +
			    'many children'));
		}
		if (fch.childNodes[0].nodeName !== '#text' ||
		    valid_values.indexOf(fch.childNodes[0].value) === -1) {
			return (new VE('<font> with class "mrcm" had ' +
			    'malformed child'));
		}

		/*
		 * We don't care about these messages at all.
		 */
		return ({ type: 'ignore' });

	default:
		return (null);
	}

	/*
	 * What remains are the line types described by a single piece of
	 * text that is matched against "pat".
	 */
	if (fch.childNodes.length !== 1 ||
	    fch.childNodes[0].nodeName !== '#text') {
		return (new VE('malformed "%s" <font> child',
		    out.type));
	}

	var m = fch.childNodes[0].value.match(pat);
	if (!m) {
		return (new VE('"%s" line did not match expected ' +
		    'pattern: %j', out.type, fch));
	}

	out.name = m[1];

	if (fcls === 'msc') {
		/*
		 * A subject may continue in the nodes after the <font>;
		 * extract_message_tail() collects them and removes "pos".
		 */
		out.msg = [ m[2] ];
		return (extract_message_tail(frag, dt, out));
	}

	/*
	 * Only the 'rename' pattern has a second capture group.
	 */
	out.msg = m[2] ? m[2] : null;
	delete (out.pos);

	return (out);
}
/*
 * proc_data(room, data, just_dump): parse one ejabberd HTML room log.
 *
 * room       room JID the log is expected to describe; a "roomjid" line
 *            naming any other room ends the parse with a VError
 * data       complete text of the log file, as a string
 * just_dump  not referenced anywhere in this function
 *
 * Returns an array of message objects, each tagged with "room", or a
 * VError.  Note that two failure paths do not return at all: a message
 * line that fails to parse, and a line that nothing recognises, both print
 * a diagnostic and call process.exit(1).
 *
 * Progress information (title, log date, members, etc.) is written to
 * stdout as it is encountered.
 */
function
proc_data(room, data, just_dump)
{
/*
* Though the documents we are processing are technically HTML,
* they are really constructed iteratively by the Jabber server.
* Most lines essentially stand alone as a document fragment.
* In order to handle times when the document is not a well-formed
* selection of fragments, we process line by line.
*/
var lines = data.split('\n');
var out = [];
var last_rc_heading = null;
var log_date;
/*
* Track room membership.
* XXX We probably need to do this in two passes; gather members
* first, and then later resolve "describe" lines.
*
* The set is seeded with a fixed list of nicknames and grows as
* occupant lists and messages are processed below.
*/
var members = {
james: true,
aconbere: true,
pedro: true,
'scott.mcwhirter': true,
ryan: true,
marsell: true,
bot: true,
jclulow: true,
alexwilson: true,
pmooney: true,
jperkin: true,
};
var in_comment = false;
for (var i = 0; i < lines.length; i++) {
var l = lines[i];
var v;
if (!l) {
/*
* Skip blank lines.
*/
continue;
}
/*
* Everything between a line that is exactly "<!--" and a line that
* is exactly "//-->" is discarded without being parsed.
*/
if (l === '<!--') {
in_comment = true;
continue;
}
if (in_comment) {
if (l === '//-->') {
in_comment = false;
}
continue;
}
/*
* Parse the line on its own.  sanitise() strips the parentNode links
* so that the result is a plain tree which can be compared against
* the SKIPS templates.
*/
var frag = sanitise(mod_parse5.parseFragment(l));
var skip = false;
for (var j = 0; j < SKIPS.length; j++) {
var t = { nodeName: '#document-fragment',
childNodes: SKIPS[j] };
if (mod_jsprim.deepEqual(frag, t)) {
skip = true;
break;
}
}
if (skip) {
continue;
}
if (skip_div(frag, last_rc_heading)) {
continue;
}
/*
* A single space followed by a link to one of a fixed set of URLs;
* these are reported as back matter and otherwise ignored.
*/
if (frag.childNodes.length === 2 &&
text_node_is_match(frag.childNodes[0], ' ') &&
frag.childNodes[1].nodeName === 'a') {
var href = find_first_attr(frag.childNodes[1],
'href');
var powers = [
'http://www.ejabberd.im',
'http://www.erlang.org/',
'http://validator.w3.org/check?uri=referer',
'http://jigsaw.w3.org/css-validator/'
];
if (powers.indexOf(href) !== -1) {
console.log('BACK MATTER URL: %s', href);
continue;
}
}
var title = extract_title(frag);
if (title !== null) {
console.log('TITLE: %s', title);
continue;
}
if ((v = extract_room_jid(frag)) !== null) {
if (room !== v) {
return (new VE('unexpected room "%s"',
v));
}
continue;
}
/*
* Reduce the date text from the log to its YYYY-MM-DD form.
* NOTE(review): toISOString() throws a RangeError if the Date
* constructor cannot parse the text; nothing here catches that.
*/
if ((v = extract_date(frag)) !== null) {
log_date = new Date(v).toISOString().
replace(/T.*/, '');
console.log('LOG DATE: %s', log_date);
continue;
}
var room_subject = extract_room_subject(frag);
if (room_subject !== null) {
/*
* XXX need to grab the username out. sigh.
*/
console.log('ROOM SUBJECT: %s', room_subject);
continue;
}
if ((v = extract_occupants(frag, last_rc_heading)) !== null) {
var v_members = Object.keys(v);
console.log('MEMBERS: %s', v_members.join(', '));
v_members.forEach(function (m) {
members[m] = true;
});
continue;
}
var tz = extract_tz(frag);
if (tz !== null) {
console.log('LOG TIMEZONE: %s', tz);
continue;
}
/*
* XXX NB: this must be after the functions that use
* "last_rc_heading".
*
* The assignment happens for every line that gets this far, so any
* line that is not itself a heading resets "last_rc_heading" to null.
*/
if ((last_rc_heading = extract_rc_heading(frag)) !== null) {
console.log('ROOM CONFIGURATION: %s',
last_rc_heading);
continue;
}
/*
* Anything left should be a timestamped message line.  "log_date" is
* still undefined here if no date line has been seen yet.
*/
var msg = extract_metamsg(frag, log_date, members);
if (msg instanceof Error) {
console.error('ERROR: %s', VE.fullStack(msg));
process.exit(1);
} else if (msg !== null) {
if (msg.type === 'ignore') {
continue;
}
msg.room = room;
/*
* Setting V=1 in the environment prints each message as it is parsed.
*/
if (process.env.V === '1') {
console.log('MESSAGE: %s', JSON.stringify(msg,
null, 4));
}
if (msg.name) {
members[msg.name] = true;
}
out.push(msg);
continue;
}
/*
* Nothing recognised this line.
*/
console.error('unknown fragment: %s', mod_util.inspect(
frag, false, 32, true));
process.exit(1);
}
if (in_comment) {
return (new VE('document terminated while inside a comment'));
}
return (out);
}
/*
 * proc_data_old(data, just_dump): earlier parser that works on the whole
 * document at once.  It is not exported from this module.
 *
 * data       complete HTML text of a log file
 * just_dump  if truthy, return the sanitised parse5 document instead of
 *            extracting messages
 *
 * Returns an array of parsed lines, the sanitised document (when
 * "just_dump" is set), or a VError.
 *
 * NOTE(review): commit() below calls "proc_html_line", which is not
 * defined in the visible part of this file.  OLD_proc_html_line() takes the
 * same three arguments and looks like its renamed implementation --
 * confirm before attempting to use this function.
 */
function
proc_data_old(data, just_dump)
{
var doc = mod_parse5.parse(data);
if (just_dump) {
return (sanitise(doc));
}
/*
* Track room membership.
* XXX We probably need to do this in two passes; gather members
* first, and then later resolve "describe" lines.
*/
var members = {
james: true,
aconbere: true,
pedro: true,
'scott.mcwhirter': true,
ryan: true,
marsell: true,
bot: true,
jclulow: true,
alexwilson: true,
pmooney: true,
jperkin: true,
};
var body = find_first_child(find_first_child(doc, 'html'), 'body');
var i = 0;
var c;
var dt;
/*
* Phase 1: scan forward for the <div class="logdate"> and reduce the
* text of its first child to a YYYY-MM-DD date.  "i" is shared by all
* of the loops below, so each phase resumes where the last one stopped.
*/
while (i < body.childNodes.length) {
c = body.childNodes[i++];
if (c.nodeName !== 'div' ||
find_first_attr(c, 'class') !== 'logdate' ||
c.childNodes.length < 2) {
continue;
}
// console.error('DATE: %s', c.childNodes[0].value);
dt = (new Date(c.childNodes[0].value)).
toISOString().replace(/T.*/, '');
// console.error('DATE: %s', dt);
break;
}
if (dt === undefined) {
return (new VE('could not find date header'));
}
/*
* Phase 2: look for the "Room Occupants" block and add its nicknames
* to "members".  If there is none, rewind to where this search began.
*/
var before_occup_check = i;
var occupants_found = false;
while (i < body.childNodes.length) {
c = body.childNodes[i++];
/*
* Is this a "room configuration" style <DIV>?
*/
if (c.nodeName !== 'div' ||
find_first_attr(c, 'class') !== 'rc' ||
c.childNodes.length !== 5) {
continue;
}
/*
* Make sure this is the "Room Occupants" block:
*/
var rct = c.childNodes[1];
if (rct.nodeName !== 'div' ||
find_first_attr(rct, 'class') !== 'rct' ||
rct.childNodes[0].nodeName !== '#text' ||
rct.childNodes[0].value !== 'Room Occupants') {
continue;
}
var rcos = c.childNodes[3];
if (rcos.nodeName !== 'div' ||
find_first_attr(rcos, 'class') !== 'rcos' ||
rcos.childNodes.length !== 2) {
continue;
}
var rcot = rcos.childNodes[1];
if (rcot.nodeName !== 'div' ||
find_first_attr(rcot, 'class') !== 'rcot') {
continue;
}
for (var q = 0; q < rcot.childNodes.length; q++) {
var cc = rcot.childNodes[q];
if (cc.nodeName === 'br') {
continue;
}
if (cc.nodeName === '#text') {
var name = cc.value.
replace(/^Moderator: /, '').
replace(/^Participant: /, '');
// console.error('\tINITIAL MEMBER %j', name);
members[name] = true;
}
}
occupants_found = true;
break;
}
if (!occupants_found) {
i = before_occup_check; /* XXX */
// return (new VE('could not find initial room occupants'));
}
/*
* Phase 3: find the <a class="ts"> anchor whose text is "GMT+0".
*/
var top_ts_found = false;
while (i < body.childNodes.length) {
c = body.childNodes[i++];
if (c.nodeName !== 'a') {
continue;
}
if (find_first_attr(c, 'class') !== 'ts') {
continue;
}
if (c.childNodes.length !== 1) {
continue;
}
var t = find_first_child(c, '#text');
if (!t || t.value !== 'GMT+0') {
continue;
}
//console.error('FOUND GMT+0 STAMP @ IDX %d', i - 1);
top_ts_found = true;
break;
}
if (!top_ts_found) {
return (new VE('could not find GMT+0 stamp'));
}
/*
* A line in the log begins with an <A> tag with the class "ts",
* as well as an "id" and "name" field which contain a time stamp.
* There is a #text node within the <A> node which contains the
* formatted timestamp.
*
* Now that we've found the point at which the GMT+0 timestamp
* header appears, we discard nodes until we locate one of these
* timestamp anchors.
*/
var accum = null;
var out = [];
var err_return = null;
var wait_for_ts = true;
/*
* Parse the nodes accumulated for the current line, if any, and append
* the result to "out".  Returns false, with "err_return" set, if the
* line could not be parsed.
*/
var commit = function () {
if (accum === null || accum.length === 0) {
/*
* Nothing to commit.
*/
return (true);
}
var res = proc_html_line(dt, members, accum);
if (res instanceof Error) {
err_return = VE({ cause: res,
info: { obj: sanitise(accum) }},
'proc_html_line');
return (false);
}
if (res.type !== 'ignore') {
// console.log('%j', res); /* XXX */
out.push(res);
}
accum = [];
return (true);
};
/*
* Phase 4: walk the remaining nodes, grouping them into lines that each
* begin with a timestamp anchor.
*/
while (i < body.childNodes.length) {
var ts;
c = body.childNodes[i++];
if (c.nodeName === 'div') {
var dcls = find_first_attr(c, 'class');
switch (dcls) {
case 'rc':
/*
* Sadly, it seems that "Room Configuration"
* blocks can appear basically anywhere.
* Skip them if they are amongst the regular
* log line population.
*/
if (!commit()) {
return (err_return);
}
wait_for_ts = true;
accum = null;
continue;
case 'legend':
/*
* At the end of the messages section in
* the log, there is a <div> with class
* "legend". We use this to detect the
* well-formed end of a log file.
*/
if (!commit()) {
return (err_return);
}
return (out);
default:
//unexpected(c);
return (new VE('unexpected <div> class "%s"',
dcls));
}
}
/*
* "name" here is the variable declared with "var" inside the occupant
* loop above; it is function-scoped and is simply reused.
*/
if (c.nodeName !== 'a' ||
find_first_attr(c, 'class') !== 'ts' ||
(name = find_first_attr(c, 'name')) === null ||
c.childNodes.length !== 1 ||
(ts = find_first_child(c, '#text')) === null) {
/*
* This is not the anchor node we are looking for.
*/
if (!wait_for_ts) {
accum.push(c);
}
continue;
}
if (wait_for_ts) {
/*
* We were waiting for a time stamp to resync
* with the log stream, and we've found one.
*/
accum = [ c ];
wait_for_ts = false;
} else {
/*
* We were accumulating an existing log message
* and have found another time stamp. Commit
* the existing log message, then start a new
* line.
*/
if (!commit()) {
return (err_return);
}
accum.push(c);
}
}
/*
* Special case for when the Jabber server apparently just stopped
* writing to the log file before the end...
*/
/*
var last = body.childNodes[body.childNodes.length - 1];
var lastbutone = body.childNodes[body.childNodes.length - 2];
if (text_node_is_match(last, '\n') &&
lastbutone.nodeName === 'br') {
console.error('TRUNCATED FILE?');
if (!commit()) {
return (err_return);
}
return (out);
}*/
/*
* Running out of nodes without having seen the "legend" <div> is an
* error; whatever was accumulated for the final line is attached to it.
*/
return (new VE({
info: { obj: sanitise(accum) },
}, 'did not find closing <div> tag before EOF'));
}
/*
 * Public interface of this module.  Only proc_data() is exported; every
 * other function defined in this file is internal.
 */
module.exports = {
proc_data: proc_data,
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment