Created
October 21, 2014 08:15
-
-
Save rhodey/a0527cba9a7b60e94fda to your computer and use it in GitHub Desktop.
A quick-and-dirty script that monitors the "ST LOUIS COPTALK" forum (http://members.boardhost.com/stlouiscoptalk/index.html) for deleted messages. Each run saves the list of message ids to a file named process.argv[2] + "." + Date.now(); run the script as a cron job and identify deleted messages by diffing the resulting files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Dependencies. `split` is a third-party package; this script pipes HTTP
// responses through it and consumes the result one line per 'data' event.
var fs = require('fs');
var split = require('split');
var http = require('http');

// URLs of the index pages that list messages (filled in by handleGetPages).
var pages = [];
// Message ids collected from the index pages.
var messageIds = [];
// Number of index pages whose scrape has finished so far.
var pageScrapeCount = 0;

// Output file name: "<first command-line argument>.<Date.now() timestamp>".
var msgIdFileName = process.argv[2] + '.' + Date.now();
var msgIdFile = fs.createWriteStream(msgIdFileName);
/**
 * Return the distinct values of an array, in sorted order.
 *
 * The input array is copied before sorting, so the caller's array is left
 * untouched. Ordering is the default Array.prototype.sort order, and
 * neighbours are compared with strict equality.
 *
 * @param {Array} a - Values to de-duplicate.
 * @returns {Array} A new sorted array with adjacent duplicates removed.
 */
function getUnique(a) {
  return a.slice().sort().filter(function (item, pos, sorted) {
    // The first element is always kept; later ones only if they differ
    // from their predecessor in the sorted copy.
    return pos === 0 || item !== sorted[pos - 1];
  });
}
/**
 * Scan one line of HTML for links of the form `<a href="index-N...` and
 * append the matching index-page URL to the global `pages` list.
 *
 * The whole run of digits after "index-" is used as the page number, so
 * pages 10 and above are handled. Links with no digit after "index-" are
 * ignored.
 *
 * @param {number} offset - Position in `line` at which to start searching.
 * @param {string} line - One line of the index page's HTML.
 */
function addPagesFromLine(offset, line) {
  // A fresh regex per call: a shared /g regex would carry lastIndex state
  // from one call into the next.
  var linkPattern = /<a href="index-(\d+)/g;
  var match;

  linkPattern.lastIndex = offset;
  while ((match = linkPattern.exec(line)) !== null) {
    pages.push('http://members.boardhost.com/stlouiscoptalk/index-' + match[1] + '.html');
  }
}
/**
 * Fetch the forum's main index page and collect the URLs of all index pages
 * into the global `pages` list, then invoke `cb`.
 *
 * The main index URL itself is always added. Further pages are discovered
 * line by line via addPagesFromLine, and the list is de-duplicated once the
 * response has been fully read.
 *
 * On a request or response error, a message is written to stderr and `cb`
 * is not called.
 *
 * @param {function(): void} cb - Called with no arguments once `pages` is complete.
 */
function handleGetPages(cb) {
  var indexUrl = 'http://members.boardhost.com/stlouiscoptalk/index.html';

  function reportError(e) {
    // Newline-terminated so the message does not run into later output.
    process.stderr.write('Got error: ' + e.message + '\n');
  }

  pages.push(indexUrl);

  http.get(indexUrl, function (res) {
    res.on('error', reportError);

    var lines = res.pipe(split());
    lines.on('data', function (line) {
      addPagesFromLine(0, line);
    });
    lines.on('end', function () {
      pages = getUnique(pages);
      cb();
    });
  }).on('error', reportError);
}
/**
 * Scan one line of HTML for links of the form `a href="msg/<id>.html` and
 * append every id found to the global `messageIds` list.
 *
 * An id is the text between "msg/" and the next ".html". Candidates that
 * are empty or contain a space are skipped. The scan is iterative, so a
 * line with a very large number of links cannot exhaust the call stack.
 *
 * @param {number} offset - Position in `line` at which to start searching.
 * @param {string} line - One line of an index page's HTML.
 */
function addMessageIdsFromLine(offset, line) {
  var linkPrefix = 'a href="msg/';
  var startIndex = line.indexOf(linkPrefix, offset);

  while (startIndex >= 0) {
    var idStart = startIndex + linkPrefix.length;
    var endIndex = line.indexOf('.html', idStart);
    if (endIndex < 0) {
      // No ".html" is left on the line, so neither this link nor any later
      // one can yield an id.
      return;
    }

    var msgId = line.substring(idStart, endIndex);
    if (msgId.length > 0 && msgId.indexOf(' ') < 0) {
      messageIds.push(msgId);
    }

    startIndex = line.indexOf(linkPrefix, startIndex + 1);
  }
}
/**
 * Fetch one index page and add every message id it links to into the global
 * `messageIds` list, then invoke `cb`.
 *
 * The response is consumed line by line via addMessageIdsFromLine, and the
 * list is de-duplicated once the page has been fully read.
 *
 * On a request or response error, a message is written to stderr and `cb`
 * is not called.
 *
 * @param {string} page - URL of the index page to fetch.
 * @param {function(): void} cb - Called with no arguments once the page has been processed.
 */
function handleGetMessageIdsFromPage(page, cb) {
  function reportError(e) {
    process.stderr.write('Got error: ' + e.message + '\n');
  }

  process.stdout.write('getting message ids from page ' + page + '\n');

  http.get(page, function (res) {
    res.on('error', reportError);

    var lines = res.pipe(split());
    lines.on('data', function (line) {
      addMessageIdsFromLine(0, line);
    });
    lines.on('end', function () {
      messageIds = getUnique(messageIds);
      cb();
    });
  }).on('error', reportError);
}
/**
 * Completion callback for a single index-page scrape.
 *
 * Counts finished pages in the global `pageScrapeCount`. Once the count
 * reaches `pages.length`, every collected message id is written to
 * `msgIdFile`, one per line, and the stream is ended so the output is
 * flushed and the file closed.
 */
function handlePageScraped() {
  pageScrapeCount += 1;
  if (pageScrapeCount < pages.length) {
    return; // Other pages are still being scraped.
  }

  // Indexed loop rather than for...in, which would also visit any
  // enumerable properties inherited from Array.prototype.
  for (var i = 0; i < messageIds.length; i++) {
    msgIdFile.write(messageIds[i] + '\n');
  }
  msgIdFile.end();
}
/**
 * Report how many index pages were found, then start a scrape of each one.
 *
 * Every scrape is given handlePageScraped as its completion callback.
 */
function handleScrapePages() {
  process.stdout.write('found ' + pages.length + ' pages containing message lists\n');

  // forEach rather than for...in, which would also visit any enumerable
  // properties inherited from Array.prototype.
  pages.forEach(function (pageUrl) {
    handleGetMessageIdsFromPage(pageUrl, handlePageScraped);
  });
}
// Entry point: collect the index pages first, then scrape each of them.
handleGetPages(handleScrapePages);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment