Skip to content

Instantly share code, notes, and snippets.

@robinp7720
Last active September 22, 2016 19:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save robinp7720/83d046434342b2590ee27e79d99f42c8 to your computer and use it in GitHub Desktop.
Save robinp7720/83d046434342b2590ee27e79d99f42c8 to your computer and use it in GitHub Desktop.
An indexer for the 1.4 Scratch forums
// Load config for mysql and other things
var config = require('./config.json');
// Load Node.js modules
var mysql = require('mysql');
var cheerio = require('cheerio');
var sleep = require('sleep');
var fs = require("fs");
var async = require("async");
var colors = require('colors');
var each = require('async-each');
var entities = require("entities");
var moment = require("moment");
// Number of leading entries of the archive directory listing to skip, so an
// interrupted run can be resumed part-way through.
var offset = 16000;

// Settings for the MySQL connection pool used by every query in this script.
var poolSettings = {
    database: "Scratch_forums",
    password: "root",
    user: "root",
    host: "localhost",
    connectionLimit: 10
};
var connection = mysql.createPool(poolSettings);

// Handler for the pool's 'enqueue' event. It deliberately does nothing; the
// diagnostic log line is kept here, disabled.
function onEnqueue() {
    //console.log('Waiting for available connection slot'.rainbow);
}
connection.on('enqueue', onEnqueue);
/**
 * Insert `author` into the `user` table unless a row with that username
 * already exists.
 *
 * The outcome is only reported through console.log; nothing is returned and
 * no callback is invoked. Query failures are logged and abort the operation.
 *
 * NOTE(review): the existence check and the insert are two separate queries,
 * so two overlapping calls for the same name could both insert. Confirm
 * whether the table has a unique constraint on `username`.
 *
 * @param {string} author - Username to register.
 */
function addUser(author) {
    connection.query('SELECT count(*) FROM `user` where username=? ', [author], function (error, results) {
        if (error) {
            // No result set is available after a failed query, so stop here
            // instead of dereferencing `results`.
            console.log('Error: ' + error.message);
            return;
        }
        if (Number(results[0]['count(*)']) !== 0) {
            console.log(author.blue + " already in DB".red);
            return;
        }
        connection.query('INSERT INTO `user` (`username`) VALUES (?)', [author], function (err) {
            if (err) {
                // Do not claim success when the insert was rejected.
                console.log('Error: ' + err.message);
                return;
            }
            console.log(author.blue + " inserted into DB".yellow);
        });
    });
}
/**
 * Store one forum post in the `posts` table unless a row with the same
 * author, post time, topic name and forum name is already present.
 *
 * The outcome is only reported through console.log; nothing is returned and
 * no callback is invoked. Query failures are logged and abort the operation.
 *
 * @param {string} parentForum - Forum name, stored in `forum_name`.
 * @param {string} childForum - Topic name, stored in `topic_name`.
 * @param {string} author - Username of the poster.
 * @param {string} post - Text of the post.
 * @param {string} time - Post timestamp, stored in `postTime`.
 */
function addPost(parentForum,childForum,author,post,time) {
    connection.query('SELECT count(*) FROM `posts` where `author` = ? and `postTime` = ? and `topic_name` = ? and `forum_name` = ? ', [author, time, childForum, parentForum], function (error, results) {
        if (error) {
            // No result set is available after a failed query, so stop here
            // instead of dereferencing `results`.
            console.log('Error: ' + error.message);
            return;
        }
        if (Number(results[0]['count(*)']) !== 0) {
            console.log('post already in db'.yellow);
            return;
        }
        //console.log("inserting into db".yellow);
        connection.query('INSERT INTO `posts` (`author`,`post`,`postTime`,`topic_name`, `forum_name`) VALUES (?,?,?,?,?)', [author, post, time, childForum, parentForum], function (err) {
            if (err) {
                // Do not claim success when the insert was rejected.
                console.log('Error: ' + err.message);
                return;
            }
            console.log(author.magenta + "'s post inserted into DB".magenta);
        });
    });
}
/**
 * Parse one archived page of a thread and queue every post on it for
 * insertion through addPost.
 *
 * The page is read from '../ScratchForumArchive/<threadID>/<pageID>'. If the
 * file cannot be read, the error is logged and `cb` is called right away so
 * the caller can move on to the next page.
 *
 * @param {string} threadID - Directory name of the thread inside the archive.
 * @param {string} pageID - File name of the page inside the thread directory.
 * @param {function} cb - Called with no arguments once the page is handled.
 */
function indexpage(threadID,pageID,cb) {
    console.log("Page ".blue+pageID.blue+ " being indexed in thread ".blue + threadID.blue);
    fs.readFile('../ScratchForumArchive/' + threadID + '/' + pageID, 'utf8', function (err, content) {
        if (err) {
            // Skip unreadable entries instead of handing undefined content to the parser.
            console.log('Error: ' + err.message);
            cb();
            return;
        }
        // Declared locally; it was previously assigned without a declaration,
        // which leaked `$` as an implicit global.
        var $ = cheerio.load(content, {decodeEntities: true});
        //console.log('../ScratchForumArchive/' + threadID + '/' + pageID, content);
        // Forum and topic names are taken from the list items after the first
        // one in the '.linkst ul' list.
        var parentThread = $('.linkst ul').children('li').next().children('a').text();
        var threadName = $('.linkst ul').children('li').next().next().text().replace(' » ','');
        console.log(threadName.red + " in ".red + parentThread.red + " being indexed".red);
        $('.blockpost').each(function (i, element) {
            var block = $(element);
            var inbox = block.children('.box').children('.inbox');
            var time = block.children('h2').children('span').children('a').text();
            var post = inbox.children('.postright').children('.postmsg').text();
            var author = inbox.children('.postleft').children('dl').children('dt').text();
            //addUser(author);
            //console.log(moment(time, "YYYY-MM-DD HH:mm:ss").format(),time);
            addPost(parentThread,threadName,author,post.trim(),moment(time, "YYYY-MM-DD HH:mm:ss").format());
        });
        // Fixed 200 ms pause before reporting completion. addPost takes no
        // callback, so this is not tied to the inserts actually finishing.
        setTimeout(function(){
            cb();
        },200);
    });
}
/**
 * Index every page of one archived thread, one page at a time.
 *
 * The directory '../ScratchForumArchive/<threadID>' is listed and each entry
 * is passed to indexpage in turn. If the directory cannot be listed, the
 * error is logged and `cb` is called so the caller can continue with the
 * next thread.
 *
 * @param {string} threadID - Directory name of the thread inside the archive.
 * @param {function} cb - Called with no arguments when the thread is done.
 */
function indexthread(threadID,cb) {
    fs.readdir('../ScratchForumArchive/' + threadID, function (err, pageID) {
        if (err) {
            // Report the failure instead of silently iterating an undefined listing.
            console.log('Error: ' + err.message);
            cb();
            return;
        }
        // Limit of 1: pages are processed strictly one after another.
        async.eachLimit(pageID, 1, function (id, pageDone) {
            indexpage(threadID, id, pageDone);
        }, function () {
            cb();
        });
    });
}
// Entry point: list the archive directory and index its threads one at a
// time, skipping the first `offset` entries of the listing.
fs.readdir('../ScratchForumArchive', function(err,threadID){
    if (err) {
        // Without a listing there is nothing to index.
        console.log('Error: ' + err.message);
        return;
    }
    // Position of the current item in the full listing. Items are handled
    // sequentially (limit 1), so a running counter gives the same number as
    // searching the listing for every item, without the repeated scan.
    var position = offset;
    async.eachLimit(threadID.slice(offset), 1,
        function(id,cb) {
            console.log("Starting work on item ".black.bgRed + position.toString().black.bgRed);
            position += 1;
            indexthread(id,cb);
        }
    );
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment