Last active
September 22, 2016 19:43
-
-
Save robinp7720/83d046434342b2590ee27e79d99f42c8 to your computer and use it in GitHub Desktop.
An indexer for the 1.4 scratch forums
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Load config for mysql and other things | |
var config = require('./config.json'); | |
// Load nodejs modules | |
var mysql = require('mysql'); | |
var cheerio = require('cheerio'); | |
var sleep = require('sleep'); | |
var fs = require("fs"); | |
var async = require("async"); | |
var colors = require('colors'); | |
var each = require('async-each'); | |
var entities = require("entities"); | |
var moment = require("moment"); | |
// Position in the archive's directory listing at which indexing starts.
// Raise it to resume an interrupted run without revisiting earlier threads.
var offset = 16000;

// Connection pool shared by every query issued from this script.
var connection = mysql.createPool({
    database: "Scratch_forums",
    host: "localhost",
    user: "root",
    password: "root",
    connectionLimit: 10
});

// Listener for the pool's 'enqueue' event. It deliberately does nothing;
// re-enable the log line below when debugging pool saturation.
connection.on('enqueue', function onEnqueue() {
    //console.log('Waiting for available connection slot'.rainbow);
});
/**
 * Insert `author` into the `user` table unless a row with that username
 * already exists.
 *
 * The existence check and the insert are two separate queries, so two
 * concurrent calls for the same new author can both pass the check.
 *
 * Failures of either query are logged and the function simply stops; nothing
 * is returned or thrown to the caller.
 *
 * @param {string} author - Username to record.
 */
function addUser(author) {
    connection.query('SELECT count(*) FROM `user` where username=? ', [author], function (error, results) {
        if (error) {
            console.log('Error: ' + error.message);
            // `results` is undefined when the lookup fails, so stop here
            // instead of dereferencing it below.
            return;
        }
        if (Number(results[0]['count(*)']) !== 0) {
            console.log(author.blue + " already in DB".red);
            return;
        }
        connection.query('INSERT INTO `user` (`username`) VALUES (?)', [author], function (err) {
            if (err) {
                // Do not report success for an insert that did not happen.
                console.log('Error: ' + err.message);
                return;
            }
            console.log(author.blue + " inserted into DB".yellow);
        });
    });
}
/**
 * Store one forum post in the `posts` table unless an identical entry exists.
 *
 * A post counts as already stored when a row matches on author, post time,
 * topic name and forum name; the post body is not part of that comparison.
 *
 * Failures of either query are logged and the function simply stops; nothing
 * is returned or thrown to the caller.
 *
 * @param {string} parentForum - Written to the `forum_name` column.
 * @param {string} childForum - Written to the `topic_name` column.
 * @param {string} author - Written to the `author` column.
 * @param {string} post - Post body, written to the `post` column.
 * @param {string} time - Written to the `postTime` column.
 */
function addPost(parentForum,childForum,author,post,time) {
    connection.query('SELECT count(*) FROM `posts` where `author` = ? and `postTime` = ? and `topic_name` = ? and `forum_name` = ? ', [author, time, childForum, parentForum], function (error, results) {
        if (error) {
            console.log('Error: ' + error.message);
            // `results` is undefined when the lookup fails, so stop here
            // instead of dereferencing it below.
            return;
        }
        if (Number(results[0]['count(*)']) !== 0) {
            console.log('post already in db'.yellow);
            return;
        }
        connection.query('INSERT INTO `posts` (`author`,`post`,`postTime`,`topic_name`, `forum_name`) VALUES (?,?,?,?,?)', [author, post, time, childForum, parentForum], function (err) {
            if (err) {
                // Do not report success for an insert that did not happen.
                console.log('Error: ' + err.message);
                return;
            }
            console.log(author.magenta + "'s post inserted into DB".magenta);
        });
    });
}
/**
 * Parse one saved forum page and hand every post on it to addPost().
 *
 * The page is read from '../ScratchForumArchive/<threadID>/<pageID>'. The
 * forum and topic names come from the '.linkst ul' list on the page, and each
 * '.blockpost' element contributes one post (time, author, body text).
 *
 * If the file cannot be read the error is logged and `cb` is called right
 * away so the caller can move on to the next page.
 *
 * @param {string} threadID - Directory name of the thread inside the archive.
 * @param {string} pageID - File name of the page inside that directory.
 * @param {Function} cb - Called with no arguments once the page is handled.
 */
function indexpage(threadID,pageID,cb) {
    console.log("Page ".blue+pageID.blue+ "being indexed in thread ".blue + threadID.blue);
    fs.readFile('../ScratchForumArchive/' + threadID + '/' + pageID, 'utf8', function (err, content) {
        if (err) {
            // Without this guard an unreadable page would feed undefined
            // into cheerio; skip the page instead.
            console.log('Error: ' + err.message);
            cb();
            return;
        }
        // Kept local: assigning to an undeclared `$` would create a global.
        var $ = cheerio.load(content, {decodeEntities: true});
        //console.log('../ScratchForumArchive/' + threadID + '/' + pageID, content);
        var parentThread = $('.linkst ul').children('li').next().children('a').text();
        var threadName = $('.linkst ul').children('li').next().next().text().replace(' » ','');
        console.log(threadName.red + " in ".red + parentThread.red + "being indexed".red);
        $('.blockpost').each(function (i, e) {
            var time = $(this).children('h2').children('span').children('a').text();
            var post = $(this).children('.box').children('.inbox').children('.postright').children('.postmsg').text();
            var author = $(this).children('.box').children('.inbox').children('.postleft').children('dl').children('dt').text();
            //addUser(author);
            //console.log(moment(time, "YYYY-MM-DD HH:mm:ss").format(),time);
            addPost(parentThread,threadName,author,post.trim(),moment(time, "YYYY-MM-DD HH:mm:ss").format());
        });
        // addPost() is fire-and-forget, so completion of its queries is not
        // awaited here; the callback is simply delayed by a fixed 200 ms.
        // NOTE(review): presumably this throttles the query backlog - confirm.
        setTimeout(function(){
            cb();
        },200);
    });
}
/**
 * Index every page file of one archived thread, one page at a time.
 *
 * Lists '../ScratchForumArchive/<threadID>' and passes each entry to
 * indexpage() with a concurrency limit of 1. If the directory cannot be
 * listed the error is logged and `cb` is called so the caller can continue
 * with the next thread.
 *
 * @param {string} threadID - Directory name of the thread inside the archive.
 * @param {Function} cb - Called with no arguments when the thread is done.
 */
function indexthread(threadID,cb) {
    fs.readdir('../ScratchForumArchive/' + threadID, function (err, pageID) {
        if (err) {
            // Report the failure instead of silently iterating over nothing.
            console.log('Error: ' + err.message);
            cb();
            return;
        }
        // `next` (rather than a second `cb`) avoids shadowing the
        // thread-level callback inside the per-page iterator.
        async.eachLimit(pageID, 1, function (id, next) {
            indexpage(threadID, id, next);
        }, function () {
            cb();
        });
    });
}
// Entry point: list the archive, skip the first `offset` entries and index
// the remaining threads one at a time.
// NOTE(review): no completion callback is passed and the MySQL pool is never
// closed here, so the process presumably has to be stopped by hand - confirm.
fs.readdir('../ScratchForumArchive', function(err,threadID){
    if (err) {
        // Without the listing there is nothing to index; report and stop
        // instead of failing on `threadID.slice` of undefined.
        console.log('Error: ' + err.message);
        return;
    }
    var remaining = threadID.slice(offset);
    // `remaining` is the tail of the listing, so its first entry sits at this
    // index of the full listing. With a concurrency limit of 1 the entries
    // are visited in order, which lets a running counter replace a linear
    // indexOf() scan per thread.
    var nextIndex = threadID.length - remaining.length;
    async.eachLimit(remaining, 1,
        function(id,cb) {
            console.log("Starting work on item ".black.bgRed + nextIndex.toString().black.bgRed);
            nextIndex += 1;
            indexthread(id,cb);
        }
    );
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment