Skip to content

Instantly share code, notes, and snippets.

@coline-carle
Created January 16, 2018 02:09
Show Gist options
  • Save coline-carle/8c8e8407acd6680438b2a67a804c5a2c to your computer and use it in GitHub Desktop.
Save coline-carle/8c8e8407acd6680438b2a67a804c5a2c to your computer and use it in GitHub Desktop.
'use strict';
const vm = require('vm');
const fs = require('fs');
const Promise = require('bluebird');
const cheerio = require('cheerio');
const he = require('he');
const Sequelize = require('sequelize');
const readFile = Promise.promisify(fs.readFile);
const writeFile = Promise.promisify(fs.writeFile);
// Sequelize instance built from the connection URI 'sqlite://npcs.db';
// `logging: false` turns off Sequelize's query logging.
const sequelize = new Sequelize('sqlite://npcs.db',
{
logging: false
}
);
// 'pages' model. Elsewhere in this file rows are filtered by `type: 'npc'`,
// `gameID` is used to build the mirrored HTML filename, and `loc` is printed
// as the page URL when a page has not been downloaded.
// `lastMod` / `localLastMod` are presumably remote vs. local modification
// times — TODO confirm against the code that fills this table.
// `timestamps: false` keeps Sequelize from adding createdAt/updatedAt.
const Page = sequelize.define('pages', {
id: {type: Sequelize.INTEGER, primaryKey: true},
loc: Sequelize.STRING,
gameID: Sequelize.INTEGER,
type: Sequelize.STRING,
lastMod: Sequelize.DATE,
localLastMod: Sequelize.DATE
}, {timestamps: false});
// 'npcs' model: one row per parsed NPC page. `data` stores the object that
// parseMissingData() passes to Npc.create() (the result of parsing a page).
const Npc = sequelize.define('npcs', {
id: {type: Sequelize.INTEGER, primaryKey: true},
// Required reference to the `id` of the Page row this NPC was parsed from;
// stored in the column named 'pageID'.
pageID: {
field: 'pageID',
type: Sequelize.INTEGER,
allowNull: false,
references: {
model: Page,
key: 'id'
}
},
data: Sequelize.JSON
}, {timestamps: false});
// Requests synchronisation of the 'npcs' table with the database.
// NOTE(review): the returned promise is neither awaited nor given a rejection
// handler, and this runs before the associations below are declared —
// confirm that this ordering is intended.
Npc.sync();
// Associations between the two models. missingNpcs() includes Npc from Page
// under the alias 'npc', which relies on the hasOne association.
// NOTE(review): no `foreignKey` option is passed, so Sequelize chooses its
// default key name instead of being told to use the `pageID` attribute
// declared above — verify the generated join column.
Npc.belongsTo(Page);
Page.hasOne(Npc);
/**
 * Finds the `var g_mapperData = ...};` assignment embedded in a page and
 * evaluates it to obtain the assigned value.
 *
 * @param {string} html - Raw page source.
 * @returns {*} The value assigned to `g_mapperData`, or `null` when the page
 *   contains no such assignment.
 */
function parseMapData(html) {
  // Lazy match: from the declaration up to the first `};` that follows it.
  const mapperDataPattern = /var\s+g_mapperData\s+=[\s\S]+?};/gm;
  const match = mapperDataPattern.exec(html);
  if (match !== null) {
    // The matched snippet is page content; evaluateJS executes it.
    const {g_mapperData: mapperData} = evaluateJS(match[0]);
    return mapperData;
  }
  return null;
}
/**
 * Collects every `<link rel="alternate">` element of a parsed page.
 *
 * @param {string} html - Raw page source (not used by this function).
 * @param {Function} xmlParser - Cheerio-style selector function for the page.
 * @returns {Array<{lang: string, link: string}>} One entry per alternate
 *   link, in document order: its `hreflang` attribute and its `href`
 *   attribute with HTML entities decoded.
 */
function parseAlternateLinks(html, xmlParser) {
  const collected = [];
  xmlParser('link[rel="alternate"]').each((index, element) => {
    const link = xmlParser(element);
    collected.push({
      lang: link.attr('hreflang'),
      // href values are entity-encoded in the markup (e.g. `&amp;`).
      link: he.decode(link.attr('href'))
    });
  });
  return collected;
}
/**
 * Reads the page's meta description.
 *
 * @param {string} html - Raw page source (not used by this function).
 * @param {Function} xmlParser - Cheerio-style selector function for the page.
 * @returns {string|undefined} The `content` attribute of the
 *   `meta[name=description]` element, as returned by the parser's `attr()`.
 */
function parseDescription(html, xmlParser) {
  const descriptionTag = xmlParser('meta[name=description]');
  return descriptionTag.attr('content');
}
/**
 * Extracts the first `displayId: <digits>` value found in a page.
 *
 * @param {string} html - Raw page source.
 * @returns {number|null} The display id (at most the first six digits are
 *   captured), or `null` when the page contains no match.
 */
function parseDisplayId(html) {
  // Exactly one whitespace character is expected after the colon.
  const displayIdPattern = /displayId:\s(\d{1,6})/g;
  const match = displayIdPattern.exec(html);
  if (match === null) {
    return null;
  }
  const [, digits] = match;
  return Number(digits);
}
/**
 * Runs a JavaScript snippet in a fresh `vm` context and returns the globals
 * it defined.
 *
 * NOTE(security): the snippets evaluated here come from page content, and
 * Node's `vm` module is not a security boundary. The timeout only bounds
 * synchronous run time; it does not make hostile code safe.
 *
 * @param {string} code - JavaScript source to execute.
 * @param {number} [timeoutMs=1000] - Maximum synchronous execution time in
 *   milliseconds before the run is aborted with an error.
 * @returns {Object} The sandbox object; top-level `var` declarations of the
 *   snippet appear as its properties.
 * @throws {Error} On a syntax error, a runtime error inside the snippet, or
 *   when the timeout is exceeded.
 */
function evaluateJS(code, timeoutMs = 1000) {
  const sandbox = {};
  const script = new vm.Script(code);
  // createContext is a factory function, not a constructor: it contextifies
  // `sandbox` and returns it, so it must be called without `new`.
  const context = vm.createContext(sandbox);
  script.runInContext(context, {timeout: timeoutMs});
  return sandbox;
}
/**
 * Extracts the object literal passed to `$.extend(g_npcs[<id>], {...})` in a
 * page and parses it as JSON.
 *
 * @param {string} html - Raw page source.
 * @returns {Object} The parsed NPC info object.
 * @throws {Error} When the page contains no `$.extend(g_npcs[...], {...})`
 *   call.
 * @throws {SyntaxError} When the extracted literal is not valid JSON after
 *   `undefined` has been replaced by `null`.
 */
function parseNpcInfo(html) {
  const REGEX = /\$\.extend\(g_npcs\[\d{1,6}\],\s+({.+})/g;
  const match = REGEX.exec(html);
  if (match === null) {
    // Fail with an explicit message instead of a TypeError on `null[1]`.
    throw new Error('NPC info not found: page has no $.extend(g_npcs[...], {...}) call');
  }
  // JSON has no `undefined`, so it is mapped to `null` before parsing.
  // NOTE(review): this replaces every occurrence of the text "undefined",
  // including inside string values.
  const npcInfoStr = match[1].replace(/undefined/g, 'null');
  return JSON.parse(npcInfoStr);
}
/**
 * Parses a page with cheerio in XML mode with whitespace normalisation.
 *
 * @param {string} page - Raw page source.
 * @returns {Function} Whatever `cheerio.load` returns for the page (used by
 *   the other parsers in this file as a selector function).
 */
function loadCheerio(page) {
  const parserOptions = {
    normalizeWhitespace: true,
    xmlMode: true
  };
  return cheerio.load(page, parserOptions);
}
/**
 * Builds the full NPC record for one page: the embedded NPC info object
 * extended with map coordinates, display id, alternate links and description.
 *
 * @param {string} page - Raw page source.
 * @returns {Object} The object returned by parseNpcInfo, with `coordinates`,
 *   `displayId`, `alternateLinks` and `description` set on it.
 */
function parseNpcPage(page) {
  const xmlParser = loadCheerio(page);
  const npc = parseNpcInfo(page);
  // The properties are computed and assigned in the order listed here.
  return Object.assign(npc, {
    coordinates: parseMapData(page),
    displayId: parseDisplayId(page),
    alternateLinks: parseAlternateLinks(page, xmlParser),
    description: parseDescription(page, xmlParser)
  });
}
/**
 * Reads the mirrored HTML file of an NPC and parses it.
 *
 * @param {number|string} id - Game id used to build the mirror filename.
 * @returns {Promise<Object>} Promise chained from `readFile`, resolving to
 *   the parsed NPC record; it rejects when reading or parsing fails.
 */
function parseNpc(id) {
  const htmlPath = pageFilename('npc', id);
  // Stay on the promise returned by readFile: parseMissingData() calls the
  // filtered `.catch({code: 'ENOENT'}, ...)` form on the result.
  const pendingHtml = readFile(htmlPath, 'utf8');
  return pendingHtml.then(html => parseNpcPage(html));
}
/**
 * Parses one mirrored NPC page and pretty-prints the result to the console.
 * Failures (for example a page that has not been downloaded) are reported
 * with `console.error` instead of surfacing as an unhandled rejection.
 *
 * @param {number|string} id - Game id of the NPC to display.
 * @returns {Promise<void>} Settles once the NPC has been printed or the
 *   error has been reported.
 */
function showNpc(id) {
  return parseNpc(id)
    .then(npc => console.dir(npc, {depth: 4, colors: true}))
    .catch(err => console.error(err));
}
/**
 * Builds the path of a mirrored page inside the local `./mirror` directory.
 *
 * @param {string} type - Page type, used as the sub-directory name.
 * @param {number|string} id - Game id, used as the file's base name.
 * @returns {string} Path of the form `./mirror/<type>/<id>.html`.
 */
function pageFilename(type, id) {
  return `./mirror/${type}/${id}.html`;
}
/**
 * Parses every NPC page that has no `npcs` row yet and stores the result.
 * Pages are handled one after another; a failure on one page is logged and
 * does not stop the remaining pages.
 *
 * @returns {undefined} Nothing; progress and errors are written to the
 *   console.
 */
function parseMissingData() {
  // Parse a single page and persist it, logging the outcome either way.
  const parseAndStore = page => parseNpc(page.gameID)
    .then(npcData => Npc.create({pageID: page.id, data: npcData}))
    .then(() => console.log('parsed and saved : gameID:', page.gameID))
    // Filtered catch: only errors whose `code` is 'ENOENT' (mirror file
    // missing) are handled here; everything else reaches the next handler.
    .catch({code: 'ENOENT'}, () => console.error('npc not downloaded:\n - id: ', page.gameID, '\n - url: ', page.loc))
    .catch(err => console.error(err));

  connectToDatabase()
    .then(() => missingNpcs())
    .then(pages => Promise.each(pages, parseAndStore))
    .catch(err => console.error(err));
}
/**
 * Checks the database connection through `sequelize.authenticate()`.
 *
 * @returns {Promise} Promise that fulfils even when authentication fails:
 *   the failure is logged with `console.error` and not rethrown.
 */
function connectToDatabase() {
  const reportFailure = err => console.error('Unable to connect to sqlite database:', err);
  return sequelize.authenticate().catch(reportFailure);
}
/**
 * Counts the `pages` rows whose type is 'npc'.
 *
 * @returns {Promise<number>} Result of `Page.count` for that filter.
 */
function pageCount() {
  const npcPages = {where: {type: 'npc'}};
  return Page.count(npcPages);
}
/**
 * Counts all rows of the `npcs` model.
 *
 * @returns {Promise<number>} Result of `Npc.count()` called without a filter.
 */
function npcCount() {
  const total = Npc.count();
  return total;
}
/**
 * Finds the 'npc' pages that have no associated `npcs` row yet.
 *
 * @returns {Promise<Array>} Result of `Page.findAll` for pages of type 'npc'
 *   whose joined NPC id is null.
 */
function missingNpcs() {
  // Optional join (`required: false`) so pages without an NPC row are kept.
  const npcJoin = {
    model: Npc,
    required: false,
    attributes: ['id'],
    as: 'npc'
  };
  // `$npc.id$` refers to the id column of the joined 'npc' alias.
  const withoutNpc = {
    type: 'npc',
    '$npc.id$': null
  };
  return Page.findAll({where: withoutNpc, include: [npcJoin]});
}
/**
 * Prints how many NPC pages are known and how many have been parsed so far.
 *
 * @returns {Promise} Promise chain that settles after both counts have been
 *   logged; rejections from the count queries are not handled here.
 */
function scrapeStatus() {
  const reportPages = pages => console.log('NPCs in wowhead database:', pages);
  const reportNpcs = npcs => console.log('NPCs parsed so far:', npcs);
  // The two counts are queried and printed strictly one after the other.
  return connectToDatabase()
    .then(() => pageCount())
    .then(reportPages)
    .then(() => npcCount())
    .then(reportNpcs);
}
/**
 * Writes the `data` of every NPC row that has non-null data to a file, as a
 * single JSON array.
 *
 * @param {string} filename - Path of the file to write.
 * @returns {undefined} Nothing; success and errors are reported with
 *   `console.log`.
 */
function dumpData(filename) {
  const rowsWithData = {
    where: {
      data: {$ne: null}
    }
  };
  connectToDatabase()
    .then(() => Npc.findAll(rowsWithData))
    .then(rows => {
      // Only the stored payloads are dumped, not the row metadata.
      const payload = JSON.stringify(rows.map(row => row.data));
      return writeFile(filename, payload);
    })
    .then(() => console.log('database dumped to ', filename))
    .catch(err => console.log('error: ', err));
}
// Command-line entry point: `node <script> [command] [npcId]`.
//   parse-all (default) parse and store every page without an NPC row
//   dump                write all parsed NPC data to npcs_dump.json
//   status              print page and NPC counts
//   drop                drop the npcs table
//   parse <npcId>       parse one mirrored page and print the result
const args = process.argv.slice(2);
// Running without arguments behaves like `parse-all`.
const command = args.length === 0 ? 'parse-all' : args[0];
// scrapeStatus() and Npc.drop() return promises with no rejection handler of
// their own, so failures are reported here instead of going unhandled.
const reportError = err => console.error(err);
switch (command) {
  case 'parse-all':
    console.log('# parsing missing data');
    parseMissingData();
    break;
  case 'dump':
    dumpData('npcs_dump.json');
    break;
  case 'status':
    scrapeStatus().catch(reportError);
    break;
  case 'drop':
    Npc.drop().catch(reportError);
    break;
  case 'parse':
    if (args[1] === undefined) {
      // Without an id the lookup would target './mirror/npc/undefined.html'.
      console.error('usage: parse <npcId>');
    } else {
      showNpc(args[1]);
    }
    break;
  default:
    console.error('unknown command:', command);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment