Created
January 16, 2018 02:09
-
-
Save coline-carle/8c8e8407acd6680438b2a67a804c5a2c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict'; | |
const vm = require('vm'); | |
const fs = require('fs'); | |
const Promise = require('bluebird'); | |
const cheerio = require('cheerio'); | |
const he = require('he'); | |
const Sequelize = require('sequelize'); | |
const readFile = Promise.promisify(fs.readFile); | |
const writeFile = Promise.promisify(fs.writeFile); | |
// Shared Sequelize connection used by every model in this script.
// It points at the SQLite URL below and turns Sequelize's logging off.
const sequelizeOptions = {logging: false};
const sequelize = new Sequelize('sqlite://npcs.db', sequelizeOptions);
/**
 * Model for the `pages` table (no automatic timestamp columns).
 *
 * Elsewhere in this file `type` is filtered on 'npc', `gameID` selects the
 * mirrored HTML file to parse, and `loc` is printed as the page URL.
 */
const pageColumns = {
  id: {type: Sequelize.INTEGER, primaryKey: true},
  loc: Sequelize.STRING,
  gameID: Sequelize.INTEGER,
  type: Sequelize.STRING,
  lastMod: Sequelize.DATE,
  localLastMod: Sequelize.DATE
};
const Page = sequelize.define('pages', pageColumns, {timestamps: false});
/**
 * Model for the `npcs` table (no automatic timestamp columns).
 *
 * Each row stores the parsed NPC object as JSON in `data` and points at the
 * page it was parsed from through the mandatory `pageID` column.
 */
const npcColumns = {
  id: {type: Sequelize.INTEGER, primaryKey: true},
  // Required reference to pages.id.
  pageID: {
    field: 'pageID',
    type: Sequelize.INTEGER,
    allowNull: false,
    references: {
      model: Page,
      key: 'id'
    }
  },
  data: Sequelize.JSON
};
const Npc = sequelize.define('npcs', npcColumns, {timestamps: false});
// Synchronise the npcs model with the database. sync() returns a promise, so
// a failure is reported here instead of surfacing as an unhandled rejection.
// NOTE(review): sync() is started before the associations below are declared;
// confirm that this ordering is intended.
Npc.sync()
  .catch(err => console.error('Unable to sync npcs table:', err));
// One-to-one link between a page and the NPC parsed from it.
Npc.belongsTo(Page);
Page.hasOne(Npc);
/**
 * Pulls the `var g_mapperData = ...};` statement out of a page and evaluates
 * it to obtain the assigned value.
 *
 * The matched text is executed as JavaScript through evaluateJS.
 *
 * @param {string} html - Raw page source.
 * @returns {*} Value assigned to `g_mapperData`, or null when the page has no
 *   such statement.
 */
function parseMapData(html) {
  const assignmentPattern = /var\s+g_mapperData\s+=[\s\S]+?};/gm;
  const match = assignmentPattern.exec(html);
  return match === null ? null : evaluateJS(match[0]).g_mapperData;
}
/**
 * Collects every `<link rel="alternate">` element of the page.
 *
 * @param {string} html - Raw page source (unused; kept for a uniform parser signature).
 * @param {Function} xmlParser - Cheerio-style selector function loaded with the page.
 * @returns {Array<{lang: string, link: string}>} One entry per link, with the
 *   `hreflang` value and the HTML-entity-decoded `href`.
 */
function parseAlternateLinks(html, xmlParser) {
  const links = [];
  xmlParser('link[rel="alternate"]').each((index, element) => {
    const node = xmlParser(element);
    const lang = node.attr('hreflang');
    const link = he.decode(node.attr('href'));
    links[index] = {lang, link};
  });
  return links;
}
/**
 * Reads the page's meta description.
 *
 * @param {string} html - Raw page source (unused; kept for a uniform parser signature).
 * @param {Function} xmlParser - Cheerio-style selector function loaded with the page.
 * @returns {string|undefined} The `content` attribute of `<meta name=description>`.
 */
function parseDescription(html, xmlParser) {
  const descriptionTag = xmlParser('meta[name=description]');
  return descriptionTag.attr('content');
}
/**
 * Finds the first `displayId: <digits>` occurrence in the page.
 *
 * Exactly one whitespace character is expected after the colon and at most
 * six digits are captured.
 *
 * @param {string} html - Raw page source.
 * @returns {number|null} The display id, or null when none is present.
 */
function parseDisplayId(html) {
  const match = /displayId:\s(\d{1,6})/g.exec(html);
  if (match === null) {
    return null;
  }
  return Number(match[1]);
}
/**
 * Runs a snippet of JavaScript in a fresh, empty vm context and returns that
 * context's global object, so top-level `var` declarations of the snippet
 * can be read back as properties.
 *
 * NOTE(review): the code comes from scraped pages and Node's `vm` module is
 * not a security boundary; only feed this content you trust.
 *
 * @param {string} code - JavaScript source to execute.
 * @returns {Object} The contextified sandbox holding the snippet's globals.
 * @throws {SyntaxError|Error} Whatever the snippet throws while compiling or running.
 */
function evaluateJS(code) {
  // vm.createContext is a plain factory function, not a constructor, so it is
  // called without `new`; it contextifies and returns the object it is given.
  const sandbox = vm.createContext({});
  new vm.Script(code).runInContext(sandbox);
  return sandbox;
}
/**
 * Extracts the object literal passed to `$.extend(g_npcs[<id>], {...})` in
 * the page and parses it as JSON.
 *
 * Every occurrence of the text `undefined` in the literal is replaced with
 * `null` before parsing, because `undefined` is not valid JSON.
 *
 * @param {string} html - Raw page source.
 * @returns {Object} The parsed NPC info.
 * @throws {Error} When the page contains no `$.extend(g_npcs[...], {...})` call.
 * @throws {SyntaxError} When the captured literal is not valid JSON.
 */
function parseNpcInfo(html) {
  const match = /\$\.extend\(g_npcs\[\d{1,6}\],\s+({.+})/g.exec(html);
  if (match === null) {
    // Fail with a clear message instead of a TypeError from indexing null.
    throw new Error('NPC info not found: page has no $.extend(g_npcs[...], {...}) call');
  }
  const npcInfoStr = match[1].replace(/undefined/g, 'null');
  return JSON.parse(npcInfoStr);
}
/**
 * Loads a page into cheerio in XML mode with whitespace normalisation.
 *
 * @param {string} page - Raw page source.
 * @returns {Function} The cheerio selector function for the loaded page.
 */
function loadCheerio(page) {
  const parserOptions = {normalizeWhitespace: true, xmlMode: true};
  return cheerio.load(page, parserOptions);
}
/**
 * Builds the full NPC record for one page: the base info object from the
 * page's `$.extend(g_npcs[...])` call, extended with map coordinates,
 * display id, alternate-language links and the meta description.
 *
 * @param {string} page - Raw page source.
 * @returns {Object} The NPC info with `coordinates`, `displayId`,
 *   `alternateLinks` and `description` added.
 */
function parseNpcPage(page) {
  const xmlParser = loadCheerio(page);
  const npc = parseNpcInfo(page);
  // Object-literal properties are evaluated top to bottom, so the parsers
  // run in the order they are listed.
  return Object.assign(npc, {
    coordinates: parseMapData(page),
    displayId: parseDisplayId(page),
    alternateLinks: parseAlternateLinks(page, xmlParser),
    description: parseDescription(page, xmlParser)
  });
}
/**
 * Reads the mirrored HTML file of an NPC and parses it.
 *
 * @param {number|string} id - NPC id used to build the mirror file name.
 * @returns {Promise<Object>} Resolves with the parsed NPC record; rejects when
 *   the file cannot be read or parsing throws.
 */
function parseNpc(id) {
  const htmlPath = pageFilename('npc', id);
  const pendingHtml = readFile(htmlPath, 'utf8');
  return pendingHtml.then(html => parseNpcPage(html));
}
/**
 * Parses one NPC from the local mirror and pretty-prints it to the console.
 *
 * Failures (missing mirror file, unparsable page) are reported on stderr
 * instead of being left as an unhandled promise rejection.
 *
 * @param {number|string} id - NPC id to parse.
 * @returns {Promise<void>} Settles once the NPC has been printed or the error logged.
 */
function showNpc(id) {
  return parseNpc(id)
    .then(npc => console.dir(npc, {depth: 4, colors: true}))
    .catch(err => console.error(err));
}
/**
 * Builds the path of a mirrored page inside the local `./mirror` directory.
 *
 * @param {string} type - Page type, used as the sub-directory name (e.g. 'npc').
 * @param {number|string} id - Page id, used as the file's base name.
 * @returns {string} Path of the form `./mirror/<type>/<id>.html`.
 */
function pageFilename(type, id) {
  return `./mirror/${type}/${id}.html`;
}
/**
 * Parses every NPC page that has no row in `npcs` yet and stores the result.
 *
 * Pages are handled one after another (Promise.each). A page whose mirror
 * file is missing (ENOENT) is reported and skipped; any other per-page error
 * is logged as well, so a single failure does not stop the run.
 */
function parseMissingData() {
  // Parse one page, save it, and log the outcome; this never rejects.
  const importPage = page => parseNpc(page.gameID)
    .then(npcData => Npc.create({pageID: page.id, data: npcData}))
    .then(() => console.log('parsed and saved : gameID:', page.gameID))
    // Filtered catch: only errors whose `code` property equals 'ENOENT'.
    .catch({code: 'ENOENT'}, () => console.error('npc not downloaded:\n - id: ', page.gameID, '\n - url: ', page.loc))
    .catch(err => console.error(err));

  connectToDatabase()
    .then(() => missingNpcs())
    .then(pages => Promise.each(pages, page => importPage(page)))
    .catch(err => console.error(err));
}
/**
 * Checks that the database connection works.
 *
 * A connection failure is logged and swallowed here, so the returned promise
 * still fulfils and callers continue with their next step.
 *
 * @returns {Promise} Settles once the authentication attempt has finished.
 */
function connectToDatabase() {
  const reportFailure = err => console.error('Unable to connect to sqlite database:', err);
  return sequelize.authenticate().catch(reportFailure);
}
/**
 * Counts the rows of `pages` whose type is 'npc'.
 *
 * @returns {Promise<number>} Resolves with the number of NPC pages.
 */
function pageCount() {
  const npcPagesOnly = {where: {type: 'npc'}};
  return Page.count(npcPagesOnly);
}
/**
 * Counts all rows of the `npcs` table.
 *
 * @returns {Promise<number>} Resolves with the number of stored NPCs.
 */
function npcCount() {
  const pendingTotal = Npc.count();
  return pendingTotal;
}
/**
 * Finds the NPC pages that have no associated row in `npcs`.
 *
 * The Npc model is joined optionally (`required: false`) under the alias
 * `npc`, and only pages whose joined `npc.id` is null are kept.
 *
 * @returns {Promise<Array>} Resolves with the matching Page instances.
 */
function missingNpcs() {
  const optionalNpcJoin = {
    model: Npc,
    required: false,
    attributes: ['id'],
    as: 'npc'
  };
  const withoutNpc = {
    type: 'npc',
    '$npc.id$': null
  };
  return Page.findAll({where: withoutNpc, include: [optionalNpcJoin]});
}
/**
 * Prints how many NPC pages are known and how many have been parsed so far.
 *
 * @returns {Promise} Settles after both counts have been printed; rejects
 *   when one of the count queries fails.
 */
function scrapeStatus() {
  const reportKnown = pages => console.log('NPCs in wowhead database:', pages);
  const reportParsed = npcs => console.log('NPCs parsed so far:', npcs);

  return connectToDatabase()
    .then(() => pageCount())
    .then(reportKnown)
    .then(() => npcCount())
    .then(reportParsed);
}
/**
 * Writes the `data` payload of every stored NPC to a file as one JSON array.
 *
 * Failures are reported on stderr, like the other error handlers in this
 * file, and the promise is returned so callers can wait for completion.
 *
 * NOTE(review): `$ne` is a string operator alias; newer Sequelize releases
 * prefer `Sequelize.Op.ne`. Confirm against the installed version.
 *
 * @param {string} filename - Destination path of the JSON dump.
 * @returns {Promise<void>} Settles once the dump has been written or the error logged.
 */
function dumpData(filename) {
  return connectToDatabase()
    .then(() => Npc.findAll({
      where: {
        data: {$ne: null}
      }
    }))
    .then(npcs => JSON.stringify(npcs.map(npc => npc.data)))
    .then(jsonString => writeFile(filename, jsonString))
    .then(() => console.log('database dumped to ', filename))
    .catch(err => console.error('error: ', err));
}
// Command-line entry point.
//   (no argument) | parse-all : parse every NPC page that is not stored yet
//   dump                      : write all parsed NPCs to npcs_dump.json
//   status                    : print page / parsed-NPC counts
//   drop                      : drop the npcs table
//   parse <id>                : parse one mirrored NPC page and print it
const args = process.argv.slice(2);
const command = args.length === 0 ? 'parse-all' : args[0];
switch (command) {
  case 'parse-all':
    console.log('# parsing misssing data');
    parseMissingData();
    break;
  case 'dump':
    dumpData('npcs_dump.json');
    break;
  case 'status':
    // scrapeStatus has no error handler of its own; report failures here.
    scrapeStatus()
      .catch(err => console.error(err));
    break;
  case 'drop':
    Npc.drop()
      .catch(err => console.error(err));
    break;
  case 'parse':
    if (args[1] === undefined) {
      // Without an id the parser would look for "undefined.html".
      console.error('usage: parse <npc id>');
      process.exitCode = 1;
    } else {
      showNpc(args[1]);
    }
    break;
  default:
    // Previously an unknown command did nothing at all.
    console.error('unknown command:', command);
    process.exitCode = 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment