Skip to content

Instantly share code, notes, and snippets.

@oshoham
Last active Aug 5, 2020
Embed
What would you like to do?
Parse the Book of Blaseball from the site's minified JavaScript
{
"name": "blaseball-book-scraper",
"version": "1.0.0",
"description": "",
"main": "parse_blaseball_book.js",
"author": "",
"license": "ISC",
"dependencies": {
"acorn": "^7.4.0",
"acorn-walk": "^7.2.0",
"bent": "^7.3.9",
"cheerio": "^1.0.0-rc.3"
}
}
const url = require('url');
const bent = require('bent');
const cheerio = require('cheerio');
const acorn = require('acorn');
const walk = require('acorn-walk');
const getString = bent('string');
/**
 * Downloads the Blaseball front page, locates its single main JavaScript
 * bundle, parses that bundle with acorn and reconstructs the text of
 * "The Book of Blaseball" from the `createElement` calls of the function
 * that contains the book's title literal.
 *
 * @returns {Promise<string>} The extracted text of the Book.
 * @throws {Error} If no main bundle script tag is found, if more than one is
 *   found, or if no enclosing FunctionDeclaration for the title literal
 *   exists in the parsed bundle. Failures from the HTTP requests or from
 *   `acorn.parse` are not caught here and propagate to the caller.
 */
async function parseBookFromJavaScript() {
  const siteUrl = 'https://blaseball.com';

  const html = await getString(siteUrl);
  const $ = cheerio.load(html);

  const scriptTags = $('script[src^="/static/js/main."]');
  if (scriptTags.length === 0) {
    throw new Error('Could not find the main JS file.');
  }
  if (scriptTags.length > 1) {
    throw new Error('More than one main JS files found.');
  }

  // The src attribute is site-relative, so resolve it against the site origin.
  const jsUrl = new URL(scriptTags.attr('src'), siteUrl).href;
  const js = await getString(jsUrl);
  const ast = acorn.parse(js);

  const bookFunctionNode = findBookFunctionNode(ast);
  if (bookFunctionNode === null) {
    throw new Error('Could not find the FunctionDeclaration node for rendering the Book in the AST.');
  }

  return collectBookText(bookFunctionNode);
}

/**
 * Finds the nearest FunctionDeclaration enclosing the first
 * 'The Book of Blaseball' string literal that has one.
 *
 * @param {object} ast - Root node returned by `acorn.parse`.
 * @returns {object|null} The FunctionDeclaration node, or null if none found.
 */
function findBookFunctionNode(ast) {
  let bookFunctionNode = null;

  walk.ancestor(ast, {
    Literal(node, ancestors) {
      if (bookFunctionNode !== null || node.value !== 'The Book of Blaseball') {
        return;
      }
      // The last ancestor is the literal itself, so scan outward starting
      // from the 2nd-to-last entry.
      for (let i = ancestors.length - 2; i >= 0; i--) {
        if (ancestors[i].type === 'FunctionDeclaration') {
          bookFunctionNode = ancestors[i];
          break;
        }
      }
    },
  });

  return bookFunctionNode;
}

/**
 * Tells whether a CallExpression node is a `<something>.createElement(...)` call.
 *
 * @param {object} node - A CallExpression AST node.
 * @returns {boolean} True when the callee is a member access named `createElement`.
 */
function isCreateElementCall(node) {
  return (
    node.callee.type === 'MemberExpression' &&
    node.callee.property.type === 'Identifier' &&
    node.callee.property.name === 'createElement'
  );
}

/**
 * Computes the text contributed by one non-tag argument of a createElement call.
 *
 * - A non-null literal contributes its value.
 * - An object with a single `str` property contributes that property's
 *   literal value.
 * - An object with a single `className` property contributes a newline for
 *   'TheBook-Bullet' or a space for 'TheBook-SubBullet'.
 *
 * Anything else (including spread properties and non-literal property
 * values) contributes nothing instead of throwing or emitting "undefined".
 *
 * @param {object} argument - An AST node from `CallExpression.arguments`.
 * @returns {string} The text to append (possibly empty).
 */
function bookTextForArgument(argument) {
  if (argument.type === 'Literal') {
    return argument.value === null ? '' : String(argument.value);
  }

  if (argument.type !== 'ObjectExpression' || argument.properties.length !== 1) {
    return '';
  }

  const property = argument.properties[0];
  // A SpreadElement (`{...x}`) has neither `key` nor `value`.
  if (property.type !== 'Property' || property.value.type !== 'Literal') {
    return '';
  }

  const literalValue = property.value.value;

  if (property.key.name === 'str') {
    return literalValue === null ? '' : String(literalValue);
  }

  if (property.key.name === 'className' && typeof literalValue === 'string') {
    const classNames = literalValue.split(' ');
    if (classNames.includes('TheBook-Bullet')) {
      return '\n';
    }
    if (classNames.includes('TheBook-SubBullet')) {
      return ' ';
    }
  }

  return '';
}

/**
 * Walks the function that renders the Book and concatenates the text found
 * in its createElement calls, in traversal order.
 *
 * @param {object} bookFunctionNode - The FunctionDeclaration node to walk.
 * @returns {string} The accumulated text.
 */
function collectBookText(bookFunctionNode) {
  let text = '';

  walk.recursive(bookFunctionNode, null, {
    CallExpression(node, state, descend) {
      // Calls other than createElement are not descended into.
      if (!isCreateElementCall(node)) {
        return;
      }

      descend(node.callee, state, 'Expression');

      node.arguments.forEach((argument, index) => {
        if (index === 0 && argument.type === 'Literal') {
          // First argument is the HTML tag: every div after the first piece
          // of text starts a new line, and the tag itself is never emitted.
          if (argument.value === 'div' && text !== '') {
            text += '\n';
          }
          return;
        }

        text += bookTextForArgument(argument);
        // Nested createElement calls (children) are handled by recursion.
        descend(argument, state, 'Expression');
      });
    },
  });

  return text;
}
// Script entry point: print the scraped Book to stdout, or log whatever
// error occurred to stderr.
(async function main() {
  try {
    console.log(await parseBookFromJavaScript());
  } catch (err) {
    console.error(err);
  }
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment