Skip to content

Instantly share code, notes, and snippets.

@Higgs1
Last active August 29, 2015 14:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Higgs1/e020321828ed3dc2d0a4 to your computer and use it in GitHub Desktop.
Save Higgs1/e020321828ed3dc2d0a4 to your computer and use it in GitHub Desktop.
Extended JSON Parsing / Scraping in Python3
"""
Rationale: Some websites make themselves difficult to scrape by obfuscating what would otherwise be JSON data with
things such as unquoted object keys, concatenated strings, comments, etc. which are all valid JavaScript
constructs but invalid JSON. This usually forces would-be web scraper developers to emulate an entire
browser just to mine the data, which apparently is a fairly successful deterrent. This script parses the
JavaScript AST without executing any potentially malicious JavaScript DRM code, and correctly parses a
number of valid JavaScript constructs into a Python dictionary.
It also provides methods to quickly download a web page and search for a global variable using pyquery.
"""
import operator, json, ast, os
# From PyPI 'pyquery'
from pyquery import PyQuery as pq
# From PyPI 'slimit'
from slimit.parser import Parser
from slimit.ast import (
Array, Boolean, BinOp, Identifier,
Null, Object, UnaryOp, VarStatement)
def _js_mod(a, b):
    """Return ``a % b`` using JavaScript's remainder semantics.

    JavaScript's ``%`` yields a result with the sign of the dividend
    (``-5 % 3 == -2``), whereas Python's ``%`` follows the divisor
    (``-5 % 3 == 1``). Raises ZeroDivisionError when ``b`` is zero, just as
    Python's own ``%`` does.
    """
    remainder = abs(a) % abs(b)
    return -remainder if a < 0 else remainder


# Maps a JavaScript arithmetic operator token to the Python callable that
# evaluates it on already-converted operands.
jsops = {
    '+': operator.add,
    '-': operator.sub,
    '*': operator.mul,
    '/': operator.truediv,  # true division, never floor division
    '%': _js_mod,  # sign of the dividend, unlike Python's %
}
def jsast2py(node):
    """Convert a JSON-like slimit JavaScript AST node into plain Python data.

    The conversion is purely structural: no JavaScript is executed.

    Args:
        node: A slimit AST node.

    Returns:
        * ``Object`` / ``VarStatement`` -> ``dict`` (keys and values converted
          recursively),
        * ``Array`` -> ``list``,
        * ``BinOp`` / ``UnaryOp`` -> the result of applying the operator from
          ``jsops`` to the converted operand(s),
        * ``Boolean`` -> ``bool``,
        * ``Identifier`` -> its name (this is what lets unquoted object keys
          work),
        * ``Null`` -> ``None``,
        * anything else -> ``ast.literal_eval`` of the node's ``value``.

    Raises:
        KeyError: If a binary or unary operator is not present in ``jsops``.
        ValueError, SyntaxError: If a leaf's ``value`` is not a valid Python
            literal (raised by ``ast.literal_eval``).
    """
    if isinstance(node, (Object, VarStatement)):
        # Each child unpacks into a (key node, value node) pair.
        return {jsast2py(k): jsast2py(v) for k, v in node}
    if isinstance(node, Array):
        return [jsast2py(e) for e in node]
    if isinstance(node, BinOp):
        return jsops[node.op](*[jsast2py(c) for c in node])
    if isinstance(node, UnaryOp):
        # Unary +/- is evaluated as "0 <op> operand". The operand is itself an
        # AST node and must be converted before the operator is applied.
        return jsops[node.op](0, jsast2py(node.value))
    if isinstance(node, Boolean):
        # NOTE(review): slimit appears to keep the literal as the source text
        # 'true' / 'false'; accept a real bool as well to be safe.
        return node.value in ('true', True)
    if isinstance(node, Identifier):
        return node.value
    if isinstance(node, Null):
        return None
    # Remaining leaves are handed to literal_eval, which parses literals
    # without executing arbitrary code.
    return ast.literal_eval(node.value)
# Module-level slimit parser instance; scrape_var_js() calls its parse() method.
jsparser = Parser()
def scrape_var_js(script, var):
    """Find a ``var`` declaration in JavaScript source and convert its value.

    The script is parsed with slimit and never executed. Only ``VarStatement``
    nodes yielded directly by iterating the parsed program are examined.

    Args:
        script: JavaScript (or JSON-like) source text.
        var: Name of the variable to look for.

    Returns:
        The initializer of the first matching declaration, converted with
        ``jsast2py``; ``None`` if no matching declaration with an initializer
        is found.
    """
    for statement in jsparser.parse(script):
        if not isinstance(statement, VarStatement):
            continue
        for decl in statement:
            parts = list(decl)
            # A declaration without an initializer (``var x;``) has no value
            # to convert. NOTE(review): slimit appears to omit the missing
            # initializer when iterating a declaration - confirm.
            if len(parts) != 2 or parts[1] is None:
                continue
            ident, obj = parts
            if ident.value == var:
                return jsast2py(obj)
    return None
def scrape_var_html(*args, var=None, **kwargs):
    """Find a JavaScript variable in an HTML document without executing any script.

    Accepts any form of input that pyquery takes (pq, lxml, string doc,
    url + requests...); positional and keyword arguments are forwarded to
    ``pq`` unchanged.

    Args:
        *args: Positional arguments forwarded to ``pq``.
        var: Name of the variable to look for (keyword-only).
        **kwargs: Keyword arguments forwarded to ``pq``.

    Returns:
        The converted value from the first ``body script`` element in which
        ``scrape_var_js`` finds the variable, or ``None`` if none does.
    """
    for script in pq(*args, **kwargs)('body script'):
        if not script.text:
            # Skip script elements with no inline text.
            continue
        ret = scrape_var_js(script.text, var)
        # scrape_var_js signals "not found" with None, so compare against None
        # rather than truthiness; otherwise legitimate falsy values such as
        # 0, '', [] or {} would be mistaken for a miss.
        if ret is not None:
            return ret
    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment