Skip to content

Instantly share code, notes, and snippets.

@Guest007
Forked from nl5887/gist:b981b217338494682bf7
Last active September 3, 2015 05:15
Show Gist options
  • Save Guest007/78502af4e2cfaf0f0ca8 to your computer and use it in GitHub Desktop.
Save Guest007/78502af4e2cfaf0f0ca8 to your computer and use it in GitHub Desktop.
Scrapy spider with V8 javascript parser. More info at http://dutchcoders.ghost.io/using-scrapy-and-pyv8-to-scrape-inline-javascript/.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.request import Request
from scrapy.selector import Selector
import urllib2
import re
import PyV8
import json
from pdc.items import Product
class V8Spider(scrapy.Spider):
    """Spider that evaluates a page's JavaScript with PyV8 to read inline data."""

    def parse_page(self, response):
        """Yield one item per colour/size variant described by ``productsData``.

        Every ``<script>`` element of ``response`` is evaluated inside a new
        ``PyV8.JSContext`` whose global object is ``Global()``; scripts with a
        ``src`` attribute are downloaded with ``requests`` first. Afterwards
        the JavaScript variable ``productsData`` is read back and converted
        with ``PyV8.convert``.

        ``Global`` and ``js_escape_unicode`` are not defined in this block and
        must be in scope where the spider lives.

        :param response: response whose ``meta['item']`` holds the item to
            complete; its ``link`` field is set to ``response.url``.
        :returns: generator of item copies, one per colour/size combination,
            or the base item alone when no ``productsData`` could be read.
        """
        import traceback

        item = response.meta['item']
        item['link'] = response.url

        exts = []
        productsData = None

        with PyV8.JSContext(Global(), extensions=exts) as ctxt:
            for script in response.xpath("//script"):
                # Best effort: a script that fails to download or evaluate is
                # logged and skipped so the remaining scripts still run.
                try:
                    sources = script.xpath("@src").extract()
                    if sources:
                        src = sources[0]
                        import requests
                        r = requests.get(src)
                        print("loading script source ", src)
                        escaped = js_escape_unicode(r.text)
                        # The extension object is never used afterwards; it is
                        # still constructed in case creation registers it
                        # with PyV8 -- TODO confirm whether this is needed.
                        PyV8.JSExtension(str(src), escaped)
                        ctxt.eval(escaped)
                    inline = script.xpath("text()").extract()
                    if inline:
                        ctxt.eval(str(js_escape_unicode(inline[0])))
                except Exception:
                    traceback.print_exc()

            # Evaluate once and test the *value*: the wrapping array always
            # has one element, so testing the array itself never detects a
            # missing ``productsData``.
            try:
                raw = ctxt.eval("[productsData]")[0]
            except Exception:
                traceback.print_exc()
                raw = None
            if raw is not None:
                productsData = PyV8.convert(raw)

        # The context is closed before anything is yielded so it is not held
        # open while the consumer processes items.
        if not productsData:
            yield item
            return

        for color_key, color in productsData['colors'].items():
            for size_key in color['sizes']:
                size = productsData['sizes'][size_key]
                product_key = "{0}_{1}".format(color_key, size_key)
                product = productsData['products'][product_key]

                subitem = item.copy()
                subitem['productid'] = product['id']
                subitem['title'] = "{0} {1} {2}".format(
                    product['name'], size['label'], color['label'])
                subitem['img'] = color['media']['images'][0]['page']

                # The price is split between <strong> (main part) and a
                # nested <sup>; both text fragments are concatenated.
                price = Selector(text=product['price_html'])
                main_part = price.xpath(
                    '//span[@class="new_price"]/strong/text()').extract()[0].strip()
                sup_part = price.xpath(
                    '//span[@class="new_price"]/strong/sup/text()').extract()[0].strip()
                subitem['price'] = main_part + sup_part
                yield subitem
# Text of the article
When using Scrapy it is easy to scrape HTML using selectors, but when you are confronted with inline javascript objects in the html it is another story.
I'm using PyV8 to evaluate the imported scripts and inline javascript. The javascript objects in gistfile2.py allow the javascript libraries to access browser variables, like window, history and selectors. The functions I implemented are sufficient to run jQuery and other frameworks. This works in much the same way as the Google crawler, where javascript is also interpreted and evaluated.
Warning: this is merely a proof of concept rather than production-ready code.
First you need to download and install PyV8.
Google V8 - http://code.google.com/p/v8/
PyV8 - http://code.google.com/p/pyv8/
This is the code of the scraper: gistfile1.py
This code mimics the browser: gistfile2.py
As you can see, it gets a page, creates a new context using Global() and evaluates all script tags. If the script tag is remote, it downloads and runs it. The end result is that you can simply ask for objects within the page, in this case productsData, and use them as Python objects.
Work todo:
* make a nice library
* cache the evaluated context and downloaded scripts
* further enhance the browser mimicking.
import PyV8
class js_dom_stylesheet(PyV8.JSClass):
    """Placeholder stylesheet object exposing fixed ``cssRules`` and ``cssText``."""

    def __init__(self, document, *args, **kwargs):
        # ``document`` and any extra arguments are accepted but not used.
        placeholder = "test"
        self.cssRules = [{'cssText': placeholder}]
        self.cssText = placeholder

    # The three attribute hooks below only forward to the parent class.

    def __getattr__(self, name):
        parent = super(js_dom_stylesheet, self)
        return parent.__getattr__(name)

    def __setattr__(self, name, value):
        parent = super(js_dom_stylesheet, self)
        parent.__setattr__(name, value)

    def __delattr__(self, name):
        parent = super(js_dom_stylesheet, self)
        parent.__delattr__(name)
class js_dom_element(PyV8.JSClass):
    """Stub DOM element: fixed attribute values and mostly no-op methods."""

    def __init__(self, document, *args, **kwargs):
        print("js_dom_element", args, kwargs)
        # NOTE(review): the appVersion value ends with a stray '"' character;
        # it is reproduced unchanged here -- confirm whether it is a typo.
        user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36"
        app_version = '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36"'
        self.navigator = {'userAgent': user_agent, 'appVersion': app_version}
        self.tagName = "HTML"
        self.nodeType = 9
        self.style = {'background': None}
        # One stylesheet instance is shared by both attribute names.
        stylesheet = js_dom_stylesheet(document)
        self.sheet = stylesheet
        self.styleSheet = stylesheet
        self.innerHTML = ""
        self.className = ""
        self.id = ""
        self.offsetLeft = 0
        self.offsetHeight = 0
        self.document = document
        self.ownerDocument = document

    def __str__(self):
        """Return the class-level properties followed by the instance dict."""
        return "".join([str(self.__properties__), str(self.__dict__)])

    # --- attribute hooks: plain forwarding to the parent class -------------

    def __getattr__(self, name):
        parent = super(js_dom_element, self)
        return parent.__getattr__(name)

    def __setattr__(self, name, value):
        parent = super(js_dom_element, self)
        parent.__setattr__(name, value)

    def __delattr__(self, name):
        parent = super(js_dom_element, self)
        parent.__delattr__(name)

    # Disabled in the original (was wrapped in a bare string literal):
    # def __getitem__(self, key):
    #     print ("__getitem__", key)
    #     return super(js_dom_element, self).__getitem__(key)
    # def __setitem__(self, key, value):
    #     print ("__setitem__", key)
    #     super(js_dom_element, self).__setitem__(key, value)
    #     pass
    # def __delitem__(self, key):
    #     print ("__delitem__", key)
    #     super(js_dom_element, self).__delitem__(key)
    #     pass

    # --- tree manipulation -------------------------------------------------

    def appendChild(self, *args, **kwargs):
        """Set this element as the first argument's ``parentNode`` and return it."""
        child = args[0]
        child.parentNode = self
        return child

    def insertBefore(self, *args, **kwargs):
        """Same effect as ``appendChild``: only the first argument is used."""
        child = args[0]
        child.parentNode = self
        return child

    def removeChild(self, *args, **kwargs):
        return None

    # Disabled in the original (was wrapped in a bare string literal):
    # def parentNode(self, *args, **kwargs):
    #     print ("parentNode")
    #     return js_dom_element(self.document)

    def getElementsByTagName(self, *args, **kwargs):
        """Return a one-element list holding a new stub element."""
        return [js_dom_element(self.document)]

    # --- geometry / style ----------------------------------------------------

    def getBoundingClientRect(self, *args, **kwargs):
        return {}

    def getComputedStyle(self, *args, **kwargs):
        return {}

    def offsetTop(self, *args, **kwargs):
        return 0

    # --- attributes ----------------------------------------------------------

    def getAttribute(self, *args, **kwargs):
        return None

    def setAttribute(self, *args, **kwargs):
        return None

    # --- events: every handler is a no-op ------------------------------------

    def ondrop(self, *args, **kwargs):
        return None

    def ondragstart(self, *args, **kwargs):
        return None

    def attachEvent(self, *args, **kwargs):
        return None

    def addEventListener(self, *args, **kwargs):
        return None
class js_window(PyV8.JSClass):
    """Stub ``window`` object: fixed location/navigator data and no-op methods.

    Attribute reads, writes and deletes are traced with ``print``. Reading an
    attribute the parent class cannot resolve yields ``None`` instead of
    raising, and a write rejected with ``AttributeError`` is reported and
    ignored.
    """

    def __init__(self, document):
        """Store ``document`` and populate the fixed browser-like attributes."""
        self.location = {'href': '', 'hostname': 'www.test.nl'}
        self.Event = {}
        self.document = document
        # NOTE(review): the appVersion value ends with a stray '"' character;
        # it is kept unchanged here -- confirm whether it is a typo.
        self.navigator = {'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36", 'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36"'}

    def top(self):
        return self

    def self(self):
        return self

    def WebSocket(self, *args, **kwargs):
        pass

    def ontouchstart(self, *args, **kwargs):
        pass

    def setTimeout(self, *args, **kwargs):
        pass

    def postMessage(self, *args, **kwargs):
        pass

    def pushState(self, *args, **kwargs):
        pass

    def history(self, *args, **kwargs):
        pass

    def addEventListener(self, *args, **kwargs):
        pass

    def attachEvent(self, *args, **kwargs):
        pass

    def __setattr__(self, name, value):
        """Trace the assignment; an ``AttributeError`` is reported, not raised."""
        print("js_window.__setattr__", name, value)
        try:
            super(js_window, self).__setattr__(name, value)
        except AttributeError:
            print("__setattr__,AttributeError")

    def __getattr__(self, name):
        """Return the parent's value for ``name``, or ``None`` if it has none.

        The lookup is done exactly once and inside the ``try``. Previously
        the debug print performed a second, unguarded lookup, so an
        ``AttributeError`` escaped before the fallback could apply.
        """
        print("js_window.___getattr__", name)
        try:
            value = super(js_window, self).__getattr__(name)
        except AttributeError:
            return None
        print("js_window.___getattr__", name, value)
        return value

    def __delattr__(self, name):
        """Trace the deletion and forward it to the parent class."""
        print("js_window.__delattr__", name)
        super(js_window, self).__delattr__(name)
class js_event(PyV8.JSClass):
    """Bare event object whose only state is an empty ``__proto__`` mapping."""

    def __init__(self):
        # The name ends in double underscores, so no name mangling applies.
        self.__proto__ = dict()
class js_document(PyV8.JSClass):
    """Stub ``document`` object that hands out ``js_dom_element`` placeholders.

    Attribute reads, writes and deletes are traced with ``print``; reading an
    attribute the parent class cannot resolve yields ``None``.
    """

    def __init__(self):
        # Assignment order is kept as-is: each one is traced by __setattr__
        # and the constructors print as well.
        self.window = js_window(self)
        self.body = js_dom_element(self)
        self.location = {'href': '', 'hostname': 'www.test.nl'}
        self.documentElement = js_dom_element(self)
        # NOTE(review): the appVersion value ends with a stray '"' character;
        # it is reproduced unchanged here -- confirm whether it is a typo.
        user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36"
        app_version = '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36"'
        self.navigator = {'userAgent': user_agent, 'appVersion': app_version}

    # --- attribute hooks -----------------------------------------------------

    def __setattr__(self, name, value):
        """Trace the attribute name, then forward the write to the parent."""
        print("__setattr__", name)
        super(js_document, self).__setattr__(name, value)

    def __getattr__(self, name):
        """Trace the read; return the parent's value or ``None`` if it raises."""
        print("js_document.___getattr__", name)
        try:
            return super(js_document, self).__getattr__(name)
        except AttributeError:
            return None

    def __delattr__(self, name):
        """Trace the attribute name, then forward the delete to the parent."""
        print("__delattr__", name)
        super(js_document, self).__delattr__(name)

    # --- node factories: each returns a new stub element ---------------------

    def createElement(self, *args, **kwargs):
        return js_dom_element(self, *args, **kwargs)

    def createTextNode(self, *args, **kwargs):
        return js_dom_element(self, *args, **kwargs)

    def createComment(self, *args, **kwargs):
        return js_dom_element(self, *args, **kwargs)

    def createDocumentFragment(self, *args, **kwargs):
        return js_dom_element(self, *args, **kwargs)

    def createEvent(self, *args, **kwargs):
        return js_event()

    # --- lookups ---------------------------------------------------------------

    def getElementById(self, *args, **kwargs):
        """Trace the call and return a new stub element for any id."""
        print("getElementById", args, kwargs)
        return js_dom_element(self)

    def getElementsByTagName(self, *args, **kwargs):
        """Return a one-element list holding a new stub element."""
        return [js_dom_element(self)]

    def querySelector(self, *args, **kwargs):
        return None

    def evaluate(self, *args, **kwargs):
        return None

    # --- tree manipulation and events: all no-ops ------------------------------

    def appendChild(self, *args, **kwargs):
        return None

    def removeChild(self, *args, **kwargs):
        return None

    def observe(self, *args, **kwargs):
        return None

    def attachEvent(self, *args, **kwargs):
        return None

    def addEventListener(self, *args, **kwargs):
        return None
class Global(PyV8.JSClass):  # define a compatible javascript class
    """Global object for the JS context; unknown names are looked up on ``window``.

    Holds a ``js_document``, that document's ``window``, a ``navigator``
    mapping and an ``HTMLElement`` stub. Any name not found on this object is
    resolved against ``self.window``.
    """

    def __init__(self):
        self.document = js_document()
        self.window = self.document.window
        # NOTE(review): the appVersion value ends with a stray '"' character;
        # it is kept unchanged here -- confirm whether it is a typo.
        self.navigator = {'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36", 'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36"'}
        self.HTMLElement = js_dom_element(self.document)

    def Image(self):
        """Return a new stub element bound to this object's document."""
        return js_dom_element(self.document)

    def __setattr__(self, name, value):
        # super(PyV8.JSClass, self) starts the lookup *after* JSClass in the
        # MRO, so JSClass's own __setattr__ is skipped (presumably on
        # purpose -- TODO confirm).
        super(PyV8.JSClass, self).__setattr__(name, value)

    def __getattr__(self, name):
        """Resolve a name missing on this object by delegating to ``window``.

        Uses ``getattr`` so that the window's ordinary attributes and methods
        are found too; calling ``window.__getattr__`` directly bypasses normal
        attribute lookup. The lookup is performed once.

        :raises AttributeError: if ``window`` is not set yet or does not
            provide a non-``None`` value for ``name``.
        """
        # Read ``window`` from the instance dict: going through ``self.window``
        # would re-enter this method endlessly if it has not been assigned.
        window = self.__dict__.get('window')
        if window is not None:
            value = getattr(window, name, None)
            if value is not None:
                return value
        raise AttributeError(name)

    def __delattr__(self, name):
        # Same MRO shortcut as __setattr__: JSClass itself is skipped.
        super(PyV8.JSClass, self).__delattr__(name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment