Skip to content

Instantly share code, notes, and snippets.

@nl5887
Last active November 9, 2020 14:57
Show Gist options
  • Save nl5887/b981b217338494682bf7 to your computer and use it in GitHub Desktop.
Save nl5887/b981b217338494682bf7 to your computer and use it in GitHub Desktop.
Scrapy spider with a V8 JavaScript parser. More info at http://dutchcoders.ghost.io/using-scrapy-and-pyv8-to-scrape-inline-javascript/.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.request import Request
from scrapy.selector import Selector
import urllib2
import re
import PyV8
import json
from pdc.items import Product
class V8Spider(scrapy.Spider):
    """Spider whose page callback evaluates the page's scripts with PyV8."""

    def parse_page(self, response):
        """Run the page's scripts and yield one item per product variant.

        The partially filled item is read from ``response.meta['item']``.
        Every ``<script>`` element of the response is evaluated in a fresh
        ``PyV8.JSContext`` whose global object is ``Global()``; afterwards the
        JavaScript global ``productsData`` is read back and expanded.

        Args:
            response: Response carrying the item under ``meta['item']``.

        Yields:
            A copy of the item for every colour/size combination found in
            ``productsData``, or the item itself (with only ``link`` set here)
            when the scripts did not define ``productsData``.
        """
        item = response.meta['item']
        item['link'] = response.url

        exts = []
        with PyV8.JSContext(Global(), extensions=exts) as ctxt:
            for script in response.xpath("//script"):
                # Best effort: a script that cannot be fetched or evaluated is
                # reported on stderr and skipped so later scripts still run.
                try:
                    src_values = script.xpath("@src").extract()
                    if src_values:
                        src = src_values[0]
                        # NOTE(review): src is fetched verbatim; a relative URL
                        # is not resolved against response.url and will end up
                        # in the except branch below.
                        import requests
                        r = requests.get(src)
                        print("loading script source ", src)
                        escaped = js_escape_unicode(r.text)
                        # The binding is unused; the constructor call is kept
                        # because it presumably registers the extension with
                        # PyV8 -- TODO confirm.
                        ext = PyV8.JSExtension(str(src), escaped)
                        ctxt.eval(escaped)

                    inline_values = script.xpath("text()").extract()
                    if inline_values:
                        ctxt.eval(str(js_escape_unicode(inline_values[0])))
                except Exception:
                    import traceback
                    traceback.print_exc()

            # `typeof` never throws for an undeclared identifier, whereas
            # evaluating "[productsData]" directly raises when the global is
            # missing and therefore could never reach the fallback branch.
            has_products = ctxt.eval(
                "typeof productsData !== 'undefined' && productsData !== null"
            )
            if not has_products:
                yield item
                return

            products_data = PyV8.convert(ctxt.eval("[productsData]")[0])

            for color_key, color in products_data['colors'].items():
                for size_key in color['sizes']:
                    size = products_data['sizes'][size_key]
                    product_key = "{0}_{1}".format(color_key, size_key)
                    product = products_data['products'][product_key]

                    subitem = item.copy()
                    subitem['productid'] = product['id']
                    subitem['title'] = "{0} {1} {2}".format(
                        product['name'], size['label'], color['label']
                    )
                    subitem['img'] = color['media']['images'][0]['page']

                    # The price is the <strong> text followed by the text of
                    # its nested <sup>, both taken from the price HTML snippet.
                    price = Selector(text=product['price_html'])
                    strong_text = price.xpath(
                        '//span[@class="new_price"]/strong/text()'
                    ).extract()[0].strip()
                    sup_text = price.xpath(
                        '//span[@class="new_price"]/strong/sup/text()'
                    ).extract()[0].strip()
                    subitem['price'] = strong_text + sup_text
                    yield subitem
import PyV8
class js_dom_stylesheet(PyV8.JSClass):
    """Placeholder stylesheet object handed to scripts by the DOM stubs."""

    def __init__(self, document, *args, **kwargs):
        """Set fixed ``cssRules`` / ``cssText`` values; arguments are ignored."""
        placeholder_rule = {'cssText': "test"}
        self.cssRules = [placeholder_rule]
        self.cssText = "test"

    def __getattr__(self, name):
        """Delegate lookup of a missing attribute to the parent class."""
        parent = super(js_dom_stylesheet, self)
        return parent.__getattr__(name)

    def __setattr__(self, name, value):
        """Delegate attribute assignment to the parent class."""
        parent = super(js_dom_stylesheet, self)
        parent.__setattr__(name, value)

    def __delattr__(self, name):
        """Delegate attribute deletion to the parent class."""
        parent = super(js_dom_stylesheet, self)
        parent.__delattr__(name)
class js_dom_element(PyV8.JSClass):
    """Stub DOM element exposed to scripts evaluated in the PyV8 context.

    The methods accept arbitrary arguments and return fixed placeholder
    values; only ``appendChild`` / ``insertBefore`` record anything, by
    setting ``parentNode`` on the node they are given.
    """

    def __init__(self, document, *args, **kwargs):
        """Populate the element with placeholder attributes.

        Args:
            document: Owning document; stored as ``document`` and
                ``ownerDocument`` and passed on to the stylesheet stub.
            *args: Extra positional arguments, only echoed in the debug print.
            **kwargs: Extra keyword arguments, only echoed in the debug print.
        """
        print ("js_dom_element", args, kwargs)
        self.navigator = {
            'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36",
            # Fixed: this value used to end with a stray '"' character.
            'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36',
        }
        self.tagName = "HTML"
        self.nodeType = 9
        self.style = {'background': None}
        self.sheet = self.styleSheet = js_dom_stylesheet(document)
        self.innerHTML = ""
        self.className = ""
        self.id = ""
        self.offsetLeft = self.offsetHeight = 0
        self.document = self.ownerDocument = document

    def __str__(self):
        """Return the property table followed by the instance dict, as text."""
        return str(self.__properties__) + str(self.__dict__)

    def appendChild(self, *args, **kwargs):
        """Mark this element as parent of the first argument and return it."""
        args[0].parentNode = self
        return args[0]

    def getBoundingClientRect(self, *args, **kwargs):
        """Return an empty dict as the bounding rectangle."""
        return {}

    def removeChild(self, *args, **kwargs):
        """Do nothing and return None."""
        return None

    # Disabled stub, kept for reference:
    # def parentNode(self, *args, **kwargs):
    #     print ("parentNode")
    #     return js_dom_element(self.document)

    def insertBefore(self, *args, **kwargs):
        """Mark this element as parent of the first argument and return it."""
        args[0].parentNode = self
        return args[0]

    def offsetTop(self, *args, **kwargs):
        """Return 0."""
        return 0

    def getAttribute(self, *args, **kwargs):
        """Return None for every attribute name."""
        return None

    def ondrop(self, *args, **kwargs):
        """No-op event hook."""
        pass

    def ondragstart(self, *args, **kwargs):
        """No-op event hook."""
        pass

    def setAttribute(self, *args, **kwargs):
        """Ignore the attribute and return None."""
        return None

    # Disabled item-access tracing hooks, kept for reference:
    # def __getitem__(self, key):
    #     print ("__getitem__", key)
    #     return super(js_dom_element, self).__getitem__(key)
    # def __setitem__(self, key, value):
    #     print ("__setitem__", key)
    #     super(js_dom_element, self).__setitem__(key, value)
    # def __delitem__(self, key):
    #     print ("__delitem__", key)
    #     super(js_dom_element, self).__delitem__(key)

    def __getattr__(self, name):
        """Delegate lookup of a missing attribute to the parent class."""
        return super(js_dom_element, self).__getattr__(name)

    def __setattr__(self, name, value):
        """Delegate attribute assignment to the parent class."""
        super(js_dom_element, self).__setattr__(name, value)

    def __delattr__(self, name):
        """Delegate attribute deletion to the parent class."""
        super(js_dom_element, self).__delattr__(name)

    def attachEvent(self, *args, **kwargs):
        """No-op; listeners are never registered."""
        pass

    def getComputedStyle(self, *args, **kwargs):
        """Return an empty dict as the computed style."""
        return {}

    def getElementsByTagName(self, *args, **kwargs):
        """Return a list holding one fresh stub element."""
        return [js_dom_element(self.document)]

    def addEventListener(self, *args, **kwargs):
        """No-op; listeners are never registered."""
        pass
class js_window(PyV8.JSClass):
    """Stub ``window`` object exposed to scripts in the PyV8 context.

    Attribute reads, writes and deletions are traced with ``print``; the
    browser-API methods are no-ops.
    """

    def __init__(self, document):
        """Store the document and set placeholder location/navigator data.

        Args:
            document: The document object this window belongs to.
        """
        self.location = {'href': '', 'hostname': 'www.test.nl'}
        self.Event = {}
        self.document = document
        self.navigator = {
            'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36",
            # Fixed: this value used to end with a stray '"' character.
            'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36',
        }

    def top(self):
        """Return this window."""
        return self

    def self(self):
        """Return this window."""
        return self

    def WebSocket(self, *args, **kwargs):
        """No-op stub."""
        pass

    def ontouchstart(self, *args, **kwargs):
        """No-op stub."""
        pass

    def setTimeout(self, *args, **kwargs):
        """No-op stub; the callback is never invoked."""
        pass

    def postMessage(self, *args, **kwargs):
        """No-op stub."""
        pass

    def pushState(self, *args, **kwargs):
        """No-op stub."""
        pass

    def history(self, *args, **kwargs):
        """No-op stub."""
        pass

    def __setattr__(self, name, value):
        """Trace the assignment, then store it; AttributeError is reported, not raised."""
        print ("js_window.__setattr__", name, value)
        #super(js_window, self).__setattr__(name, value)
        #print (value, self.__getattr__(name))
        try:
            super(js_window, self).__setattr__(name, value)
        except AttributeError:
            print ("__setattr__,AttributeError")

    def __getattr__(self, name):
        """Trace the lookup of a missing attribute and resolve it via the parent.

        Returns:
            The parent's value, or None when the parent raises AttributeError.
        """
        print ("js_window.___getattr__", name)
        # The parent lookup must happen inside the try block: it used to be
        # called first inside a debug print, so its AttributeError escaped and
        # the None fallback was never reached.
        try:
            value = super(js_window, self).__getattr__(name)
        except AttributeError:
            return None
        print ("js_window.___getattr__", name, value)
        return value

    def __delattr__(self, name):
        """Trace the deletion, then delegate it to the parent class."""
        print ("js_window.__delattr__", name)
        super(js_window, self).__delattr__(name)

    def addEventListener(self, *args, **kwargs):
        """No-op; listeners are never registered."""
        pass

    def attachEvent(self, *args, **kwargs):
        """No-op; listeners are never registered."""
        pass
class js_event(PyV8.JSClass):
    """Minimal event object returned by the document stub's ``createEvent``."""

    def __init__(self):
        """Give the event an empty ``__proto__`` mapping."""
        # Dunder-suffixed names are not name-mangled, so this is the same
        # attribute as a plain `self.__proto__` assignment.
        setattr(self, '__proto__', {})
class js_document(PyV8.JSClass):
    """Stub ``document`` object exposed to scripts in the PyV8 context.

    Creates its own ``js_window`` and hands out ``js_dom_element`` stubs for
    every element lookup or creation call.
    """

    def __init__(self):
        """Create the window, body and documentElement stubs and placeholder data."""
        self.window = js_window(self)
        self.body = js_dom_element(self)
        self.location = {'href': '', 'hostname': 'www.test.nl'}
        self.documentElement = js_dom_element(self)
        self.navigator = {
            'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36",
            # Fixed: this value used to end with a stray '"' character.
            'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36',
        }

    def appendChild(self, *args, **kwargs):
        """Do nothing and return None."""
        return None

    def removeChild(self, *args, **kwargs):
        """Do nothing and return None."""
        return None

    def getElementById(self, *args, **kwargs):
        """Trace the call and return a fresh stub element for any id."""
        print ("getElementById", args, kwargs)
        return js_dom_element(self)

    def attachEvent(self, *args, **kwargs):
        """No-op; listeners are never registered."""
        pass

    def createEvent(self, *args, **kwargs):
        """Return a new ``js_event``."""
        return js_event()

    def getElementsByTagName(self, *args, **kwargs):
        """Return a list holding one fresh stub element."""
        return [js_dom_element(self)]

    def createTextNode(self, *args, **kwargs):
        """Return a fresh stub element built with the given arguments."""
        return js_dom_element(self, *args, **kwargs)

    def createComment(self, *args, **kwargs):
        """Return a fresh stub element built with the given arguments."""
        return js_dom_element(self, *args, **kwargs)

    def createDocumentFragment(self, *args, **kwargs):
        """Return a fresh stub element built with the given arguments."""
        return js_dom_element(self, *args, **kwargs)

    def createElement(self, *args, **kwargs):
        """Return a fresh stub element built with the given arguments."""
        return js_dom_element(self, *args, **kwargs)

    def querySelector(self, *args, **kwargs):
        """No-op stub; returns None."""
        pass

    def evaluate(self, *args, **kwargs):
        """No-op stub; returns None."""
        pass

    def observe(self, *args, **kwargs):
        """No-op stub; returns None."""
        pass

    def __setattr__(self, name, value):
        """Trace the assignment, then delegate it to the parent class."""
        print ("__setattr__", name)
        super(js_document, self).__setattr__(name, value)

    def __getattr__(self, name):
        """Trace the lookup of a missing attribute; return None if the parent raises."""
        print ("js_document.___getattr__", name)
        try:
            return super(js_document, self).__getattr__(name)
        except AttributeError:
            return None

    def addEventListener(self, *args, **kwargs):
        """No-op; listeners are never registered."""
        pass

    def __delattr__(self, name):
        """Trace the deletion, then delegate it to the parent class."""
        print ("__delattr__", name)
        super(js_document, self).__delattr__(name)
class Global(PyV8.JSClass): # define a compatible javascript class
    """Global object for the PyV8 context, backed by the document/window stubs.

    Names that are not found on this object are looked up on ``window``
    before the lookup is handed to ``PyV8.JSClass``.
    """

    def __init__(self):
        """Create the document stub and expose its window, navigator data and HTMLElement."""
        self.document = js_document()
        self.window = self.document.window
        self.navigator = {
            'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36",
            # Fixed: this value used to end with a stray '"' character.
            'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36',
        }
        self.HTMLElement = js_dom_element(self.document)

    def Image(self):
        """Return a fresh stub element standing in for an image."""
        return js_dom_element(self.document)

    def __setattr__(self, name, value):
        """Store the attribute using the implementation after ``PyV8.JSClass`` in the MRO."""
        # NOTE(review): super(PyV8.JSClass, self) deliberately or accidentally
        # skips JSClass's own __setattr__; left unchanged -- confirm intent.
        super(PyV8.JSClass, self).__setattr__(name, value)

    def __getattr__(self, name):
        """Resolve a name missing on this object via ``window``, then ``JSClass``.

        Raises:
            AttributeError: If neither the window nor the parent class knows
                the name.
        """
        # Without this guard, reading self.window before it has been assigned
        # would re-enter this method endlessly.
        if name == 'window':
            raise AttributeError(name)

        # Use normal attribute access: calling window.__getattr__ directly
        # only runs the missing-attribute hook and so never sees attributes
        # or methods that actually exist on the window object.
        value = getattr(self.window, name, None)
        if value is not None:
            return value

        # super(PyV8.JSClass, self) resolved against `object`, which defines
        # no __getattr__; start the search at this class instead.
        return super(Global, self).__getattr__(name)

    def __delattr__(self, name):
        """Delete the attribute using the implementation after ``PyV8.JSClass`` in the MRO."""
        super(PyV8.JSClass, self).__delattr__(name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment