-
-
Save Guest007/78502af4e2cfaf0f0ca8 to your computer and use it in GitHub Desktop.
Scrapy spider with V8 javascript parser. More info at http://dutchcoders.ghost.io/using-scrapy-and-pyv8-to-scrape-inline-javascript/.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from scrapy.http.request import Request | |
from scrapy.selector import Selector | |
import urllib2 | |
import re | |
import PyV8 | |
import json | |
from pdc.items import Product | |
class V8Spider(scrapy.Spider):
    """Spider that runs a page's scripts in PyV8 to read its ``productsData`` object."""

    def parse_page(self, response):
        """Yield one item per colour/size variant described by ``productsData``.

        The item found in ``response.meta['item']`` gets its ``link`` field
        set to ``response.url``.  All ``<script>`` elements of the page are
        then evaluated in a PyV8 context.  When the scripts define a global
        ``productsData`` object, a copy of the item is yielded for every
        colour/size combination it lists; otherwise the item itself is
        yielded once.

        Raises:
            KeyError / IndexError: if ``productsData`` lacks an expected key
                or the price markup has no ``new_price`` span.
        """
        item = response.meta['item']
        item['link'] = response.url

        # The data is converted to plain Python objects before anything is
        # yielded, so no V8 context stays entered while the generator is
        # suspended.
        products_data = self._load_products_data(response)
        if not products_data:
            yield item
            return

        new_price = '//span[@class="new_price"]/strong'
        for color_key in products_data['colors']:
            color = products_data['colors'][color_key]
            for size_key in color['sizes']:
                size = products_data['sizes'][size_key]
                product_key = "{0}_{1}".format(color_key, size_key)
                product = products_data['products'][product_key]

                subitem = item.copy()
                subitem['productid'] = product['id']
                subitem['title'] = "{0} {1} {2}".format(
                    product['name'], size['label'], color['label'])
                subitem['img'] = color['media']['images'][0]['page']

                # The price is split over <strong> (whole part) and a nested
                # <sup> (remainder); both text nodes are joined.
                price = Selector(text=product['price_html'])
                whole = price.xpath(new_price + '/text()').extract()[0].strip()
                rest = price.xpath(new_price + '/sup/text()').extract()[0].strip()
                subitem['price'] = whole + rest
                yield subitem

    def _load_products_data(self, response):
        """Evaluate the page's scripts and return ``productsData`` as Python data.

        Returns ``None`` when no script defined a non-null ``productsData``
        global.  Errors raised while downloading or evaluating a single
        script are printed and do not stop the remaining scripts.
        """
        import traceback

        import requests
        try:
            from urllib.parse import urljoin
        except ImportError:  # Python 2
            from urlparse import urljoin

        with PyV8.JSContext(Global(), extensions=[]) as ctxt:
            for script in response.xpath("//script"):
                try:
                    sources = script.xpath("@src").extract()
                    if sources:
                        # Resolve relative and protocol-relative URLs against
                        # the page; absolute URLs pass through unchanged.
                        src = urljoin(response.url, sources[0])
                        print("loading script source %r" % (src,))
                        remote = requests.get(src, timeout=30)
                        # No PyV8.JSExtension is created here: the context
                        # was built with an empty extension list, so such an
                        # object was never used.
                        ctxt.eval(js_escape_unicode(remote.text))
                    inline = script.xpath("text()").extract()
                    if inline:
                        ctxt.eval(str(js_escape_unicode(inline[0])))
                except Exception:
                    # Best effort: a broken script must not abort the page.
                    traceback.print_exc()

            # ``typeof`` does not throw for an undeclared identifier, unlike
            # evaluating ``[productsData]`` directly.
            defined = ctxt.eval(
                "typeof productsData !== 'undefined' && productsData !== null")
            if not defined:
                return None
            return PyV8.convert(ctxt.eval("[productsData]")[0])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Text of the article | |
When using Scrapy it is easy to scrape HTML using selectors, but when you are confronted with inline javascript objects in the html it is another story.
I'm using PyV8 to evaluate the imported scripts and inline javascript. The javascript objects in gistfile2.py allow the javascript libraries to access browser variables, like window, history and selectors. The functions I implemented are sufficient to run jQuery and other frameworks. This will work about the same in the Google crawler, where javascript is also being interpreted and evaluated.
Warning: this is merely a proof of concept rather than production-ready code.
First you need to download and install PyV8. | |
Google V8 - http://code.google.com/p/v8/ | |
PyV8 - http://code.google.com/p/pyv8/ | |
This is the code of the scraper: gistfile1.py | |
This code mimics the browser: gistfile2.py | |
As you can see, it gets a page, creates a new context using the Global() and evaluates all script tags. If the script tag is remote, it downloads and runs it. The end result is that you can just call for objects within the page, in this case productsData, and use it as a Python object.
Work to do:
* make a nice library | |
* cache the evaluated context and downloaded scripts | |
* further enhance the browser mimicking.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import PyV8 | |
class js_dom_stylesheet(PyV8.JSClass):
    """Placeholder stylesheet object exposed to scripts evaluated in PyV8."""

    def __init__(self, document, *args, **kwargs):
        # ``document`` and any extra arguments are accepted but not used.
        rule = {'cssText': "test"}
        self.cssRules = [rule]
        self.cssText = "test"

    def __getattr__(self, name):
        # Straight delegation to the PyV8.JSClass implementation.
        parent = super(js_dom_stylesheet, self)
        return parent.__getattr__(name)

    def __setattr__(self, name, value):
        parent = super(js_dom_stylesheet, self)
        parent.__setattr__(name, value)

    def __delattr__(self, name):
        parent = super(js_dom_stylesheet, self)
        parent.__delattr__(name)
class js_dom_element(PyV8.JSClass):
    """Stub DOM element handed to scripts evaluated in PyV8.

    Every instance carries the same fixed set of attributes and its methods
    are no-ops or return empty placeholders; nothing is parsed or rendered.
    """

    def __init__(self, document, *args, **kwargs):
        """Create a stub element owned by ``document``; extra args are only printed."""
        print ("js_dom_element", args, kwargs)
        # appVersion is the user agent without the leading "Mozilla/".
        self.navigator = {'userAgent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36", 'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36'}
        self.tagName = "HTML"
        # NOTE(review): 9 is the DOM DOCUMENT_NODE value although tagName is
        # "HTML" - confirm this combination is intended.
        self.nodeType = 9
        self.style = {'background': None }
        self.sheet = self.styleSheet = js_dom_stylesheet(document)
        self.innerHTML = ""
        self.className = ""
        self.id = ""
        self.offsetLeft = self.offsetHeight = 0
        self.document = self.ownerDocument = document

    def __str__(self):
        return str(self.__properties__) + str(self.__dict__)

    def appendChild(self, *args, **kwargs):
        """Mark this element as the parent of the first argument and return it."""
        child = args[0]
        child.parentNode = self
        return child

    def getBoundingClientRect(self, *args, **kwargs):
        return {}

    def removeChild(self, *args, **kwargs):
        return None

    # ``parentNode`` is not a method here: appendChild/insertBefore set it as
    # a plain attribute on the child.

    def insertBefore(self, *args, **kwargs):
        """Mark this element as the parent of the first argument and return it."""
        child = args[0]
        child.parentNode = self
        return child

    def offsetTop(self, *args, **kwargs):
        return 0

    def getAttribute(self, *args, **kwargs):
        return None

    def ondrop(self, *args, **kwargs):
        pass

    def ondragstart(self, *args, **kwargs):
        pass

    def setAttribute(self, *args, **kwargs):
        return None

    # Attribute access is delegated unchanged to PyV8.JSClass.
    def __getattr__(self, name):
        return super(js_dom_element, self).__getattr__(name)

    def __setattr__(self, name, value):
        super(js_dom_element, self).__setattr__(name, value)

    def __delattr__(self, name):
        super(js_dom_element, self).__delattr__(name)

    def attachEvent(self, *args, **kwargs):
        pass

    def getComputedStyle(self, *args, **kwargs):
        return {}

    def getElementsByTagName(self, *args, **kwargs):
        """Return a one-element list holding a fresh stub element."""
        return [js_dom_element(self.document)]

    def addEventListener(self, *args, **kwargs):
        pass
class js_window(PyV8.JSClass):
    """Stub ``window`` object for scripts evaluated in PyV8.

    Holds a fixed location and navigator, keeps a reference to its document,
    and logs every attribute get/set/delete to stdout.
    """

    def __init__(self, document):
        self.location = { 'href': '', 'hostname': 'www.test.nl' }
        self.Event = {}
        self.document = document
        # appVersion is the user agent without the leading "Mozilla/".
        self.navigator = {'userAgent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36", 'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36'}

    def top(self):
        return self

    def self(self):
        return self

    def WebSocket(self, *args, **kwargs):
        pass

    def ontouchstart(self, *args, **kwargs):
        pass

    def setTimeout(self, *args, **kwargs):
        pass

    def postMessage(self, *args, **kwargs):
        pass

    def pushState(self, *args, **kwargs):
        pass

    def history(self, *args, **kwargs):
        pass

    def __setattr__(self, name, value):
        """Set ``name``, logging the call; an AttributeError is logged and ignored."""
        print ("js_window.__setattr__", name, value)
        try:
            super(js_window, self).__setattr__(name, value)
        except AttributeError:
            print ("__setattr__,AttributeError")

    def __getattr__(self, name):
        """Look ``name`` up via PyV8.JSClass, logging the request and result.

        Raises:
            AttributeError: when PyV8.JSClass does not know ``name``.
        """
        print ("js_window.___getattr__", name)
        # Look the value up exactly once so that a property getter is not
        # triggered a second time just for the log line.  A failed lookup
        # propagates AttributeError, exactly as the earlier unguarded lookup
        # inside the log statement did.
        value = super(js_window, self).__getattr__(name)
        print ("js_window.___getattr__", name, value)
        return value

    def __delattr__(self, name):
        print ("js_window.__delattr__", name)
        super(js_window, self).__delattr__(name)

    def addEventListener(self, *args, **kwargs):
        pass

    def attachEvent(self, *args, **kwargs):
        pass
class js_event(PyV8.JSClass):
    """Stub event object returned to scripts evaluated in PyV8."""

    def __init__(self):
        # Start with an empty prototype mapping.
        empty_proto = {}
        self.__proto__ = empty_proto
class js_document(PyV8.JSClass):
    """Stub ``document`` object for scripts evaluated in PyV8.

    Owns the stub window, a body and a documentElement.  Factory and lookup
    methods hand out fresh stub elements; the remaining methods are no-ops.
    Attribute sets, deletes and failed lookups are logged to stdout.
    """

    def __init__(self):
        self.window = js_window(self)
        self.body = js_dom_element(self)
        self.location = { 'href': '', 'hostname': 'www.test.nl' }
        self.documentElement = js_dom_element(self)
        # appVersion is the user agent without the leading "Mozilla/".
        self.navigator = {'userAgent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36", 'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36'}

    def appendChild(self, *args, **kwargs):
        return None

    def removeChild(self, *args, **kwargs):
        return None

    def getElementById(self, *args, **kwargs):
        """Log the request and return a fresh stub element."""
        print ("getElementById", args, kwargs)
        return js_dom_element(self)

    def attachEvent(self, *args, **kwargs):
        pass

    def createEvent(self, *args, **kwargs):
        return js_event()

    def getElementsByTagName(self, *args, **kwargs):
        """Return a one-element list holding a fresh stub element."""
        return [js_dom_element(self)]

    # Text nodes, comments, fragments and elements are all represented by the
    # same stub element class.
    def createTextNode(self, *args, **kwargs):
        return js_dom_element(self, *args, **kwargs)

    def createComment(self, *args, **kwargs):
        return js_dom_element(self, *args, **kwargs)

    def createDocumentFragment(self, *args, **kwargs):
        return js_dom_element(self, *args, **kwargs)

    def createElement(self, *args, **kwargs):
        return js_dom_element(self, *args, **kwargs)

    def querySelector(self, *args, **kwargs):
        pass

    def evaluate(self, *args, **kwargs):
        pass

    def observe(self, *args, **kwargs):
        pass

    def __setattr__(self, name, value):
        print ("__setattr__", name)
        super(js_document, self).__setattr__(name, value)

    def __getattr__(self, name):
        """Look ``name`` up via PyV8.JSClass; return None when it is unknown."""
        print ("js_document.___getattr__", name)
        try:
            return super(js_document, self).__getattr__(name)
        except AttributeError:
            return None

    def addEventListener(self, *args, **kwargs):
        pass

    def __delattr__(self, name):
        print ("__delattr__", name)
        super(js_document, self).__delattr__(name)
class Global(PyV8.JSClass):  # define a compatible javascript class
    """Global object for a PyV8 context, wired to the stub document and window."""

    def __init__(self):
        self.document = js_document()
        self.window = self.document.window
        # appVersion is the user agent without the leading "Mozilla/".
        self.navigator = {'userAgent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36", 'appVersion': '5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2106.0 Safari/537.36'}
        self.HTMLElement = js_dom_element(self.document)

    def Image(self):
        """Return a fresh stub element bound to this global's document."""
        return js_dom_element(self.document)

    # NOTE(review): ``super(PyV8.JSClass, self)`` starts the lookup *after*
    # PyV8.JSClass in the MRO, so the three hooks below bypass JSClass's own
    # implementations - confirm this is intentional.

    def __setattr__(self, name, value):
        super(PyV8.JSClass, self).__setattr__(name, value)

    def __getattr__(self, name):
        """Resolve an unknown global through the window object.

        A truthy result of the window's ``__getattr__`` hook is returned;
        anything else falls through to the lookup past PyV8.JSClass.

        Raises:
            AttributeError: when the name cannot be resolved.
        """
        # Read ``window`` from the instance dict: going through ``self.window``
        # while it is not set yet would re-enter this method without end.
        window = self.__dict__.get('window')
        if window is not None:
            # Ask the window once; asking twice would repeat its logging and
            # any property getter it triggers.
            value = window.__getattr__(name)
            if value:
                return value
        return super(PyV8.JSClass, self).__getattr__(name)

    def __delattr__(self, name):
        super(PyV8.JSClass, self).__delattr__(name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment