Created
June 24, 2015 01:49
-
-
Save johnpauljanecek/3446d11bed47b3b12b27 to your computer and use it in GitHub Desktop.
Ajax Kickstarter Scraping Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from rpyc_docker import Browser,WebDriver\n", | |
"import os.path" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"\"\"\"\n", | |
"basepage.py\n", | |
"used for the examples, other pages should inherit off this page\n", | |
"\"\"\"\n", | |
"\n", | |
"class BasePage(object):\n", | |
"    \"\"\"Base class for the page objects used in the examples.\n", | |
"\n", | |
"    Keeps a browser object and forwards navigation, scrolling, typing and\n", | |
"    clicking to its ``driver``.\n", | |
"    \"\"\"\n", | |
"    #address opened by goto() when it is called without a url\n", | |
"    url = None\n", | |
"    #javascript that installs window.dict_to_array, which turns an object\n", | |
"    #into a list of [key, value] pairs\n", | |
"    js_dict_to_array = \"\"\"\n", | |
"    window.dict_to_array = function(dict) {\n", | |
"        var result = [];\n", | |
"        for(var k in dict) {\n", | |
"            result.push([k,dict[k]]);\n", | |
"        }\n", | |
"        return result;}\n", | |
"    \"\"\"\n", | |
"\n", | |
"    def __init__(self,browser):\n", | |
"        \"\"\"Keep references to ``browser``, its ``driver`` and its ``js_ex`` helper.\"\"\"\n", | |
"        self.browser = browser\n", | |
"        self.driver = browser.driver\n", | |
"        #make nice shortcuts to browser\n", | |
"        self.js_ex = self.browser.js_ex\n", | |
"\n", | |
"    def find_elements_with_text(self,tagName,rePattern):\n", | |
"        \"\"\"Return the ``tagName`` elements whose text content matches ``rePattern``.\n", | |
"\n", | |
"        The pattern is compiled inside the browser with javascript ``RegExp``.\n", | |
"        \"\"\"\n", | |
"        return self.driver.execute_script(\"\"\"\n", | |
"        return (function(tag,pattern) {\n", | |
"            var patt = RegExp(pattern);\n", | |
"            var elms = Array.prototype.slice.call(document.getElementsByTagName(tag));\n", | |
"            return elms.filter(function(elm) {\n", | |
"                return patt.test(elm.textContent);\n", | |
"            })\n", | |
"        })(arguments[0],arguments[1]);\n", | |
"        \"\"\",tagName,rePattern)\n", | |
"\n", | |
"    def scroll_top(self):\n", | |
"        \"\"\"Scroll the window to the top of the page; always returns True.\"\"\"\n", | |
"        self.driver.execute_script(\"window.scrollTo(0,0);\")\n", | |
"        return True\n", | |
"\n", | |
"    def scroll_bottom(self):\n", | |
"        \"\"\"Scroll the window to the bottom of the page; always returns True.\"\"\"\n", | |
"        self.driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n", | |
"        return True\n", | |
"\n", | |
"    def goto(self,url = None):\n", | |
"        \"\"\"Load ``url`` in the browser, falling back to ``self.url``.\n", | |
"\n", | |
"        Raises ValueError when neither of the two is set.\n", | |
"        \"\"\"\n", | |
"        target = url if url else self.url\n", | |
"        if not target:\n", | |
"            #fail here with a clear message instead of handing None to the driver\n", | |
"            raise ValueError(\"no url given and %s.url is not set\" % type(self).__name__)\n", | |
"        self.driver.get(target)\n", | |
"\n", | |
"    def ipython_screenshot(self):\n", | |
"        \"\"\"Return a png screenshot of the current page wrapped in an IPython ``Image``.\"\"\"\n", | |
"        from IPython.display import Image\n", | |
"        img = self.driver.get_screenshot_as_png()\n", | |
"        return Image(data = img)\n", | |
"\n", | |
"    def find_css_input(self,css,value):\n", | |
"        \"\"\"Clear the element matching the selector ``css`` and type ``value`` into it.\"\"\"\n", | |
"        elm = self.driver.find_element_by_css_selector(css)\n", | |
"        elm.clear()\n", | |
"        elm.send_keys(value)\n", | |
"\n", | |
"    def find_css_click(self,css):\n", | |
"        \"\"\"Click the element matching the selector ``css``.\n", | |
"\n", | |
"        Returns True after the click and False when selenium reports the\n", | |
"        element as not visible. Any other selenium error propagates.\n", | |
"        \"\"\"\n", | |
"        #imported here so this cell does not rely on a later cell having put\n", | |
"        #SelEx into the notebook namespace\n", | |
"        import selenium.common.exceptions as SelEx\n", | |
"        try :\n", | |
"            elm = self.driver.find_element_by_css_selector(css)\n", | |
"            elm.click()\n", | |
"            return True\n", | |
"        except SelEx.ElementNotVisibleException:\n", | |
"            return False\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import time\n", | |
"import urlparse\n", | |
"from bs4 import BeautifulSoup\n", | |
"import selenium.common.exceptions as SelEx\n", | |
"\n", | |
"class SearchPage(BasePage):\n", | |
"    \"\"\"Page object for a Kickstarter search page.\n", | |
"\n", | |
"    Result pages are fetched as json by issuing an XMLHttpRequest from inside\n", | |
"    the loaded page; the parsed response is then read back into python.\n", | |
"    \"\"\"\n", | |
"    #template for the json result pages, filled with the page number\n", | |
"    #woe_id is the location identifier in this case 23424977 for USA\n", | |
"    next_page_url = 'https://www.kickstarter.com/discover/categories/12?page=%d&sort=popularity&term=card+games&woe_id=23424977'\n", | |
"\n", | |
"    def __init__(self,browser,searchUrl):\n", | |
"        \"\"\"Bind the page to ``browser`` and use ``searchUrl`` as its url.\"\"\"\n", | |
"        BasePage.__init__(self,browser)\n", | |
"        self.url = searchUrl\n", | |
"\n", | |
"    def goto(self):\n", | |
"        \"\"\"Load the search url given to the constructor.\"\"\"\n", | |
"        BasePage.goto(self)\n", | |
"\n", | |
"    def do_ajax_results_request(self,url):\n", | |
"        \"\"\"Start an asynchronous GET for ``url`` from inside the current page.\n", | |
"\n", | |
"        The request asks for json and carries the page's csrf token when the\n", | |
"        page has one. ``window._jsonResult`` is reset to null first and is set\n", | |
"        to the parsed response once a 200 reply arrives. Returns the value of\n", | |
"        the script (true) as soon as the request has been sent; read the\n", | |
"        response with get_ajax_result().\n", | |
"        \"\"\"\n", | |
"        js = \"\"\"\n", | |
"        var url = arguments[0];\n", | |
"        window._jsonResult = null;\n", | |
"        var meta = document.querySelector('meta[name = \"csrf-token\"]');\n", | |
"        var token = meta ? meta.getAttribute(\"content\") : null;\n", | |
"\n", | |
"        var xmlhttp = new XMLHttpRequest();\n", | |
"\n", | |
"        xmlhttp.onreadystatechange = function() {\n", | |
"            if (xmlhttp.readyState == 4 && xmlhttp.status == 200) {\n", | |
"                window._jsonResult = JSON.parse(xmlhttp.responseText);\n", | |
"            }\n", | |
"        };\n", | |
"\n", | |
"        xmlhttp.open(\"GET\", url, true);\n", | |
"        if (token) {\n", | |
"            xmlhttp.setRequestHeader(\"X-CSRF-Token\",token);\n", | |
"        }\n", | |
"        xmlhttp.setRequestHeader(\"X-Requested-With\",\"XMLHttpRequest\");\n", | |
"        xmlhttp.setRequestHeader(\"Accept\",\"application/json, text/javascript, */*; q=0.01\");\n", | |
"        xmlhttp.send();\n", | |
"\n", | |
"        return true;\n", | |
"        \"\"\"\n", | |
"        return self.driver.execute_script(js,url)\n", | |
"\n", | |
"    def get_ajax_result(self,timeout = 10.0,interval = 0.1):\n", | |
"        \"\"\"Return the response stored by the last do_ajax_results_request().\n", | |
"\n", | |
"        The request is asynchronous, so ``window._jsonResult`` is polled every\n", | |
"        ``interval`` seconds until it is set or ``timeout`` seconds have passed.\n", | |
"        Returns None when nothing arrived in time; ``timeout = 0`` checks\n", | |
"        exactly once.\n", | |
"        \"\"\"\n", | |
"        deadline = time.time() + timeout\n", | |
"        while True:\n", | |
"            #the result when passed back to python will be converted to a python dict automatically\n", | |
"            result = self.js_ex(\"return window._jsonResult\")\n", | |
"            if result is not None or time.time() >= deadline:\n", | |
"                return result\n", | |
"            time.sleep(interval)\n", | |
"\n", | |
"    def do_next_request(self,pageNum):\n", | |
"        \"\"\"Request results page number ``pageNum``; always returns True.\"\"\"\n", | |
"        self.do_ajax_results_request(self.next_page_url % pageNum)\n", | |
"        return True" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"INFO:rpyc_docker:def driver_firefox(self):\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"browser = Browser()\n", | |
"browser.setup(visible = True, driver = \"firefox\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"searchUrl = \"https://www.kickstarter.com/discover/advanced?term=card+games&category_id=12&woe_id=23424977&sort=popularity\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"searchPage = SearchPage(browser,searchUrl)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"searchPage.goto()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 52, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"searchPage.do_next_request(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[u'total_hits', u'seed', u'colloquial_title', u'projects', u'see_more']" | |
] | |
}, | |
"execution_count": 53, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"projectResults = searchPage.get_ajax_result()\n", | |
"projectResults.keys()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"u'Pillars of Eternity: Lords of the Eastern Reach Card Game'" | |
] | |
}, | |
"execution_count": 54, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"projectResults['projects'][0][\"name\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"u'Build cities, raise armies, defeat your enemies in this one to four player card game based in the world of Pillars of Eternity.'" | |
] | |
}, | |
"execution_count": 55, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"projectResults['projects'][0][\"blurb\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"2480" | |
] | |
}, | |
"execution_count": 56, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"projectResults['projects'][0][\"backers_count\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"157880.5" | |
] | |
}, | |
"execution_count": 57, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"projectResults['projects'][0][\"pledged\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 58, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"browser.teardown()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Running headless in a docker container" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 60, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from docker import Client\n", | |
"docker = Client(base_url='unix://var/run/docker.sock')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from rpyc_docker.rpyc_browser_worker import BrowserRpycWorker" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"INFO:worker 1:RpycWorker __init__\n" | |
] | |
} | |
], | |
"source": [ | |
"#directory handed to the worker as its mount; resolved from the current\n", | |
"#user's home instead of a hard-coded /home/<user> path\n", | |
"mount_dir = os.path.expanduser(\"~/Development\")\n", | |
"worker = BrowserRpycWorker(docker,mount = mount_dir)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"worker.create_container()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"worker.conn is an rpyc connection instance inside the docker container" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 63, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"worker.connect_rpyc()\n", | |
"worker.conn.modules.sys.path.insert(0,\"/Development/python/rpyc_docker\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 65, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"worker.setup_browser(driver = \"firefox\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"worker.browser is an rpyc instance of browser running inside the docker container" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"searchPage = SearchPage(worker.browser,searchUrl)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"searchPage.goto()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 70, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"searchPage.do_next_request(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[u'total_hits', u'seed', u'colloquial_title', u'projects', u'see_more']" | |
] | |
}, | |
"execution_count": 71, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"projectResults = searchPage.get_ajax_result()\n", | |
"projectResults.keys()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"u'Pillars of Eternity: Lords of the Eastern Reach Card Game'" | |
] | |
}, | |
"execution_count": 72, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"projectResults['projects'][0][\"name\"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 73, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"u'Build cities, raise armies, defeat your enemies in this one to four player card game based in the world of Pillars of Eternity.'" | |
] | |
}, | |
"execution_count": 73, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"projectResults['projects'][0][\"blurb\"]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Don't forget to tear down the docker container once you are done with it.\n", | |
"Multiple docker containers can be run to create a grid of headless browsers." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"worker.teardown()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment