Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johnpauljanecek/3446d11bed47b3b12b27 to your computer and use it in GitHub Desktop.
Save johnpauljanecek/3446d11bed47b3b12b27 to your computer and use it in GitHub Desktop.
Ajax Kickstarter Scraping Example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from rpyc_docker import Browser,WebDriver\n",
"import os.path"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\"\"\"\n",
"basepage.py\n",
"Used for the examples; other pages should inherit from this page.\n",
"\"\"\"\n",
"\n",
"class BasePage(object):\n",
" url = None\n",
" js_dict_to_array = \"\"\"\n",
" window.dict_to_array = function(dict) {\n",
" var result = [];\n",
" for(var k in dict) {\n",
" result.push([k,dict[k]]);\n",
" }\n",
" return result;}\n",
" \"\"\"\n",
" def __init__(self,browser):\n",
" self.browser = browser\n",
" self.driver = browser.driver\n",
" #make nice shortcuts to browser\n",
" self.js_ex = self.browser.js_ex\n",
" \n",
" def find_elements_with_text(self,tagName,rePattern):\n",
" return self.driver.execute_script(\"\"\"\n",
" return (function(tag,pattern) {\n",
" var patt = RegExp(pattern);\n",
" var elms = Array.prototype.slice.call(document.getElementsByTagName(tag));\n",
" return elms.filter(function(elm) {\n",
" return patt.test(elm.textContent);\n",
" })\n",
" })(arguments[0],arguments[1]);\n",
" \"\"\",tagName,rePattern)\n",
"\n",
" def scroll_top(self):\n",
" self.driver.execute_script(\"window.scrollTo(0,0);\")\n",
" return True\n",
"\n",
" def scroll_bottom(self):\n",
" self.driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
" return True\n",
"\n",
" def goto(self,url = None):\n",
" if url :\n",
" self.driver.get(url)\n",
" else:\n",
" self.driver.get(self.url)\n",
"\n",
" def ipython_screenshot(self):\n",
" from IPython.display import Image\n",
" img = self.driver.get_screenshot_as_png()\n",
" return Image(data = img)\n",
"\n",
" def find_css_input(self,css,value):\n",
" elm = self.driver.find_element_by_css_selector(css)\n",
" elm.clear()\n",
" elm.send_keys(value)\n",
" \n",
" def find_css_click(self,css):\n",
" try :\n",
" elm = self.driver.find_element_by_css_selector(css)\n",
" elm.click()\n",
" return True\n",
" except SelEx.ElementNotVisibleException:\n",
" return False\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import urlparse\n",
"from bs4 import BeautifulSoup\n",
"import selenium.common.exceptions as SelEx\n",
"\n",
"class SearchPage(BasePage):\n",
" \n",
" def __init__(self,browser,searchUrl):\n",
" BasePage.__init__(self,browser)\n",
" self.url = searchUrl\n",
" \n",
" \n",
" def goto(self):\n",
" BasePage.goto(self)\n",
" \n",
" def do_ajax_results_request(self,url):\n",
" js = \"\"\"\n",
" var url = arguments[0];\n",
" window._jsonResult = null;\n",
" var token = document.querySelector('meta[name = \"csrf-token\"]').getAttribute(\"content\");\n",
" \n",
" var xmlhttp = new XMLHttpRequest();\n",
" \n",
" xmlhttp.onreadystatechange = function() {\n",
" if (xmlhttp.readyState == 4 && xmlhttp.status == 200) {\n",
" window._jsonResult = jsonResult = JSON.parse(xmlhttp.responseText);\n",
" }}\n",
"\n",
" xmlhttp.open(\"GET\", url, true);\n",
" xmlhttp.setRequestHeader(\"X-CSRF-Token\",token);\n",
" xmlhttp.setRequestHeader(\"X-Requested-With\",\"XMLHttpRequest\")\n",
" xmlhttp.setRequestHeader(\"Accept\",\"application/json, text/javascript, */*; q=0.01\")\n",
" xmlhttp.send();\n",
"\n",
" return true;;\n",
" \"\"\"\n",
" return self.driver.execute_script(js,url)\n",
" \n",
" def get_ajax_result(self):\n",
" # the result, when passed back to Python, is converted to a Python dict automatically\n",
" return self.js_ex(\"return window._jsonResult\")\n",
" \n",
" def do_next_request(self,pageNum):\n",
" # woe_id is the location identifier; in this case 23424977 is the USA\n",
" nextPageUrl = 'https://www.kickstarter.com/discover/categories/12?page=%d&sort=popularity&term=card+games&woe_id=23424977'\n",
" self.do_ajax_results_request(nextPageUrl % pageNum)\n",
" return True"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:rpyc_docker:def driver_firefox(self):\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"browser = Browser()\n",
"browser.setup(visible = True, driver = \"firefox\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"searchUrl = \"https://www.kickstarter.com/discover/advanced?term=card+games&category_id=12&woe_id=23424977&sort=popularity\""
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"searchPage = SearchPage(browser,searchUrl)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"searchPage.goto()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"searchPage.do_next_request(1)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[u'total_hits', u'seed', u'colloquial_title', u'projects', u'see_more']"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projectResults = searchPage.get_ajax_result()\n",
"projectResults.keys()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u'Pillars of Eternity: Lords of the Eastern Reach Card Game'"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projectResults['projects'][0][\"name\"]"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u'Build cities, raise armies, defeat your enemies in this one to four player card game based in the world of Pillars of Eternity.'"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projectResults['projects'][0][\"blurb\"]"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2480"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projectResults['projects'][0][\"backers_count\"]"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"157880.5"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projectResults['projects'][0][\"pledged\"]"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"browser.teardown()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Running headless in a Docker container"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from docker import Client\n",
"docker = Client(base_url='unix://var/run/docker.sock')"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from rpyc_docker.rpyc_browser_worker import BrowserRpycWorker"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:worker 1:RpycWorker __init__\n"
]
}
],
"source": [
"worker = BrowserRpycWorker(docker,mount = \"/home/john/Development\")"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"worker.create_container()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`worker.conn` is an rpyc connection instance inside the Docker container."
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"worker.connect_rpyc()\n",
"worker.conn.modules.sys.path.insert(0,\"/Development/python/rpyc_docker\")"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"worker.setup_browser(driver = \"firefox\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`worker.browser` is an rpyc instance of the browser running inside the Docker container."
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"searchPage = SearchPage(worker.browser,searchUrl)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"searchPage.goto()"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"searchPage.do_next_request(1)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[u'total_hits', u'seed', u'colloquial_title', u'projects', u'see_more']"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projectResults = searchPage.get_ajax_result()\n",
"projectResults.keys()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u'Pillars of Eternity: Lords of the Eastern Reach Card Game'"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projectResults['projects'][0][\"name\"]"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u'Build cities, raise armies, defeat your enemies in this one to four player card game based in the world of Pillars of Eternity.'"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"projectResults['projects'][0][\"blurb\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Don't forget to tear down the Docker container once you are done with it.\n",
"Multiple Docker containers can be run to create a grid of headless browsers."
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"worker.teardown()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment