Skip to content

Instantly share code, notes, and snippets.

@llimllib
Last active February 7, 2019 17:14
Show Gist options
  • Star 22 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save llimllib/7f6143a1a6955d243161b2fec23b8f4d to your computer and use it in GitHub Desktop.
Save llimllib/7f6143a1a6955d243161b2fec23b8f4d to your computer and use it in GitHub Desktop.
Headless Chrome Protocol Exploration
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Start Chrome with:\n",
"\n",
"```\n",
"$ /Applications/Google\\ Chrome\\ Canary.app/Contents/MacOS/Google\\ Chrome\\ Canary \\\n",
"> --headless \\\n",
"> --remote-debugging-port=9222 \\\n",
"> --no-first-run \\\n",
"> --disable-gpu \\\n",
"> https://adhocteam.us\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import base64\n",
"import json\n",
"import requests\n",
"import websocket"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'description': '',\n",
" 'devtoolsFrontendUrl': '/devtools/inspector.html?ws=localhost:9222/devtools/page/1a57b07d-ae67-451c-a493-3d980ecad668',\n",
" 'id': '1a57b07d-ae67-451c-a493-3d980ecad668',\n",
" 'title': 'Better government software. Services that work, for people. - Ad Hoc',\n",
" 'type': 'page',\n",
" 'url': 'https://adhocteam.us/',\n",
" 'webSocketDebuggerUrl': 'ws://localhost:9222/devtools/page/1a57b07d-ae67-451c-a493-3d980ecad668'}]"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r = requests.get(\"http://localhost:9222/json\")\n",
"r.json()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"# \"docuumentation\" such as it exists is at: \n",
"# https://chromedevtools.github.io/debugger-protocol-viewer/tot/Runtime/\n",
"# \n",
"# protocol documentation lives in json format at:\n",
"# https://chromium.googlesource.com/chromium/src/+/master/third_party/WebKit/Source/core/inspector/browser_protocol.json\n",
"# \n",
"# the goal will be to parse this and use it to generate our API\n",
"# actually that's only half the protocol. The whole thing seems to be available at:\n",
"# https://github.com/cyrus-and/chrome-remote-interface/blob/master/lib/protocol.json\n",
"# ... I don't know how to get the other half of the protocol from the chrome site\n",
"ws = websocket.create_connection(r.json()[0]['webSocketDebuggerUrl'])"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': 0,\n",
" 'result': {'result': {'className': 'Array',\n",
" 'description': 'Array(57)',\n",
" 'objectId': '{\"injectedScriptId\":2,\"id\":3}',\n",
" 'subtype': 'array',\n",
" 'type': 'object'}}}"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# redirect to https://adhocteam.us/our-team/\n",
"nav = json.dumps({\"id\": 0, \"method\": \"Page.navigate\", \"params\": {\"url\": \"https://adhocteam.us/our-team/\"}})\n",
"ws.send(nav)\n",
"json.loads(ws.recv())"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': 0,\n",
" 'result': {'result': {'type': 'object',\n",
" 'value': ['Paul Smith',\n",
" 'Greg Gershman',\n",
" 'Mike Auclair',\n",
" 'Jeff Balboni',\n",
" 'Rachel Baliff',\n",
" 'Brian Bonenberger',\n",
" 'Danny Chapman',\n",
" 'Daniel Cloud',\n",
" 'Nick Clyde',\n",
" 'Alastair Dawson',\n",
" 'Sophia Dengo',\n",
" 'Eugene Doan',\n",
" 'Laura Ellena',\n",
" 'Robert Fairhead',\n",
" 'Louis Fettet',\n",
" 'Oren Fromberg',\n",
" 'Chris Gansen',\n",
" 'Brian Gryth',\n",
" 'Katie Gwinn',\n",
" 'Shawna Hein',\n",
" 'Aubrey Holland',\n",
" 'Alexis James',\n",
" 'Carl Johnson',\n",
" 'Chris Johnson',\n",
" 'Kam Karshenas',\n",
" 'James Kassemi',\n",
" 'Brian King',\n",
" 'Ken Koski',\n",
" 'Ben Kutil',\n",
" 'Elizabeth Lewis',\n",
" 'Lihan Li',\n",
" 'Maury Lindo',\n",
" 'Austin Martinez',\n",
" 'Wryen Meek',\n",
" 'Curtis Mejeur',\n",
" 'Bill Mill',\n",
" 'Michael Miller',\n",
" 'Leanna Miller Sharkey',\n",
" 'Ryan Nagle',\n",
" 'Juliana Neelbauer',\n",
" 'Mark Olson',\n",
" 'Daniel X. O’Neil',\n",
" 'James Rhein',\n",
" 'Rachael Roueché',\n",
" 'Bill Ryan',\n",
" 'Ben Shyong',\n",
" 'Graham Smith',\n",
" 'Sarah-Jaine Szekeresh',\n",
" 'Chris Szeluga',\n",
" 'Alex Taylor',\n",
" 'Ian Topper',\n",
" 'Christopher Valarida',\n",
" 'Patrick Vinograd',\n",
" 'Kristin Walker',\n",
" 'Caitlin Weber',\n",
" 'Rob Wilkerson',\n",
" 'Mel Woodard']}}}"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# how to execute a script:\n",
"cmd = '[].map.call(document.querySelectorAll(\"h3.centered\"), n => n.textContent)'\n",
"# if you don't use `returnByValue` here you get a remote object id that I have no idea how to use\n",
"ws.send(json.dumps({\"id\": 0, \"method\": \"Runtime.evaluate\", \"params\": {\"expression\": cmd, \"returnByValue\": \"true\"}}))\n",
"o = json.loads(ws.recv())\n",
"o"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@Yensan
Copy link

Yensan commented Feb 7, 2019

Hi llimllib, some sites can detect headless Chrome and Firefox and just return a "Forbidden to visit" page!
What should I do?
I just want to automate a simple job: search for keywords, then save the results to a CSV file.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment