-
-
Save llimllib/7f6143a1a6955d243161b2fec23b8f4d to your computer and use it in GitHub Desktop.
Headless Chrome Protocol Exploration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Start Chrome with:\n", | |
"\n", | |
"```\n", | |
"$ /Applications/Google\\ Chrome\\ Canary.app/Contents/MacOS/Google\\ Chrome\\ Canary \\\n", | |
"> --headless \\\n", | |
"> --remote-debugging-port=9222 \\\n", | |
"> --no-first-run \\\n", | |
"> --disable-gpu \\\n", | |
"> https://adhocteam.us\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import base64\n", | |
"import json\n", | |
"import requests\n", | |
"import websocket" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[{'description': '',\n", | |
" 'devtoolsFrontendUrl': '/devtools/inspector.html?ws=localhost:9222/devtools/page/1a57b07d-ae67-451c-a493-3d980ecad668',\n", | |
" 'id': '1a57b07d-ae67-451c-a493-3d980ecad668',\n", | |
" 'title': 'Better government software. Services that work, for people. - Ad Hoc',\n", | |
" 'type': 'page',\n", | |
" 'url': 'https://adhocteam.us/',\n", | |
" 'webSocketDebuggerUrl': 'ws://localhost:9222/devtools/page/1a57b07d-ae67-451c-a493-3d980ecad668'}]" | |
] | |
}, | |
"execution_count": 51, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"r = requests.get(\"http://localhost:9222/json\")\n", | |
"r.json()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# \"docuumentation\" such as it exists is at: \n", | |
"# https://chromedevtools.github.io/debugger-protocol-viewer/tot/Runtime/\n", | |
"# \n", | |
"# protocol documentation lives in json format at:\n", | |
"# https://chromium.googlesource.com/chromium/src/+/master/third_party/WebKit/Source/core/inspector/browser_protocol.json\n", | |
"# \n", | |
"# the goal will be to parse this and use it to generate our API\n", | |
"# actually that's only half the protocol. The whole thing seems to be available at:\n", | |
"# https://github.com/cyrus-and/chrome-remote-interface/blob/master/lib/protocol.json\n", | |
"# ... I don't know how to get the other half of the protocol from the chrome site\n", | |
"ws = websocket.create_connection(r.json()[0]['webSocketDebuggerUrl'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'id': 0,\n", | |
" 'result': {'result': {'className': 'Array',\n", | |
" 'description': 'Array(57)',\n", | |
" 'objectId': '{\"injectedScriptId\":2,\"id\":3}',\n", | |
" 'subtype': 'array',\n", | |
" 'type': 'object'}}}" | |
] | |
}, | |
"execution_count": 63, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# redirect to https://adhocteam.us/our-team/\n", | |
"nav = json.dumps({\"id\": 0, \"method\": \"Page.navigate\", \"params\": {\"url\": \"https://adhocteam.us/our-team/\"}})\n", | |
"ws.send(nav)\n", | |
"json.loads(ws.recv())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'id': 0,\n", | |
" 'result': {'result': {'type': 'object',\n", | |
" 'value': ['Paul Smith',\n", | |
" 'Greg Gershman',\n", | |
" 'Mike Auclair',\n", | |
" 'Jeff Balboni',\n", | |
" 'Rachel Baliff',\n", | |
" 'Brian Bonenberger',\n", | |
" 'Danny Chapman',\n", | |
" 'Daniel Cloud',\n", | |
" 'Nick Clyde',\n", | |
" 'Alastair Dawson',\n", | |
" 'Sophia Dengo',\n", | |
" 'Eugene Doan',\n", | |
" 'Laura Ellena',\n", | |
" 'Robert Fairhead',\n", | |
" 'Louis Fettet',\n", | |
" 'Oren Fromberg',\n", | |
" 'Chris Gansen',\n", | |
" 'Brian Gryth',\n", | |
" 'Katie Gwinn',\n", | |
" 'Shawna Hein',\n", | |
" 'Aubrey Holland',\n", | |
" 'Alexis James',\n", | |
" 'Carl Johnson',\n", | |
" 'Chris Johnson',\n", | |
" 'Kam Karshenas',\n", | |
" 'James Kassemi',\n", | |
" 'Brian King',\n", | |
" 'Ken Koski',\n", | |
" 'Ben Kutil',\n", | |
" 'Elizabeth Lewis',\n", | |
" 'Lihan Li',\n", | |
" 'Maury Lindo',\n", | |
" 'Austin Martinez',\n", | |
" 'Wryen Meek',\n", | |
" 'Curtis Mejeur',\n", | |
" 'Bill Mill',\n", | |
" 'Michael Miller',\n", | |
" 'Leanna Miller Sharkey',\n", | |
" 'Ryan Nagle',\n", | |
" 'Juliana Neelbauer',\n", | |
" 'Mark Olson',\n", | |
" 'Daniel X. O’Neil',\n", | |
" 'James Rhein',\n", | |
" 'Rachael Roueché',\n", | |
" 'Bill Ryan',\n", | |
" 'Ben Shyong',\n", | |
" 'Graham Smith',\n", | |
" 'Sarah-Jaine Szekeresh',\n", | |
" 'Chris Szeluga',\n", | |
" 'Alex Taylor',\n", | |
" 'Ian Topper',\n", | |
" 'Christopher Valarida',\n", | |
" 'Patrick Vinograd',\n", | |
" 'Kristin Walker',\n", | |
" 'Caitlin Weber',\n", | |
" 'Rob Wilkerson',\n", | |
" 'Mel Woodard']}}}" | |
] | |
}, | |
"execution_count": 69, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# how to execute a script:\n", | |
"cmd = '[].map.call(document.querySelectorAll(\"h3.centered\"), n => n.textContent)'\n", | |
"# if you don't use `returnByValue` here you get a remote object id that I have no idea how to use\n", | |
"ws.send(json.dumps({\"id\": 0, \"method\": \"Runtime.evaluate\", \"params\": {\"expression\": cmd, \"returnByValue\": \"true\"}}))\n", | |
"o = json.loads(ws.recv())\n", | |
"o" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi llimllib, some sites can detect headless Chrome and Firefox and just return a "Forbidden to visit" page!
What should I do?
I just want to automate a simple job: search for keywords, then save the results to CSV.