Skip to content

Instantly share code, notes, and snippets.

@nikolaysm
Last active February 21, 2022 21:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nikolaysm/2c0f0c16084284c8be8f71f83dba71b8 to your computer and use it in GitHub Desktop.
Save nikolaysm/2c0f0c16084284c8be8f71f83dba71b8 to your computer and use it in GitHub Desktop.
#
# This small example shows how to capture JS-initiated (XHR/fetch) requests via Selenium.
# This gives you access to the raw response data for scraping,
# for example on many JS-intensive/React-based websites.
#
import json
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import DesiredCapabilities
# Make Chrome record performance (network) log entries.
# Work on a copy so the shared DesiredCapabilities.CHROME template dict is not
# mutated for every other user of it in this process.
capabilities = DesiredCapabilities.CHROME.copy()
# As specified in the release notes for ChromeDriver 75.0.3770.8,
# capability loggingPrefs has been renamed to goog:loggingPrefs
# https://chromedriver.chromium.org/downloads#h.p_ID_520
# https://chromedriver.storage.googleapis.com/75.0.3770.8/notes.txt
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
# NOTE(review): `desired_capabilities` and `executable_path` are deprecated in
# Selenium 4 -- confirm against the installed Selenium version.
driver = webdriver.Chrome(
    desired_capabilities=capabilities, executable_path="./chromedriver"
)
# Fetch a site that does XHR requests.
driver.get("https://sitewithajaxorsomething.com")
sleep(5)  # wait for the requests to take place
# Extract requests from logs: each raw entry holds a JSON string under
# "message", and the decoded object's own "message" field is the event itself.
logs_raw = driver.get_log("performance")
logs = [json.loads(lr["message"])["message"] for lr in logs_raw]
def log_filter(log_: dict) -> bool:
    """Return True if *log_* is a received network response with a JSON MIME type.

    Args:
        log_: One decoded performance-log event, a dict with a "method" key
            and, for response events, "params" -> "response" -> "mimeType".

    Returns:
        True when the event's method is "Network.responseReceived" and the
        response's mimeType contains "json"; False otherwise, including when
        any of those keys is missing or null.
    """
    # Only actual responses are of interest.
    if log_.get("method") != "Network.responseReceived":
        return False
    response = (log_.get("params") or {}).get("response") or {}
    # `or ""` guards against an explicit null mimeType.
    return "json" in (response.get("mimeType") or "")
# Print the URL and body of every JSON response that was captured.
try:
    for log in filter(log_filter, logs):
        request_id = log["params"]["requestId"]
        resp_url = log["params"]["response"]["url"]
        print(f"Caught {resp_url}")
        try:
            body = driver.execute_cdp_cmd(
                "Network.getResponseBody", {"requestId": request_id}
            )
        except WebDriverException as exc:
            # One failed body lookup should not abort the remaining responses.
            print(f"Could not fetch body for {resp_url}: {exc.msg}")
            continue
        print(body)
finally:
    # Always shut the browser and chromedriver process down, even on error.
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment