-
-
Save rengler33/f8b9d3f26a518c08a414f6f86109863c to your computer and use it in GitHub Desktop.
# See rkengler.com for the related blog post:
# https://www.rkengler.com/how-to-capture-network-traffic-when-scraping-with-selenium-and-python/
import json | |
import pprint | |
from selenium import webdriver | |
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | |
# Work on a copy: DesiredCapabilities.CHROME is a dict shared through the
# class, so mutating it in place would leak the logging preference into every
# other driver built from it in this process.
capabilities = DesiredCapabilities.CHROME.copy()
# Ask chromedriver to collect the "performance" log, which is where the
# network events read later in this script come from.
# NOTE: chromedriver older than ~75 expects the un-prefixed key instead:
#     capabilities["loggingPrefs"] = {"performance": "ALL"}
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}  # chromedriver 75+
driver = webdriver.Chrome(
    r"chromedriver.exe",
    desired_capabilities=capabilities,
)
def process_browser_logs_for_network_events(logs):
    """
    Yield only the logs whose method starts with "Network.response",
    "Network.request", or "Network.webSocket", since we're interested in the
    network events specifically.

    Each entry in ``logs`` must be a mapping whose "message" value is a JSON
    string; the decoded document carries the actual event under a second
    "message" key, and that inner event dict is what gets yielded.

    Raises KeyError if an entry or decoded event lacks "message"/"method",
    and json.JSONDecodeError if the "message" string is not valid JSON.
    """
    network_prefixes = (
        "Network.response",
        "Network.request",
        "Network.webSocket",
    )
    for entry in logs:
        log = json.loads(entry["message"])["message"]
        # A prefix test (not a substring test) so the filter matches the
        # documented contract; startswith accepts a tuple of alternatives.
        if log["method"].startswith(network_prefixes):
            yield log
# Load the page and pull the performance log, then always shut the browser
# down: without quit() the Chrome and chromedriver processes outlive the
# script, even when the page load or log fetch raises.
try:
    driver.get("https://www.rkengler.com")
    logs = driver.get_log("performance")
finally:
    driver.quit()

events = process_browser_logs_for_network_events(logs)
# Explicit UTF-8 so non-ASCII text in the events cannot fail on platforms
# whose default file encoding is narrower (e.g. cp1252 on Windows).
with open("log_entries.txt", "wt", encoding="utf-8") as out:
    for event in events:
        pprint.pprint(event, stream=out)
So you want the script extended for capturing HAR files because you are exporting your web traffic?
good!!!
How can I make it capture only a specific request URL?
Use this to get the bearer token:
# Scan the performance log and print the token from the first entry that
# mentions "Bearer".
logs = driver.get_log("performance")
for entry in logs:
    if "Bearer" not in str(entry["message"]):
        continue
    # The token is taken positionally: the fourth whitespace-separated chunk
    # of the raw message, cut at the first double quote.
    fourth_chunk = entry["message"].split()[3]
    token = fourth_chunk.split('"')[0]
    print(token)
    break
use this to get bearer token logs = driver.get_log("performance") for entry in logs: if "Bearer" in str(entry["message"]): token = (entry["message"].split()[3]).split('"')[0] print(token) break
Maybe my answer is a bit late, but I ran into a problem getting the token this way when using headless Chromium.
The logs are in json format, so we can use a solution like this:
# Print the Authorization header of the first logged request that carries one.
logs = browser.get_log("performance")
for entry in logs:
    if "Bearer" not in str(entry["message"]):
        continue
    json_message_data = json.loads(str(entry["message"]))
    try:
        authorization_json = json_message_data['message']['params']['request']['headers']['Authorization']
    except (KeyError, TypeError):
        # "Bearer" appeared somewhere else in this entry (for example in an
        # event that has no params.request.headers.Authorization path), so
        # keep scanning instead of crashing on the first such entry.
        continue
    print(authorization_json)
    break
Result will be: Bearer xxxxxxx
Where is the response? I can't find it for any of the requests.
If anyone needs this for Kotlin
// Build Chrome options with the "performance" log type set to "ALL".
val options = ChromeOptions()
options.setCapability(ChromeOptions.LOGGING_PREFS, mapOf("performance" to "ALL"))
// Start the browser with those options.
val driver = ChromeDriver(options)
/*
Do some stuff here .... get post ..
*/
// Fetch the driver's log collection, then read the "performance" entries from it.
val logs: Logs = driver.manage().logs()
val performance = logs.get("performance")
This code won't work on the latest Selenium. Try selenium==4.9.1.
Hi @masummuhammad I found a way with selenium >4 . Slightly different but the spirit is the same.
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
# Service is created with no arguments, i.e. no explicit chromedriver path.
service = Service()

# Chrome options carrying the two settings needed for network capture:
# the perfLoggingPrefs experimental option and the performance log level.
options = webdriver.ChromeOptions()
options.add_experimental_option("perfLoggingPrefs", {"enableNetwork": True})
options.set_capability("goog:loggingPrefs", {"performance": "ALL"})

driver = webdriver.Chrome(service=service, options=options)
This still works great with the newest versions of Selenium and ChromeDriver. It took a bit of tweaking, but it worked out in the end. Here are the functions I wrote and use when web scraping:
-Function to initialize the driver_instance and return it:
def initialize_chrome_driver_instance(profile_id=None):
    """
    Create and return a Chrome WebDriver with performance logging enabled.

    The driver is started from a fixed chromedriver path, pointed at the
    browser given by ``debugger_address``, configured with the
    "goog:loggingPrefs" performance capability, and has the CDP
    ``Network.enable`` command issued before it is returned.

    Args:
        profile_id: Not used by this function. It defaults to None so the
            function can be called without arguments, as the script entry
            point in this file does.

    Returns:
        The configured ``webdriver.Chrome`` instance.

    NOTE(review): ``debugger_address`` is not defined in this function; it
    must exist at module level before the call, otherwise this raises
    NameError. Confirm where it is meant to come from.
    """
    logger.info(f"{Colors.CYAN}Initializing{Colors.END}{Colors.YELLOW} chrome-driver{Colors.END} {Colors.CYAN} 'webdriver_instance' for web-automation via chrome...{Colors.END}")

    # Create a WebDriver instance.
    # Here we specify the full path to chromedriver.
    chrome_driver_path = "/home/software/chromedriver-linux64/chromedriver"
    service = Service(executable_path=chrome_driver_path)

    chrome_options = webdriver.ChromeOptions()
    chrome_options.headless = False  # Set to False if you want to see the browser while running
    chrome_options.add_experimental_option("debuggerAddress", debugger_address)
    # Needs to be added for network logging.
    chrome_options.add_argument("--auto-open-devtools-for-tabs")
    # Add this for CDP network logging.
    chrome_options.set_capability("goog:loggingPrefs", {"performance": "ALL"})

    driver_instance = webdriver.Chrome(service=service, options=chrome_options)
    # Enable network logging.
    driver_instance.execute_cdp_cmd('Network.enable', {})

    logger.info(f"{Colors.GREEN}Successfully initialized webdriver_instance:{Colors.END}")
    logger.info(f"{Colors.MAGENTA} {driver_instance} {Colors.END}")
    logger.info(f"{Colors.BLUE}Returning;{Colors.END} {Colors.YELLOW}driver_instance{Colors.END}")
    logger.info(f"{Colors.CYAN}-{Colors.END}{Colors.CYAN} driver_instance:{Colors.END} {Colors.YELLOW}{driver_instance}{Colors.END}")
    # Print a blank line to the terminal.
    print("")
    return driver_instance
if __name__ == "__main__":
    # profile_id is a required parameter of the initializer but is not read
    # by its body, so pass None explicitly rather than omitting it (a bare
    # call raises TypeError for the missing argument).
    initialize_chrome_driver_instance(profile_id=None)
# Use these imports for the above code of initializing webdriver
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
-Function to capture traffic from specified url:
def intercept_traffic_check_log_auth(driver_instance, url="https://example.com", wait_seconds=5):
    """
    Load a page and return the driver's "performance" log entries.

    Args:
        driver_instance: Object exposing ``get(url)`` and ``get_log(name)``,
            e.g. a Chrome WebDriver with performance logging enabled.
        url: Page to load. Defaults to the previously hard-coded
            "https://example.com".
        wait_seconds: How long to pause after navigation so traffic has time
            to load. Defaults to the previously hard-coded 5 seconds.

    Returns:
        Whatever ``driver_instance.get_log("performance")`` returns, i.e. all
        of the captured traffic as one object.
    """
    driver_instance.get(url)

    # Allow some time for traffic to load.
    time.sleep(wait_seconds)

    # Here we return all the traffic we captured as an object.
    intercepted_traffic_object = driver_instance.get_log("performance")
    return intercepted_traffic_object
if __name__ == "__main__":
    # The capture function requires a live driver; calling it with no
    # argument raises TypeError. Build one with the initializer first
    # (its profile_id argument is not read by its body, so None suffices).
    driver_instance = initialize_chrome_driver_instance(None)
    intercept_traffic_check_log_auth(driver_instance)
Thanks so much to the author of this post I found on Google: "https://www.rkengler.com/how-to-capture-network-traffic-when-scraping-with-selenium-and-python/" Amazing!
Have a blessed day ^_^
Hi, this tool is cool and i would like to see this extended for capturing HAR files if possible.
Thanks in advance,
Naveen