Skip to content

Instantly share code, notes, and snippets.

@rengler33
Last active June 1, 2024 00:23
Show Gist options
  • Save rengler33/f8b9d3f26a518c08a414f6f86109863c to your computer and use it in GitHub Desktop.
Save rengler33/f8b9d3f26a518c08a414f6f86109863c to your computer and use it in GitHub Desktop.
How to Capture Network Traffic When Scraping with Selenium & Python
# see rkengler.com for related blog post
# https://www.rkengler.com/how-to-capture-network-traffic-when-scraping-with-selenium-and-python/
import json
import pprint
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# Work on a copy: DesiredCapabilities.CHROME is a class-level dict shared by
# everything that uses the class, so mutating it in place would leak the
# logging preference into any other driver created in this process.
capabilities = DesiredCapabilities.CHROME.copy()
# capabilities["loggingPrefs"] = {"performance": "ALL"}  # chromedriver < ~75
# Request every entry of the "performance" log; that log is what is read back
# later via driver.get_log("performance").
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}  # chromedriver 75+
driver = webdriver.Chrome(
    r"chromedriver.exe",
    desired_capabilities=capabilities,
)
def process_browser_logs_for_network_events(logs):
    """
    Yield only the network-related events from browser performance logs.

    Each item of ``logs`` must be a mapping whose ``"message"`` value is a
    JSON string; the decoded object carries the actual event under its own
    ``"message"`` key. An event is kept when its ``"method"`` starts with
    "Network.response", "Network.request", or "Network.webSocket".

    :param logs: iterable of log entries, e.g. the result of
        ``driver.get_log("performance")``.
    :return: generator of the decoded event dicts, in input order.
    :raises KeyError: if an entry or decoded event lacks an expected key.
    :raises json.JSONDecodeError: if an entry's message is not valid JSON.
    """
    # str.startswith accepts a tuple, so one call covers all three prefixes
    # and matches the "starts with" contract stated above.
    network_prefixes = ("Network.response", "Network.request", "Network.webSocket")
    for entry in logs:
        log = json.loads(entry["message"])["message"]
        if log["method"].startswith(network_prefixes):
            yield log
# Load the page, pull the performance log, and dump the network events to disk.
try:
    driver.get("https://www.rkengler.com")
    logs = driver.get_log("performance")
    events = process_browser_logs_for_network_events(logs)
    # Explicit UTF-8 so non-ASCII text in an event cannot fail on platforms
    # whose default encoding is narrower.
    with open("log_entries.txt", "wt", encoding="utf-8") as out:
        for event in events:
            pprint.pprint(event, stream=out)
finally:
    # Always shut the browser and chromedriver down, even if a step above
    # raised; otherwise the processes are left running.
    driver.quit()
@YnotY2
Copy link

YnotY2 commented Jun 1, 2024

This still works great with the newest versions of Selenium and ChromeDriver. It took a bit of tweaking, but it worked out in the end. Here are the functions I wrote and use when web scraping:

-Function to initialize the driver_instance and return it:

def initialize_chrome_driver_instance(profile_id=None):
    """
    Create a Chrome WebDriver with performance (network) logging enabled.

    The driver is configured to attach to a browser through the
    ``debuggerAddress`` experimental option, requests the full "performance"
    log, and has the DevTools ``Network`` domain enabled before it is returned.

    :param profile_id: currently unused by this function; it defaults to
        ``None`` so the function can also be called without arguments.
    :return: the configured ``webdriver.Chrome`` instance.
    """
    logger.info(f"{Colors.CYAN}Initializing{Colors.END}{Colors.YELLOW} chrome-driver{Colors.END} {Colors.CYAN}   'webdriver_instance' for web-automation via chrome...{Colors.END}")

    # Full path to the chromedriver executable.
    chrome_driver_path = "/home/software/chromedriver-linux64/chromedriver"
    service = Service(executable_path=chrome_driver_path)

    chrome_options = webdriver.ChromeOptions()
    # No headless setting is applied: the `Options.headless` property was
    # deprecated in Selenium 4.8 and later removed, and headed mode is the
    # default, so assigning False to it had no effect.
    # NOTE(review): `debugger_address` is not defined in this function;
    # presumably it is a module-level "host:port" string - confirm.
    chrome_options.add_experimental_option("debuggerAddress", debugger_address)

    # Needs to be added for network logging.
    chrome_options.add_argument("--auto-open-devtools-for-tabs")

    # Request every entry of the "performance" log (CDP network logging).
    chrome_options.set_capability("goog:loggingPrefs", {"performance": "ALL"})

    driver_instance = webdriver.Chrome(service=service, options=chrome_options)

    # Enable the DevTools Network domain.
    driver_instance.execute_cdp_cmd('Network.enable', {})

    logger.info(f"{Colors.GREEN}Successfully initialized webdriver_instance:{Colors.END}")
    logger.info(f"{Colors.MAGENTA} {driver_instance} {Colors.END}")
    logger.info(f"{Colors.BLUE}Returning;{Colors.END} {Colors.YELLOW}driver_instance{Colors.END}")
    logger.info(f"{Colors.CYAN}-{Colors.END}{Colors.CYAN} driver_instance:{Colors.END} {Colors.YELLOW}{driver_instance}{Colors.END}")

    # Print a blank line to the terminal to separate this output from what follows.
    print("")
    return driver_instance

if __name__ == "__main__":
    # `profile_id` is passed explicitly: the function body never reads it,
    # so None serves as a placeholder value.
    initialize_chrome_driver_instance(profile_id=None)
# Imports required by the webdriver initialization code above
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

-Function to capture traffic from specified url:

def intercept_traffic_check_log_auth(driver_instance, *, url="https://example.com", wait_seconds=5):
    """
    Load a page and return the browser's "performance" log entries.

    :param driver_instance: object exposing ``get(url)`` and ``get_log(name)``,
        e.g. a Selenium WebDriver created with performance logging enabled.
    :param url: page to navigate to before collecting the log.
    :param wait_seconds: how long to pause after navigation so traffic has
        time to load; values of 0 or less skip the pause.
    :return: whatever ``driver_instance.get_log("performance")`` returns.
    """
    driver_instance.get(url)

    # Allow some time for traffic to load before reading the log.
    if wait_seconds > 0:
        time.sleep(wait_seconds)

    # Return all captured traffic unfiltered; the caller decides what to keep.
    return driver_instance.get_log("performance")

if __name__ == "__main__":
    # The capture function requires a driver, so build one first and pass it
    # in; calling it with no argument raises TypeError.
    intercept_traffic_check_log_auth(initialize_chrome_driver_instance(profile_id=None))

Thanks so much to the author of this post I found on Google: "https://www.rkengler.com/how-to-capture-network-traffic-when-scraping-with-selenium-and-python/" Amazing!

Have a blessed day ^_^

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment