Skip to content

Instantly share code, notes, and snippets.

@weiglemc
Created January 18, 2023 21:36
Show Gist options
  • Save weiglemc/81c7befb1ee35c405105036c5632ff82 to your computer and use it in GitHub Desktop.
Save weiglemc/81c7befb1ee35c405105036c5632ff82 to your computer and use it in GitHub Desktop.
Python script using selenium-wire to render a webpage and capture specific requests that it generates
# run the script on a set of URI-Ms:
# python3 capture-requests.py < to-request.txt >> requests-log.txt
# process the results and generate a new list of URI-Ms that were requested:
# awk '{if ($1 ~ /cnn\.com(:80)?[\/]+$/ && $2 == "200") print $0}' requests-log.txt | sort -t '/' -k 5 >! requests.txt
# https://pypi.org/project/selenium-wire/#installation
import sys
import time
from seleniumwire import webdriver # Import from seleniumwire
def interceptor(request):
    """Abort requests for images and web fonts so they are never fetched.

    Meant to be installed as the driver's ``request_interceptor``.

    Args:
        request: intercepted request object; only its ``path`` attribute and
            ``abort()`` method are used here.
    """
    # Every suffix carries its leading dot, and the comparison is
    # case-insensitive so "LOGO.PNG" is blocked just like "logo.png".
    blocked_suffixes = (
        '.png', '.jpg', '.jpeg', '.gif',  # images
        '.woff', '.woff2', '.ttf',        # fonts
    )
    if request.path.lower().endswith(blocked_suffixes):
        request.abort()
# Chrome is started with the 'headless' argument (no visible browser window).
options = webdriver.ChromeOptions()
options.add_argument('headless')

# Create a new instance of the (selenium-wire) Chrome driver.
driver = webdriver.Chrome(options=options)

# URL patterns to capture (scopes): the CNN front page in its three observed
# spellings, plus any URL containing "header" or "zone-manager".
capture_patterns = [
    '.*www.cnn.com/$',
    '.*www.cnn.com:80/$',
    '.*www.cnn.com//$',
    '.*header.*',
    '.*zone-manager.*',
]
driver.scopes = capture_patterns

# Extensions to ignore are handled by the interceptor.
driver.request_interceptor = interceptor
# read in URI-Ms we've already requested
def load_requested(path):
    """Return the set of URI-Ms already recorded in the file at *path*.

    The URI-M is the first whitespace-separated field of each line; blank
    lines are ignored.  A missing file is treated as "nothing requested yet"
    (this is the situation on the very first run, before any results file
    has been generated), with a note written to stderr.
    """
    try:
        with open(path) as fp:
            # str.split() with no argument also drops the trailing newline,
            # so a line holding only a URI-M still yields a clean value.
            return {fields[0] for fields in map(str.split, fp) if fields}
    except FileNotFoundError:
        print(path + " not found; assuming nothing has been requested yet",
              file=sys.stderr)
        return set()


# A set keeps the per-URL "already done?" membership test O(1).
requested = load_requested('requests.txt')
def response_detail(status_code, headers):
    """Return the last field logged for a captured response.

    Args:
        status_code: integer HTTP status of the response.
        headers: object with a ``get(name)`` method returning the header
            value or ``None`` (names are looked up in lower case).

    Returns:
        For a 3xx response that carries a ``location`` header, the redirect
        target.  Otherwise the ``content-length`` value, falling back to
        ``x-archive-orig-content-length``, and finally the literal string
        ``"NoContentLength"`` when neither is present.
    """
    # Chained comparison: the previous `>= 300 | ... < 400` form was parsed
    # as a bitwise OR and missed redirects such as 307 and 308.
    if 300 <= status_code < 400:
        location = headers.get('location')
        if location:
            return location
    for name in ('content-length', 'x-archive-orig-content-length'):
        value = headers.get(name)
        if value:
            return value
    return "NoContentLength"


# Only crawl when executed as a script (URI-Ms are read from stdin).
if __name__ == "__main__":
    try:
        for url in sys.stdin:
            url = url.rstrip()  # remove trailing newlines
            if not url:
                continue  # nothing to request on a blank line

            # check to see if we already have data on this URI-M
            if url in requested:
                print(url + " already DONE", file=sys.stderr)
                sys.stderr.flush()
                continue

            print("REQ " + url, end=' ', file=sys.stderr)
            sys.stderr.flush()
            try:
                driver.get(url)
            except Exception as err:
                # Stop the whole run on the first failed page load; the
                # enclosing `finally` still shuts the browser down.
                sys.exit(" - EXCEPTION: " + str(err))

            # Access requests via the `requests` attribute
            for request in driver.requests:
                response = request.response
                if response:
                    print(request.url, response.status_code, " ", end='')
                    print(response_detail(response.status_code,
                                          response.headers))
            sys.stdout.flush()

            print(" - DONE, waiting 10 seconds... ", file=sys.stderr)
            sys.stderr.flush()
            time.sleep(10)  # wait 10 seconds between requests

            # reset driver.requests so the next page starts from an empty log
            del driver.requests
    finally:
        # quit() ends the whole browser session (close() only closes the
        # current window), and it now runs on the error path as well.
        driver.quit()
        del driver
        # avoid warnings about selenium.Service not shutting down in time
        time.sleep(3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment