Crawls the input URLs using Selenium and headless Chrome, counting form elements and collecting image sizes.
import pandas as pd
import requests
import os
import argparse
from selenium import webdriver
from urllib.request import urlopen
from urllib.parse import urlparse
from PIL import ImageFile
from bs4 import BeautifulSoup
def geturl(url):
    """Rebuild *url* as an absolute ``http``/``https`` URL string.

    The URL is split with ``urlparse`` and reassembled from its host, path,
    params and query. The fragment is dropped, and an empty path becomes
    ``"/"``.

    Args:
        url: The URL to normalise. If it carries no ``//host`` part, the
            parsed host is empty and the result is the scheme prefix
            followed directly by the path (so ``"example.com/a.png"``
            becomes ``"http://example.com/a.png"``).

    Returns:
        The rebuilt URL. An explicit ``https`` scheme is preserved; a
        missing or any other scheme is replaced by ``http``.
    """
    parts = urlparse(url)
    # Preserve https: forcing every URL to plain http silently downgrades
    # secure links and fails on hosts that do not serve http at all.
    scheme = "https" if parts.scheme == "https" else "http"
    path = parts.path or "/"
    if parts.params:
        path = path + ";" + parts.params
    if parts.query:
        path = path + "?" + parts.query
    return scheme + "://" + parts.netloc + path
def getsizes(uri, timeout=30):
    """Fetch *uri* and report its file size and image dimensions.

    The response body is streamed in 1024-byte chunks into Pillow's
    incremental ``ImageFile.Parser``; the function returns as soon as the
    parser has identified an image, so it does not keep reading once the
    dimensions are known.

    Args:
        uri: Absolute URL of the resource to inspect.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A ``(filesize, imgsize)`` tuple:

        * ``(filesize, (width, height))`` when the body is a recognised image;
        * ``(filesize, None)`` when the body was read but no image was found;
        * ``(None, None)`` when the request itself failed.

        ``filesize`` is the ``Content-Length`` header as an int, or ``None``
        when the server did not send one.
    """
    try:
        with requests.get(uri, stream=True, timeout=timeout) as response:
            raw_length = response.headers.get("content-length")
            size = int(raw_length) if raw_length else None
            parser = ImageFile.Parser()
            for chunk in response.iter_content(chunk_size=1024):
                # The parser only learns about the image from data it is
                # fed; without this call ``parser.image`` is never set.
                parser.feed(chunk)
                if parser.image is not None:
                    return size, parser.image.size
    except requests.RequestException:
        # Unreachable host, invalid URL, timeout, broken stream: report
        # "unknown" instead of aborting the caller's whole crawl.
        return None, None
    return size, None
def get_images(url, driver=None, timeout=30, max_tries=5):
    """Return the HTML source of *url*.

    Args:
        url: Address of the page to fetch.
        driver: Optional Selenium WebDriver. When given, the page is loaded
            in the browser and the browser's page source is returned;
            *timeout* and *max_tries* are not used on this path.
        timeout: Initial timeout in seconds for the plain ``requests``
            fetch. It grows by one second after every unsuccessful attempt.
        max_tries: Maximum number of ``requests`` attempts before giving up.

    Returns:
        The page HTML as a string. On the ``requests`` path an empty string
        is returned if every attempt timed out or produced an empty body.

    Raises:
        requests.RequestException: For request failures other than a
            timeout (these are not retried).
    """
    if driver is not None:
        # Navigate first; otherwise page_source is whatever the browser
        # happened to have loaded before this call.
        driver.get(url)
        return driver.page_source

    page = ""
    for _ in range(max_tries):
        try:
            page = requests.get(url, timeout=timeout).text
        except requests.exceptions.Timeout:
            page = ""
        if page:
            break
        # Give a slow server a little longer on the next attempt.
        timeout += 1
    return page
def image_sizes(soup, timeout=30):
    """Yield size information for each ``<img>`` element found in *soup*.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup
            object).
        timeout: Timeout in seconds forwarded to ``getsizes`` for each image.

    Yields:
        One ``(src, filesize, width, height)`` tuple per image that has a
        ``src`` attribute. ``width`` and ``height`` are ``None`` when
        ``getsizes`` could not determine the dimensions.
    """
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            # Nothing to download for tags without a source (for example
            # lazily loaded images), so skip them instead of raising.
            continue
        filesize, imgsize = getsizes(geturl(src), timeout=timeout)
        # Exactly one row per image: dimensions when known, else Nones.
        width, height = imgsize if imgsize is not None else (None, None)
        yield (src, filesize, width, height)
def form_elements(soup):
    """Count the forms in *soup* and the inputs contained in those forms.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup
            object).

    Returns:
        A ``(form_count, input_count)`` tuple. Only ``<input>`` elements
        found inside a ``<form>`` are counted.
    """
    forms = soup.find_all("form")
    input_count = sum(len(form.find_all("input")) for form in forms)
    return len(forms), input_count
def _str_to_bool(value):
    """Convert a command-line string such as ``"True"`` or ``"false"`` to a bool."""
    text = str(value).strip().lower()
    if text in ("1", "true", "t", "yes", "y", "on"):
        return True
    if text in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % (value,))


def build_parser():
    """Build the command-line argument parser for the scraper."""
    parser = argparse.ArgumentParser(description="Scrape URLs for ML Features")
    parser.add_argument("-i", "--input", type=str, help="Location of Input CSV",
                        required=True, dest="input_file")
    parser.add_argument("-o", "--output", type=str, help="Output location",
                        required=True, dest="output_file")
    # Parse the value explicitly: without a type, "--selenium=False" is the
    # non-empty string "False", which is truthy. A bare "--selenium" also
    # enables the browser.
    parser.add_argument("--selenium", type=_str_to_bool, nargs="?",
                        const=True, default=False,
                        help="Render pages with Selenium (True/False)")
    # NOTE(review): the default assumes a chromedriver binary in the current
    # folder -- confirm this matches the intended setup.
    parser.add_argument("--chrome-driver", type=str,
                        help="Location of chromedriver binary",
                        default="./chromedriver")
    # An empty default keeps ``base_url + url`` valid when the flag is omitted.
    parser.add_argument("--base_url", type=str, default="")
    return parser


def main(argv=None):
    """Crawl every URL of the input CSV and write the two result CSV files.

    Args:
        argv: Optional list of command-line arguments; ``None`` makes
            argparse read ``sys.argv``.

    The input CSV must provide a ``url`` column. ``form_counts.csv`` and
    ``img_sizes.csv`` are written into the output folder.
    """
    args = build_parser().parse_args(argv)
    urls = pd.read_csv(args.input_file)

    # Create the output folder up front so a long crawl cannot be lost to a
    # missing directory when the results are finally written.
    if args.output_file:
        os.makedirs(args.output_file, exist_ok=True)

    driver = None
    if args.selenium:
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # NOTE(review): passing the driver path positionally matches older
        # Selenium releases; newer releases expect a Service object --
        # confirm against the installed Selenium version.
        driver = webdriver.Chrome(args.chrome_driver, options=options)

    timeout = 10
    output_1 = []
    output_2 = []
    try:
        for url in urls['url']:
            url = args.base_url + url
            html = get_images(url, driver, timeout=timeout)
            soup = BeautifulSoup(html, features="lxml")
            sizes = image_sizes(soup, timeout=timeout)
            form_count, input_count = form_elements(soup)
            output_1.append((url, form_count, input_count))
            for src, filesize, width, height in sizes:
                print("image: ", src)
                output_2.append((url, src, filesize, width, height))
    finally:
        # Always shut the browser down, even if the crawl fails part-way.
        if driver is not None:
            driver.quit()

    output_1 = pd.DataFrame(output_1,
                            columns=["url", "form_count", "input_count"])
    output_2 = pd.DataFrame(output_2,
                            columns=["url", "img_src", "filesize",
                                     "width", "height"])
    output_1.to_csv(os.path.join(args.output_file, "./form_counts.csv"))
    output_2.to_csv(os.path.join(args.output_file, "./img_sizes.csv"))


if __name__ == "__main__":
    main()

This assumes that you have a chromedriver binary for your specific OS in the same folder. Alternatively, you can point to your chromedriver using the --chrome-driver parameter.

--input is a CSV file with a `url` column
--output is a folder (output names are fixed, sorry: form_counts.csv and img_sizes.csv)

Setting --selenium=True is required for pages rendered with JavaScript

This script is single-threaded and synchronous, so it'll take a while to run.

