Crawls input URLs using Selenium and headless Chrome to search for form elements and image sizes.
import pandas as pd
import requests
import os
import argparse
from selenium import webdriver
from urllib.parse import urlparse
from PIL import ImageFile
from bs4 import BeautifulSoup
def geturl(url):
    # Normalize an image src into an absolute http:// URL.
    scheme, host, path, params, query, fragment = urlparse(url)
    if not path:
        path = "/"
    if params:
        path = path + ";" + params
    if query:
        path = path + "?" + query
    return "http://" + host + path
def getsizes(uri, timeout=30):
    # Get file size *and* image dimensions (None if not known).
    size = None
    try:
        with requests.get(uri, stream=True, timeout=timeout) as response:
            size = response.headers.get("content-length")
            if size:
                size = int(size)
            # Feed the response to PIL in chunks until it can determine the
            # image dimensions, so the whole file is not downloaded.
            p = ImageFile.Parser()
            for chunk in response.iter_content(chunk_size=1024):
                p.feed(chunk)
                if p.image:
                    return size, p.image.size
    except Exception:
        return None, None
    return size, None
def get_images(url, driver=None, timeout=30):
    # Fetch the page source, via Selenium if a driver is provided
    # (needed for JavaScript-rendered pages), otherwise via requests.
    if driver is not None:
        driver.get(url)
        return driver.page_source
    page = None
    while not page:
        try:
            page = requests.get(url, timeout=timeout).text
        except requests.RequestException:
            # Retry indefinitely, extending the timeout on each failure.
            timeout += 1
    return page
def image_sizes(soup, timeout=30):
    # Yield (src, filesize, width, height) for every <img> tag on the page.
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        filesize, imgsize = getsizes(geturl(src), timeout=timeout)
        if imgsize is not None:
            width, height = imgsize
            yield (src, filesize, width, height)
        else:
            yield (src, filesize, None, None)
def form_elements(soup):
    # Count <form> tags and the <input> tags nested inside them.
    form_count = 0
    input_count = 0
    for form in soup.find_all("form"):
        form_count += 1
        input_count += len(form.find_all("input"))
    return form_count, input_count
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape URLs for ML Features")
    parser.add_argument("-i", "--input", type=str, help="Location of input CSV",
                        required=True, dest="input_file")
    parser.add_argument("-o", "--output", type=str, help="Output folder",
                        required=True, dest="output_file")
    parser.add_argument("--selenium", default=False,
                        help="Set to True to render pages with headless Chrome")
    parser.add_argument("--chrome-driver", type=str,
                        help="Location of chromedriver binary",
                        default="./chromedriver")
    parser.add_argument("--base_url", type=str, default="",
                        help="Prefix prepended to every URL in the input CSV")
    args = parser.parse_args()
    urls = pd.read_csv(args.input_file)
    if args.selenium:
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Passing the driver path positionally matches the Selenium 3 API;
        # Selenium 4+ expects a Service object instead.
        driver = webdriver.Chrome(args.chrome_driver, options=options)
    else:
        driver = None
    output_1 = []
    output_2 = []
    for url in urls['url']:
        url = args.base_url + url
        print("----------------------")
        print(url)
        timeout = 10
        html = get_images(url, driver, timeout=timeout)
        soup = BeautifulSoup(html, features="lxml")
        sizes = image_sizes(soup, timeout=timeout)
        form_count, input_count = form_elements(soup)
        output_1.append((url, form_count, input_count))
        for src, filesize, width, height in sizes:
            print("image: ", src)
            output_2.append((url, src, filesize, width, height))
        print("----------------------")
    output_1 = pd.DataFrame(output_1,
                            columns=["url", "form_count", "input_count"])
    output_2 = pd.DataFrame(output_2,
                            columns=["url", "img_src", "filesize",
                                     "width", "height"])
    output_1.to_csv(os.path.join(args.output_file, "form_counts.csv"))
    output_2.to_csv(os.path.join(args.output_file, "img_sizes.csv"))
This assumes that you have a `chromedriver` for your specific OS in the same folder as the script. Alternatively, you can point to your chromedriver using the `--chrome-driver` parameter.

`--input` is a CSV file with a `url` column. `--output` is a folder (the output file names are fixed, sorry). Setting `--selenium=True` is required for pages served with JavaScript. An example invocation is shown below.

This script is single-threaded and synchronous, so... it'll take a while to run.
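For reference, a minimal sketch of how the script might be invoked, assuming it is saved as `scrape_features.py` (hypothetical name) and `urls.csv` lists relative paths under a `url` column:

```sh
# urls.csv -- one "url" column; --base_url is prepended to each entry
#   url
#   /login
#   /signup

python scrape_features.py \
    --input urls.csv \
    --output ./features \
    --base_url https://example.com \
    --selenium=True \
    --chrome-driver ./chromedriver

# Writes ./features/form_counts.csv and ./features/img_sizes.csv
```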