@lgendrot
Created November 27, 2018 17:52
Crawls input URLs using Selenium and headless Chrome to count form elements and collect image file sizes and dimensions.
import argparse
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import ImageFile
from selenium import webdriver
from urllib.parse import urlparse


def geturl(url):
    # Normalize an image src into an absolute URL, forcing the http scheme
    # (the original scheme and any fragment are deliberately dropped).
    scheme, host, path, params, query, fragment = urlparse(url)
    if not path:
        path = "/"
    if params:
        path = path + ";" + params
    if query:
        path = path + "?" + query
    return "http://" + host + path


def getsizes(uri, timeout=30):
    # Get file size *and* image dimensions (None if not known).
    # ImageFile.Parser can usually report dimensions after only the first
    # few KB, so we stop streaming as soon as the header has been parsed
    # instead of downloading the whole file.
    size = None
    try:
        with requests.get(uri, stream=True, timeout=timeout) as response:
            size = response.headers.get("content-length")
            if size:
                size = int(size)
            p = ImageFile.Parser()
            for chunk in response.iter_content(chunk_size=1024):
                p.feed(chunk)
                if p.image:
                    return size, p.image.size
    except Exception:
        return None, None
    return size, None


def get_images(url, driver=None, timeout=30):
    # Fetch a page's HTML: via Selenium when a driver is supplied (needed
    # for JavaScript-rendered pages), otherwise with a plain GET request.
    if driver is not None:
        driver.get(url)
        return driver.page_source
    page = None
    while not page:
        try:
            page = requests.get(url, timeout=timeout).text
        except requests.RequestException:
            # Retry indefinitely, relaxing the timeout a little each time.
            timeout += 1
    return page


def image_sizes(soup, timeout=30):
    # Yield (src, filesize, width, height) for every <img> tag with a src.
    for img in soup.find_all("img", src=True):
        src = img["src"]
        filesize, imgsize = getsizes(geturl(src), timeout=timeout)
        if imgsize is not None:
            width, height = imgsize
            yield (src, filesize, width, height)
        else:
            yield (src, filesize, None, None)


def form_elements(soup):
    # Count <form> tags and the <input> elements nested inside them.
    form_count = 0
    input_count = 0
    for form in soup.find_all("form"):
        form_count += 1
        input_count += len(form.find_all("input"))
    return form_count, input_count


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape URLs for ML features")
    parser.add_argument("-i", "--input", type=str, help="Location of input CSV",
                        required=True, dest="input_file")
    parser.add_argument("-o", "--output", type=str, help="Output directory",
                        required=True, dest="output_dir")
    parser.add_argument("--selenium", action="store_true",
                        help="Render pages with headless Chrome")
    parser.add_argument("--chrome-driver", type=str,
                        help="Location of chromedriver binary",
                        default="./chromedriver")
    parser.add_argument("--base_url", type=str, default="",
                        help="Prefix prepended to every URL in the input CSV")
    args = parser.parse_args()

    urls = pd.read_csv(args.input_file)

    if args.selenium:
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Selenium 3-style constructor; Selenium 4 expects a Service object.
        driver = webdriver.Chrome(args.chrome_driver, options=options)
    else:
        driver = None

    output_1 = []
    output_2 = []
    for url in urls["url"]:
        url = args.base_url + url
        print("----------------------")
        print(url)
        timeout = 10
        html = get_images(url, driver, timeout=timeout)
        soup = BeautifulSoup(html, features="lxml")
        sizes = image_sizes(soup, timeout=timeout)
        form_count, input_count = form_elements(soup)
        output_1.append((url, form_count, input_count))
        for src, filesize, width, height in sizes:
            print("image: ", src)
            output_2.append((url, src, filesize, width, height))
        print("----------------------")

    if driver is not None:
        driver.quit()

    output_1 = pd.DataFrame(output_1,
                            columns=["url", "form_count", "input_count"])
    output_2 = pd.DataFrame(output_2,
                            columns=["url", "img_src", "filesize",
                                     "width", "height"])
    output_1.to_csv(os.path.join(args.output_dir, "form_counts.csv"))
    output_2.to_csv(os.path.join(args.output_dir, "img_sizes.csv"))
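
The early exit in getsizes() works because PIL's ImageFile.Parser can report an image's dimensions after parsing only the file header. A minimal standalone sketch of the same technique (the image URL is purely illustrative):

import requests
from PIL import ImageFile

resp = requests.get("https://www.python.org/static/img/python-logo.png",
                    stream=True, timeout=10)
parser = ImageFile.Parser()
for chunk in resp.iter_content(chunk_size=1024):
    # Feed 1 KB at a time; parser.image is set once the header is parsed.
    parser.feed(chunk)
    if parser.image:
        print(parser.image.size)  # (width, height)
        break
resp.close()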
lgendrot commented Nov 27, 2018

This assumes you have a chromedriver binary for your OS in the same folder as the script; alternatively, point to it with the --chrome-driver parameter.

--input is a CSV file with a url column
--output is a folder (output filenames are fixed, sorry)
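
For example, a minimal input CSV (the filename is purely illustrative) might look like:

url
http://example.com/
http://example.com/signup

or, with --base_url http://example.com, just the paths.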

Passing the --selenium flag is required for pages rendered with JavaScript.

This script is single-threaded and synchronous, so it will take a while to run on long URL lists.
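
An example invocation, assuming the script is saved as scrape.py (the urls.csv and ./out names are placeholders):

python scrape.py -i urls.csv -o ./out --selenium --chrome-driver ./chromedriver

That writes ./out/form_counts.csv and ./out/img_sizes.csv; the output folder must already exist, since pandas' to_csv does not create directories.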
