# mkfink/label_scrape.py

## label_scrape.py
""" Scrape the wiki page of equipment labels to generate images of
each label at the resolution required by the label printer
"""

from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
from io import BytesIO
from PIL import Image
import requests
import string
import json

page_url = "https://www.i3detroit.org/wiki/Equipment_Labels_List"
printer_resolution = (1109, 696)  # final size passed to Image.resize

# Labels on the wiki are at 1/2 printer resolution, so Chrome renders at this
# device scale factor. Element coordinates reported by Selenium are multiplied
# by the same factor to address pixels in the screenshot.
scale_factor = 2

load_delay = 3  # seconds to let page load before taking screenshot
request_timeout = 30  # seconds to wait for the wiki page before giving up
fn_valid_chars = "-_.() " + string.ascii_letters + string.digits


def build_chrome_options():
    """Return the Chrome options used to render every label.

    Chrome runs headless with its device scale factor forced to
    ``scale_factor``.
    """
    op = Options()
    op.add_argument(
        "--force-device-scale-factor={:.1f}".format(scale_factor))
    op.add_argument("start-maximized")
    op.add_argument('--headless')
    return op


def find_label_rows(html):
    """Return every ``th``/``tr`` element of the first table in ``html``.

    Raises:
        RuntimeError: if ``html`` contains no table at all.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise RuntimeError("No table found on " + page_url)
    return table.find_all(['th', 'tr'])


def label_name(row):
    """Return the label's title text, or ``None`` if ``row`` has none.

    The title is the first child of the ``text`` element found inside the
    element whose id is ``title``.
    """
    title = row.find(id='title')
    text = title.find('text') if title is not None else None
    if text is None or not text.contents:
        return None
    return text.contents[0]


def safe_filename(name):
    """Return ``name`` with characters outside ``fn_valid_chars`` removed,
    followed by the ``.png`` extension.
    """
    return ''.join(c for c in name if c in fn_valid_chars) + '.png'


def crop_box(location, size, scale=scale_factor):
    """Return the ``(left, top, right, bottom)`` box of an element.

    Args:
        location: mapping with the element's ``x`` and ``y``.
        size: mapping with the element's ``width`` and ``height``.
        scale: factor applied to every coordinate so the box addresses
            pixels of a screenshot rendered at that device scale factor.
    """
    left = location['x'] * scale
    top = location['y'] * scale
    right = left + size['width'] * scale
    bottom = top + size['height'] * scale
    return (left, top, right, bottom)


def capture_label(row_html, filename, options):
    """Render ``row_html`` in Chrome and save its ``svg`` as ``filename``.

    The screenshot is cropped to the svg element and resized to exactly
    ``printer_resolution``.
    """
    # Function-scope import: By is only needed for this one element lookup.
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome(options=options)
    try:
        # Hand the HTML over as a script argument rather than splicing it
        # into the JavaScript source, so quotes in the label cannot break
        # the script.
        driver.execute_script("document.write(arguments[0])", row_html)
        sleep(load_delay)  # Make sure the label fully loads. QR code can be slow

        # Measure after the delay so the box matches what the screenshot
        # taken right afterwards actually shows.
        element = driver.find_element(By.TAG_NAME, 'svg')
        box = crop_box(element.location, element.size)
        png = driver.get_screenshot_as_png()
    finally:
        # quit() ends the whole browser session, also when a step above
        # raised; close() would only close the window.
        driver.quit()

    # Crop, then resize to the exact printer resolution
    im = Image.open(BytesIO(png))
    im = im.crop(box)
    im = im.resize(printer_resolution, Image.BICUBIC)
    im.save(filename)


def main():
    """Download the label list and write one PNG per label found."""
    response = requests.get(page_url, timeout=request_timeout)
    response.raise_for_status()
    options = build_chrome_options()

    # Each row of the table is one label
    for row in find_label_rows(response.text):
        name = label_name(row)
        if name is None:
            # Presumably a header cell or other non-label row; nothing to
            # render without a title.
            continue
        capture_label(str(row), safe_filename(name), options)
        print("Captured image for " + name)


if __name__ == "__main__":
    main()