Last active
January 22, 2020 16:55
-
-
Save mkfink/aac99312cca2377629d3548d96cfc469 to your computer and use it in GitHub Desktop.
scrape and generate images of equipment labels from wiki page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Scrape the wiki page of equipment labels to generate images of
each label at the resolution required by the label printer

For every table row that carries a label title, the row's HTML is rendered
in headless Chrome, screenshotted, cropped to the label's <svg> element,
resized to ``printer_resolution`` and saved as ``<sanitized title>.png`` in
the current working directory.
"""
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
from io import BytesIO
from PIL import Image
import requests
import string
import json

page_url = "https://www.i3detroit.org/wiki/Equipment_Labels_List"
printer_resolution = (1109, 696)
load_delay = 3  # seconds to let page load before taking screenshot
scale_factor = 2.0  # labels on wiki are at 1/2 printer resolution
request_timeout = 30  # seconds to wait for the wiki before giving up
fn_valid_chars = "-_.() " + string.ascii_letters + string.digits

# chrome webdriver options
op = Options()
# Render at `scale_factor` device pixels per CSS pixel; the crop arithmetic
# in _capture_label() uses the same constant so the two cannot drift apart.
op.add_argument("--force-device-scale-factor={}".format(scale_factor))
op.add_argument("start-maximized")
op.add_argument('--headless')


def _safe_filename(name):
    """Return ``name`` reduced to the characters in ``fn_valid_chars``, plus '.png'."""
    return ''.join(c for c in name if c in fn_valid_chars) + '.png'


def _label_name(row):
    """Return the label title found in ``row``, or None when there is none.

    The title is the first child of the <text> element nested inside the
    element whose id is 'title'. Rows/cells lacking that structure yield None
    instead of raising AttributeError on a missing element.
    """
    title = row.find(id='title')
    text = title.find('text') if title is not None else None
    if text is None or not text.contents:
        return None
    return str(text.contents[0])


def _capture_label(markup):
    """Render ``markup`` in headless Chrome and return the label as a PIL image.

    The image is cropped to the first <svg> element on the page and resized
    to ``printer_resolution``.
    """
    driver = webdriver.Chrome(options=op)
    try:
        # Hand the markup over as a script argument rather than splicing it
        # into the JavaScript source: quotes or backslashes in the label can
        # then never break (or inject into) the script.
        driver.execute_script("document.write(arguments[0])", markup)

        # Get the size and location of the label in the browser.
        # find_element("tag name", ...) is accepted by Selenium 3 and 4 alike.
        element = driver.find_element("tag name", "svg")
        location = element.location
        size = element.size

        sleep(load_delay)  # Make sure the label fully loads. QR code can be slow
        png = driver.get_screenshot_as_png()
    finally:
        # quit() ends the whole WebDriver session even when rendering failed;
        # close() would only close the current window.
        driver.quit()

    # Element geometry is reported in CSS pixels while the screenshot is in
    # device pixels, hence the multiplication by scale_factor.
    left = int(round(location['x'] * scale_factor))
    top = int(round(location['y'] * scale_factor))
    right = int(round((location['x'] + size['width']) * scale_factor))
    bottom = int(round((location['y'] + size['height']) * scale_factor))

    # Crop, then resize to the exact printer resolution
    im = Image.open(BytesIO(png))
    im = im.crop((left, top, right, bottom))
    return im.resize(printer_resolution, Image.BICUBIC)


# Process the html to find the table of labels
response = requests.get(page_url, timeout=request_timeout)
response.raise_for_status()  # do not try to parse an HTTP error page
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
if table is None:
    raise RuntimeError("No table found at " + page_url)

# Each row of the table is one label
for row in table.find_all(['th', 'tr']):
    name = _label_name(row)
    if name is None:
        # No title element here, so there is no label to capture.
        continue

    # Extract the label html from each row and render with selenium
    image = _capture_label(str(row))
    image.save(_safe_filename(name))
    print("Captured image for " + name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment