Skip to content

Instantly share code, notes, and snippets.

@lukpueh
Created March 23, 2022 13:45
Show Gist options
  • Save lukpueh/e3af1fa4665eaf3f3ec6c42783465632 to your computer and use it in GitHub Desktop.
Save lukpueh/e3af1fa4665eaf3f3ec6c42783465632 to your computer and use it in GitHub Desktop.
Scrape app.box.com pseudo PDFs
import os
import base64
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
# Scrape the pages of documents shown in a browser-based viewer: for every URL
# listed in "links.txt", open it, read the document title from the element with
# class "item-name", and save each rendered page canvas as a numbered PNG inside
# a directory named after that title.

# Upper bound passed to every WebDriverWait below.
# NOTE(review): WebDriverWait interprets this value as seconds, so 10000 is
# close to three hours -- confirm this was not meant to be milliseconds.
TIMEOUT = 10000

driver = webdriver.Safari()
driver.maximize_window()

# Bound before the `try` so the progress report in `finally` can never hit an
# unbound name (e.g. when "links.txt" is missing or empty). Previously that
# NameError replaced the real exception and skipped `driver.quit()`.
name = None
link = None

try:
    with open("links.txt") as f:
        # One URL per line; blank lines are skipped so the driver is never
        # asked to open an empty address.
        links = [line.strip() for line in f if line.strip()]

    for i, link in enumerate(links):
        # Pause for 20 seconds after every 50 documents.
        if i != 0 and i % 50 == 0:
            time.sleep(20)

        # Reset per document so a failure before the title is read reports
        # None rather than the previous document's title.
        name = None
        driver.get(link)

        name_el = WebDriverWait(driver, TIMEOUT).until(
            lambda d: d.find_element(by=By.CLASS_NAME, value="item-name")
        )
        name = name_el.text
        os.makedirs(name, exist_ok=True)

        # `until` keeps polling while the result is falsy, so this returns
        # only once at least one "page" element exists.
        page_els = WebDriverWait(driver, TIMEOUT).until(
            lambda d: d.find_elements(by=By.CLASS_NAME, value="page")
        )

        for idx, page_el in enumerate(page_els):
            # Scroll the page into view and give it a moment before looking
            # for its canvas.
            driver.execute_script("arguments[0].scrollIntoView(true);", page_el)
            time.sleep(0.3)

            canvas_el = WebDriverWait(driver, TIMEOUT).until(
                lambda d: page_el.find_element(by=By.TAG_NAME, value="canvas")
            )
            data_url = driver.execute_script(
                "return arguments[0].toDataURL('image/png');", canvas_el
            )

            # A data URL is "<header>,<payload>"; split on the first comma
            # instead of assuming the header is exactly 22 characters long.
            _, _, payload = data_url.partition(",")
            with open(f"{name}/{idx:03d}.png", "wb") as out:
                out.write(base64.b64decode(payload))
finally:
    # Always report the last document touched (useful for resuming after a
    # failure) and always shut the browser session down.
    print(name, link)
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment