Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Create screenshots of articles; works for WeChat articles with lazy loading.
'''
Usage: python archive_articles.py test.csv
Input: test.csv
name url
1 url1
2 url2
.....
Output:
1.png
2.png
.....
Dependency:
Firefox
pip install selenium
pip install pillow (for image compression)
brew install geckodriver (for mac)
'''
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import csv
import sys
from PIL import Image
import math
def fullpage_screenshot(nameAndURL):
    """Save a full-page PNG screenshot for every (name, url) entry.

    A single headless Firefox session is started and reused for all pages.
    For each entry the page is loaded, the window is resized and scrolled
    to make lazily loaded images appear, and the result is written to
    ``<name>.png``. The PNG is then re-saved through Pillow with
    ``optimize=True``.

    Args:
        nameAndURL: iterable of indexable rows where item 0 is the output
            file name (without extension) and item 1 is the URL string.
    """
    # Only work with Firefox; Chrome does not work
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.maximize_window()
        for pageInfo in nameAndURL:
            name = pageInfo[0]
            # Rows read straight from a text file may still carry the
            # trailing newline on the last field.
            url = pageInfo[1].strip()
            print('Capturing: ', name, url)
            driver.get(url)
            time.sleep(2)
            height = driver.execute_script(
                "return document.body.scrollHeight")
            # The trick for lazy loading of images in WeChat articles:
            # resize the window, then scroll down and back up.
            # Clamp so pages shorter than 1000px do not request a
            # negative window height.
            driver.set_window_size(1000, max(height - 1000, 1))
            driver.execute_script("window.scrollTo(0, 1000)")
            driver.execute_script("window.scrollTo(0, 0)")
            time.sleep(10)  # new images need time to load
            # Re-measure: the page may have changed height once the
            # images arrived.
            new_height = driver.execute_script(
                "return document.body.scrollHeight")
            driver.set_window_size(1000, new_height)
            png_path = name + ".png"
            driver.save_screenshot(png_path)
            # Optimize image; the context manager releases the file handle.
            with Image.open(png_path) as shot:
                shot.save(png_path, optimize=True, quality=95)
    finally:
        # Always shut the browser down, even if a page fails, so no
        # headless Firefox process is left behind.
        driver.quit()
if __name__ == "__main__":
    # Script entry point: read "name url" rows from the file given on the
    # command line and capture a screenshot for each of them.
    if len(sys.argv) != 2:
        # Explicit check instead of `assert`, which is stripped under -O.
        sys.exit("Usage: python archive_articles.py test.csv")
    nameAndURL = []
    with open(sys.argv[1]) as f:
        for i, line in enumerate(f):
            if i == 0:
                continue  # first line is the "name url" header
            # split() without arguments tolerates tabs and repeated spaces
            # and drops the trailing newline from the URL.
            fields = line.split()
            if len(fields) < 2:
                continue  # blank or incomplete row: nothing to capture
            nameAndURL.append(fields)
    fullpage_screenshot(nameAndURL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.