Skip to content

Instantly share code, notes, and snippets.

@2019ncovmemory
Created February 5, 2020 18:25
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 6 You must be signed in to fork a gist
  • Save 2019ncovmemory/1e4225aa73011cb0d6e544aad1468541 to your computer and use it in GitHub Desktop.
Save 2019ncovmemory/1e4225aa73011cb0d6e544aad1468541 to your computer and use it in GitHub Desktop.
Create full-page screenshots of articles; works with WeChat articles that use lazy loading.
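'''
Usage: python archive_articles.py test.csv

Input: test.csv -- one space-separated "name url" pair per line.
The first line is a header and is skipped.
name url
1 url1
2 url2
.....

Output: one PNG screenshot per row, named after the first column.
1.png
2.png
.....

Dependencies:
Firefox
pip install selenium
pip install pillow (for image compression)
brew install geckodriver (for mac)
'''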
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import csv
import sys
from PIL import Image
import math
def fullpage_screenshot(nameAndURL):
    """Save a full-page PNG screenshot for every (name, url) entry.

    A single headless Firefox session is reused for all pages. For each
    entry the page is loaded, the window is resized and scrolled so that
    lazily loaded images get requested, and the result is written to
    ``<name>.png`` in the current working directory. The PNG is then
    re-saved through Pillow with ``optimize=True`` to shrink it.

    Args:
        nameAndURL: iterable of indexable records where item 0 is the
            output file stem and item 1 is the URL to capture.

    The browser is always shut down, even if a page fails part-way
    through; the exception from the failing page is propagated.
    """
    # Only works with Firefox; Chrome does not work
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.maximize_window()
        for pageInfo in nameAndURL:
            name = pageInfo[0]
            # Callers that split raw file lines may leave a trailing newline.
            url = pageInfo[1].strip()
            print('Capturing: ', name, url)
            driver.get(url)
            time.sleep(2)
            height = driver.execute_script("return document.body.scrollHeight")
            # The trick for lazy loading of images in WeChat articles:
            # make the viewport nearly as tall as the page, then scroll down
            # and back up so every part of the page has been in view once.
            # The viewport is floored at 1000 px: a shorter page would
            # otherwise yield a zero/negative window height, and a page
            # under 2000 px would leave a band that is never scrolled into
            # view.
            driver.set_window_size(1000, max(height - 1000, 1000))
            driver.execute_script("window.scrollTo(0, 1000)")
            driver.execute_script("window.scrollTo(0, 0)")
            time.sleep(10)  # new images need time to load
            # The page may have grown once the lazy images arrived.
            new_height = driver.execute_script("return document.body.scrollHeight")
            driver.set_window_size(1000, new_height)
            png_path = name + ".png"
            driver.save_screenshot(png_path)
            # Optimize image: re-encode in place; the context manager makes
            # sure the source file handle is released.
            with Image.open(png_path) as screenshot:
                screenshot.load()  # read all pixels before overwriting the file
                screenshot.save(png_path, optimize=True, quality=95)
    finally:
        # Never leave a headless Firefox process behind.
        driver.quit()
def load_name_and_url(path):
    """Read the input file and return a list of ``[name, url]`` pairs.

    The first line of the file is treated as a header and skipped. Every
    other line is split on whitespace; the first two fields are taken as
    the name and the URL. Blank lines are ignored, and lines with fewer
    than two fields are reported on stderr and skipped, so that a stray
    line cannot crash the capture run later on.

    Args:
        path: path of the whitespace-separated input file.

    Returns:
        A list of two-element lists ``[name, url]`` with no surrounding
        whitespace or trailing newline.
    """
    pairs = []
    with open(path) as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for line_no, line in enumerate(f, start=2):
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 2:
                print('Skipping malformed line %d: %r' % (line_no, line),
                      file=sys.stderr)
                continue
            pairs.append(fields[:2])
    return pairs


if __name__ == "__main__":
    # Explicit check instead of assert: asserts are stripped under -O.
    if len(sys.argv) != 2:
        sys.exit("Usage: python archive_articles.py test.csv")
    nameAndURL = load_name_and_url(sys.argv[1])
    fullpage_screenshot(nameAndURL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment