Skip to content

Instantly share code, notes, and snippets.

@xinlc
Forked from 2019ncovmemory/capture_articles.py
Created February 14, 2020 01:17
Show Gist options
  • Save xinlc/ea7c8ac66ea59ef81ed5d811d08325da to your computer and use it in GitHub Desktop.
Save xinlc/ea7c8ac66ea59ef81ed5d811d08325da to your computer and use it in GitHub Desktop.
Create screenshots of articles; work for WeChat articles with lazy loading.
'''
Usage: python capture_articles.py test.csv
Input: test.csv (space-separated; the first line is a header and is skipped)
name url
1 url1
2 url2
.....
Output:
1.png
2.png
.....
Dependency:
Firefox
pip install selenium
pip install pillow (for image compression)
brew install geckodriver (for mac)
'''
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import csv
import sys
from PIL import Image
import math
def fullpage_screenshot(nameAndURL):
    """Save a full-page PNG screenshot of every listed page.

    A single headless Firefox session is started, each URL is loaded in
    turn, and the rendered page is written to ``<name>.png`` in the current
    working directory. The PNG is then re-saved through Pillow with
    ``optimize=True`` to shrink it.

    Args:
        nameAndURL: iterable of indexable rows; item 0 is the output file
            stem and item 1 is the URL to capture.

    The browser is always shut down, even when a page fails to load or a
    screenshot cannot be written; the original exception still propagates.
    """
    # Only works with Firefox; Chrome does not work
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.maximize_window()
        for pageInfo in nameAndURL:
            name = pageInfo[0]
            url = pageInfo[1]
            print('Capturing: ', name, url)
            driver.get(url)
            time.sleep(2)  # give the initial page a moment to render
            height = driver.execute_script("return document.body.scrollHeight")
            # The trick for lazy loading of images in WeChat articles:
            # resize the window to (almost) the full page height, then
            # scroll down and back up so the page requests its images.
            # Clamp the height so pages shorter than 1000px do not ask the
            # driver for a zero or negative window size.
            driver.set_window_size(1000, max(height - 1000, 1))
            driver.execute_script("window.scrollTo(0, 1000)")
            driver.execute_script("window.scrollTo(0, 0)")
            time.sleep(10)  # new images need time to load
            # The page may have grown once the images arrived; re-measure.
            new_height = driver.execute_script("return document.body.scrollHeight")
            driver.set_window_size(1000, new_height)
            png_path = name + ".png"
            driver.save_screenshot(png_path)
            # Optimize image (re-save in place)
            with Image.open(png_path) as img:
                img.save(png_path, optimize=True, quality=95)
    finally:
        # Never leave a headless Firefox/geckodriver process behind.
        driver.quit()
def load_name_url_pairs(csv_path):
    """Read ``[name, url]`` rows from a whitespace-separated listing file.

    The first line of the file is treated as a header and skipped. Blank
    lines are ignored. Lines with fewer than two fields are reported on
    stderr and skipped, so one malformed row does not abort the whole run.

    Args:
        csv_path: path of the input file.

    Returns:
        A list of field lists; item 0 is the name and item 1 is the URL.
    """
    rows = []
    with open(csv_path) as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line_no, line in enumerate(f, start=2):
            # split() with no argument also drops the trailing newline and
            # tolerates repeated spaces or tabs between the columns.
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 2:
                print(
                    "Skipping line {}: expected 'name url', got {!r}".format(
                        line_no, line.strip()
                    ),
                    file=sys.stderr,
                )
                continue
            rows.append(fields)
    return rows


if __name__ == "__main__":
    # Explicit check instead of assert: asserts are stripped under -O.
    if len(sys.argv) != 2:
        sys.exit("Usage: python {} <input.csv>".format(sys.argv[0]))
    nameAndURL = load_name_url_pairs(sys.argv[1])
    fullpage_screenshot(nameAndURL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment