Skip to content

Instantly share code, notes, and snippets.

@vane
Created November 5, 2023 03:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vane/76aa72737809e091bc5732015c1867f3 to your computer and use it in GitHub Desktop.
Save vane/76aa72737809e091bc5732015c1867f3 to your computer and use it in GitHub Desktop.
Python: save a page as MHTML and take a full-page screenshot using Selenium WebDriver
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import base64
import os.path
import selenium.webdriver.chrome.webdriver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def save_screenshot(driver: selenium.webdriver.chrome.webdriver.WebDriver, fname: str):
    """Capture a screenshot of the whole page and write it to ``fname``.

    The clip rectangle starts at (0, 0) and uses the ``cssContentSize``
    width/height reported by ``Page.getLayoutMetrics``; together with
    ``captureBeyondViewport`` this requests the full content area rather
    than only the visible viewport.

    The image encoding follows the extension of ``fname``: ``.jpg`` and
    ``.jpeg`` request ``jpeg``, ``.webp`` requests ``webp``, and any other
    extension (including none) requests ``png``.

    :param driver: Chrome WebDriver used to issue the CDP commands.
    :param fname: Destination file path; it is created or overwritten.
    """
    # Pick the encoding from the extension so that a ``.jpg`` file does not
    # end up holding PNG bytes; unknown extensions keep the PNG default.
    extension = os.path.splitext(fname)[1].lower()
    image_format = {
        ".jpg": "jpeg",
        ".jpeg": "jpeg",
        ".webp": "webp",
    }.get(extension, "png")

    page_rect = driver.execute_cdp_cmd("Page.getLayoutMetrics", {})
    content_size = page_rect["cssContentSize"]
    src = driver.execute_cdp_cmd('Page.captureScreenshot', {
        "format": image_format,
        "captureBeyondViewport": True,
        "clip": {
            "width": content_size["width"],
            "height": content_size["height"],
            "x": 0,
            "y": 0,
            "scale": 1,
        },
    })

    # Decode before opening the file so a bad payload does not leave an
    # empty file behind. The payload is standard (not URL-safe) base64.
    image_bytes = base64.b64decode(src['data'])
    with open(fname, 'wb') as f:
        f.write(image_bytes)
def save_mhtml(driver: selenium.webdriver.chrome.webdriver.WebDriver, fname: str):
    """Write the result of CDP ``Page.captureSnapshot`` to ``fname``.

    :param driver: Chrome WebDriver used to issue the CDP command.
    :param fname: Destination file path; it is created or overwritten.
    """
    # Take the snapshot first: if the CDP call fails, no empty or truncated
    # file is left on disk.
    page_source = driver.execute_cdp_cmd('Page.captureSnapshot', {})

    # An explicit UTF-8 encoding avoids UnicodeEncodeError under narrow
    # platform default encodings; newline='' writes the snapshot's own line
    # endings untouched instead of translating '\n' to the platform default.
    with open(fname, 'w', encoding='utf-8', newline='') as f:
        f.write(page_source['data'])
def driver_save(url: str, path: str, mhtml: str, screenshot: str):
    """Open ``url`` in headless Chrome and save the requested artifacts.

    :param url: Address of the page to load.
    :param path: Directory the output files are written to. An empty string
        means the current working directory; a missing directory is created.
    :param mhtml: Base name for the snapshot, saved as ``<mhtml>.mhtml``.
        A falsy value skips the snapshot.
    :param screenshot: Base name for the screenshot, saved as
        ``<screenshot>.jpg``. A falsy value skips the screenshot.
    """
    # Create the output directory before launching the browser, so a missing
    # path does not surface as FileNotFoundError only after the page has
    # already been loaded.
    if path:
        os.makedirs(path, exist_ok=True)

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        if mhtml:
            save_mhtml(driver, fname=os.path.join(path, f'{mhtml}.mhtml'))
        if screenshot:
            save_screenshot(driver, fname=os.path.join(path, f'{screenshot}.jpg'))
    finally:
        # Always shut the browser down, even if loading or saving failed.
        driver.quit()
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to driver_save.
    cli = argparse.ArgumentParser('Selenium scrap')
    cli.add_argument('-u', '--url', required=True, help='page url')

    # Optional settings, registered from a table: (short, long, default, help).
    optional_flags = (
        ('-p', '--path', '', 'save path'),
        ('-m', '--mhtml', 'page', 'mhtml file name'),
        ('-s', '--screenshot', 'page', 'screenshot file name'),
    )
    for short_flag, long_flag, default_value, help_text in optional_flags:
        cli.add_argument(short_flag, long_flag, default=default_value, help=help_text)

    options = cli.parse_args()
    driver_save(options.url, options.path, options.mhtml, options.screenshot)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment