Skip to content

Instantly share code, notes, and snippets.

@isaacgr
Last active March 13, 2021 05:17
Show Gist options
  • Save isaacgr/c2458ea740aa1e3c2645c6c3920e8933 to your computer and use it in GitHub Desktop.
Save isaacgr/c2458ea740aa1e3c2645c6c3920e8933 to your computer and use it in GitHub Desktop.
Python with Selenium code to scrape quantamagazine.org and save articles to PDF using chromedriver
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import os
import json
import sys
import subprocess
# Characters that download_pdf strips out of article titles.
# NOTE(review): these look like the characters Windows forbids in file names
# (the double quote is not in the list) - confirm the intent.
SPECIAL_CHARS = ['\\', '/', ':', '?', '*', '<', '>', '|']
# Path of the chromedriver executable passed to webdriver.Chrome in print_to_pdf.
CHROMEDRIVER_PATH = 'chromedriver.exe'
class QuantaScrape(object):
    """Small GraphQL client for the Quanta Magazine endpoint.

    The transport and the client are created lazily by the first call to
    ``execute`` and reused by later calls, instead of being rebuilt for every
    query. ``define_transport`` and ``define_client`` stay public so callers
    can still rebuild either object explicitly; after rebuilding the
    transport, call ``define_client`` as well so the client picks it up.
    """

    # GraphQL endpoint that every query is sent to.
    url = "https://www.quantamagazine.org/graphql"
    # HTTP headers attached to every request.
    headers = {"Content-type": "application/json"}

    def __init__(self):
        # Both are populated on demand by execute().
        self.transport = None
        self.client = None

    def define_transport(self):
        """Build the HTTP transport from the class-level ``url`` and ``headers``."""
        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept to preserve existing behaviour; enable verification unless the
        # endpoint's certificate genuinely cannot be validated.
        self.transport = RequestsHTTPTransport(
            use_json=True,
            url=self.url,
            verify=False,
            headers=self.headers,
            retries=3,
        )

    def define_client(self):
        """Build the gql client on top of the current transport."""
        self.client = Client(
            transport=self.transport,
            fetch_schema_from_transport=True,
        )

    def execute(self, query, variables):
        """Run *query* with *variables* and return the client's result.

        Args:
            query: A GraphQL document as produced by ``gql(...)``.
            variables: Mapping passed to the client as ``variable_values``.

        Returns:
            Whatever ``Client.execute`` returns for the query.
        """
        # Create the transport/client only once; rebuilding them per call
        # throws away the client (and the schema it fetched) on every query.
        if self.transport is None:
            self.define_transport()
        if self.client is None:
            self.define_client()
        return self.client.execute(query, variable_values=variables)
# GraphQL document used to page through the article archive.
# It takes one Int variable, $offset, and calls getPostPageArchive with
# type "archive". The field is aliased to "operationName", so results are read
# from data['operationName']: meta.max_num_pages plus, for entries that are
# Post objects, their title and link.
query = gql(
"""
query ($offset: Int){
operationName: getPostPageArchive(offset: $offset, type: "archive"){
meta{
max_num_pages
}
data{
...on Post{
title
link
}
}
}
}
"""
)
def print_to_pdf(title, link):
    """Load *link* in Chrome and print it via the "Save as PDF" destination.

    Args:
        title: Cleaned-up article title. Currently unused: nothing in this
            function chooses the name of the output file.
        link: URL of the article to open.

    Raises:
        selenium.common.exceptions.TimeoutException: If the page has not
            reported ``document.readyState == "complete"`` within 20 seconds.
    """
    # Print-preview state that pre-selects the local "Save as PDF" destination.
    app_state = {
        "recentDestinations": [{
            "id": "Save as PDF",
            "origin": "local",
            "account": "",
        }],
        "selectedDestinationId": "Save as PDF",
        "version": 2,
    }
    prefs = {
        'printing.print_preview_sticky_settings.appState': json.dumps(app_state),
    }

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option('prefs', prefs)
    # Presumably lets window.print() go straight to the selected destination
    # without an interactive dialog - confirm against Chrome's switch docs.
    chrome_options.add_argument('--kiosk-printing')

    # NOTE(review): executable_path is kept for compatibility with the Selenium
    # version this script was written for; newer releases expect a Service.
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=chrome_options)
    try:
        driver.get(link)
        # Scroll to the bottom of the page before printing.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # A bare WebDriverWait(...) never waits; .until() is what polls.
        WebDriverWait(driver, 20).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        driver.execute_script('window.print();')
    finally:
        # Always shut the browser down, even if loading or printing failed.
        driver.quit()
def download_pdf(filename):
    """Print every article listed in a JSON file to PDF.

    Args:
        filename: Path to a JSON file holding a list of objects, each with a
            ``title`` and a ``link`` key.

    For each entry the title has its spaces replaced by underscores and every
    character in SPECIAL_CHARS removed; the cleaned title is printed and then
    handed to print_to_pdf together with the link.
    """
    with open(filename) as handle:
        entries = json.load(handle)

    # One translation table that deletes every character in SPECIAL_CHARS.
    strip_table = str.maketrans('', '', ''.join(SPECIAL_CHARS))

    for entry in entries:
        cleaned = entry['title'].replace(' ', '_').translate(strip_table)
        url = entry['link']
        print(cleaned)
        print_to_pdf(cleaned, url)
def main():
    """Fetch every archive page and write the collected articles to articles.json.

    The first response supplies ``max_num_pages``; offsets 1 through that
    value are then collected. The first response is reused for offset 1 rather
    than requesting the same page a second time.
    """
    scraper = QuantaScrape()

    first_page = scraper.execute(query, {"offset": 1})
    max_pages = first_page['operationName']['meta']['max_num_pages']

    articles = []
    for offset in range(1, max_pages + 1):
        if offset == 1:
            # Already fetched above to learn the page count.
            page = first_page
        else:
            page = scraper.execute(query, {"offset": offset})
        articles.extend(page['operationName']['data'])

    with open('articles.json', 'w') as f:
        json.dump(articles, f, indent=4)
if __name__ == '__main__':
    # Usage:
    #   <script>                        scrape the archive into articles.json
    #   <script> download <file.json>   print every article listed in the file
    cli_args = sys.argv[1:]
    # Indexing sys.argv[1] directly raised IndexError when no argument was
    # given, which made the default scrape path unreachable.
    if cli_args and cli_args[0] == 'download':
        if len(cli_args) < 2:
            sys.exit("usage: {} download <articles.json>".format(sys.argv[0]))
        download_pdf(cli_args[1])
    else:
        main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment