Skip to content

Instantly share code, notes, and snippets.

@joeyv120
Created June 13, 2024 20:06
Show Gist options
  • Save joeyv120/ae6af8b97380e0e099638619abd108c2 to your computer and use it in GitHub Desktop.
Script to scrape PDF files from d4Caltrop's blog
# https://martechwithme.com/get-list-pages-url-from-website-python/
# https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url#39225272
import requests
import os
import re
from usp.tree import sitemap_tree_for_homepage
from time import sleep
def list_pages(url):
    """Return the unique page URLs from the sitemap of *url*, in first-seen order.

    Walks the site's sitemap tree via ultimate-sitemap-parser
    (`sitemap_tree_for_homepage`) and collects every page URL.
    """
    raw_urls = [page.url for page in sitemap_tree_for_homepage(url).all_pages()]
    # dict.fromkeys preserves insertion order and dedupes in O(n),
    # replacing the original O(n^2) `if page in listPages` loop.
    return list(dict.fromkeys(raw_urls))
def download_from_gDrive(file_id, folder):
    """Download a publicly shared Google Drive file into *folder*.

    The file name is taken from the response's Content-Disposition header.
    Prints a message and returns early (instead of raising) when the file is
    not accessible or when it already exists locally.
    """
    gURL = "https://docs.google.com/uc?export=download&confirm=1"
    # Close the session deterministically instead of leaking the connection pool.
    with requests.Session() as session:
        response = session.get(gURL, params={"id": file_id}, stream=True)
        try:
            file_name = re.search(r'filename\=\"(.*)\"',
                                  response.headers['Content-Disposition']).group(1)
        except (KeyError, AttributeError):
            # KeyError: no Content-Disposition header (private file / bad id).
            # AttributeError: header present but no filename="..." match —
            # the original code let this one propagate and crash.
            print('File not accessible.')
            return
        dest = os.path.join(folder, file_name)  # join, not string concatenation
        if os.path.isfile(dest):
            print('File already exists.')
            return
        print('Downloading...')
        # Large files make Drive answer with a virus-scan interstitial; the
        # confirmation token from its cookies lets us re-request the real file.
        token = get_confirm_token(response)
        if token:
            params = {"id": file_id, "confirm": token}
            response = session.get(gURL, params=params, stream=True)
        with open(dest, "wb") as f:
            for chunk in response.iter_content(32768):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
def get_confirm_token(response):
    """Return Google Drive's download-warning cookie value, or None if absent."""
    return next(
        (cookie_value
         for cookie_name, cookie_value in response.cookies.items()
         if cookie_name.startswith("download_warning")),
        None,
    )
def find_file_id(page):
    """Fetch *page* and return the Google Drive file id embedded in its HTML.

    Returns None (after printing the failing URL) when the page contains no
    Google Drive file link.
    """
    response = requests.get(page)
    match = re.search(r"https://docs.google.com/file/d/([-\w]+)", response.text)
    if match is None:
        # No Drive link on this post; report which page failed, as before.
        print("Error: " + page)
        return None
    # Named `file_id` instead of `id` to avoid shadowing the builtin id().
    file_id = match.group(1)
    return file_id
if __name__ == '__main__':
    bURL = "https://blog.d4caltrops.com/"
    # Portable path construction; keep the trailing separator because
    # download_from_gDrive concatenates `folder + file_name` directly.
    # NOTE(review): original hard-coded a Windows '\\Downloads\\' path.
    folder = os.path.join(os.path.expanduser('~'), 'Downloads', 'd4Caltrops') + os.sep
    # makedirs(exist_ok=True) also creates ~/Downloads if it is missing,
    # where the original os.mkdir would have raised FileNotFoundError.
    os.makedirs(folder, exist_ok=True)
    pages = list_pages(bURL)
    for i, page in enumerate(pages[0:5], start=1):  # just grab the 5 most recent posts
        # enumerate avoids the O(n) pages.index() lookup on every iteration
        print(str(i) + " of " + str(len(pages)))
        file_id = find_file_id(page)  # `file_id`, not `id`: don't shadow the builtin
        if file_id is not None:
            download_from_gDrive(file_id, folder)
        sleep(1)  # Don't bug them too much
    print('complete')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment