Skip to content

Instantly share code, notes, and snippets.

@helrond
Last active May 30, 2019 02:46
Show Gist options
  • Save helrond/3107fb81356d40f51129d1a2e8f7aa2a to your computer and use it in GitHub Desktop.
Save helrond/3107fb81356d40f51129d1a2e8f7aa2a to your computer and use it in GitHub Desktop.
Downloads all PDF files associated with a diarist
#!/usr/bin/env python
"""
Downloads all PDF files associated with a diarist
usage: download_diaries.py [-h] diarist target
diarist: URL for a diarist.
target: Directory into which PDFs will be downloaded.
"""
import argparse
import os
from requests_html import HTMLSession
base_path = os.path.abspath(os.path.dirname(__file__))
class DiaryDownloader:
    """Download the files linked from a diarist's page on dimes.rockarch.org.

    The diarist page is scanned for ``.dao.row`` elements; each element's
    ``data-identifier`` attribute names a diary page on
    ``https://dimes.rockarch.org/``, and every link found on that page's
    ``a.download`` anchor is streamed into the target directory.
    """

    def __init__(self, diarist, target):
        """Store the diarist URL, prepare the target directory and open a session.

        Args:
            diarist: URL of the diarist page to scan.
            target: Directory for the downloads, joined onto the directory
                containing this script (an absolute path is used as-is by
                ``os.path.join``).
        """
        self.diarist = diarist
        self.target = os.path.join(base_path, target)
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(self.target, exist_ok=True)
        self.session = HTMLSession()

    def download(self):
        """Fetch every linked file and write it into ``self.target``.

        Rows without a ``data-identifier`` and diary pages without an
        ``a.download`` anchor are skipped. Each file name is printed before
        it is downloaded.

        Raises:
            Whatever ``raise_for_status()`` raises when a file request returns
            an error status.
        """
        listing = self.session.get(self.diarist).html
        for row in listing.find('.dao.row'):
            identifier = row.attrs.get('data-identifier')
            if not identifier:
                # Without an identifier the URL below would end in "None".
                continue
            diary = self.session.get("https://dimes.rockarch.org/{}".format(identifier)).html
            anchor = diary.find('a.download', first=True)
            if anchor is None:
                # No download link on this diary page; nothing to fetch.
                continue
            for url in anchor.absolute_links:
                filename = url.split('/')[-1]
                print(filename)
                with self.session.get(url, stream=True) as r:
                    r.raise_for_status()
                    with open(os.path.join(self.target, filename), 'wb') as f:
                        # An explicit chunk size is required: the default of
                        # 1 yields the body one byte at a time.
                        for chunk in r.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
def main(argv=None):
    """Parse the command line and download the diarist's PDFs.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the arguments from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description='Downloads PDFs from a diarist.')
    parser.add_argument('diarist', help='URL for a diarist.')
    parser.add_argument('target', help='Target directory into which PDFs will be downloaded.')
    args = parser.parse_args(argv)
    DiaryDownloader(args.diarist, args.target).download()


# Guarded so that importing this module does not parse arguments or start
# downloading.
if __name__ == '__main__':
    main()
@helrond
Copy link
Author

helrond commented May 30, 2019

For example:

python download_diaries.py "https://dimes.rockarch.org/xtf/view?docId=ead/FA392/FA392.xml;chunk.id=b707f5838c464fada751ac22f353966b;brand=default;doc.view=dao" fahs

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment