Skip to content

Instantly share code, notes, and snippets.

@ishanfdo18098
Forked from jwoglom/data.py
Last active June 2, 2022 15:44
Show Gist options
  • Save ishanfdo18098/0b40bd099638fc9b65b82df076d53dd2 to your computer and use it in GitHub Desktop.
Save ishanfdo18098/0b40bd099638fc9b65b82df076d53dd2 to your computer and use it in GitHub Desktop.
Download Perusall readings as PDF
# Title of the reading. The download script derives its working folder name
# from it (spaces replaced with '-') and names the final PDF "<title>.pdf".
title = "The title of the article"
# Chunk image URLs, one per line, as printed by the browser console script.
# The download script skips empty lines.
urls="""
<image URLs scraped from the page>
"""
# dependencies: requests, ImageMagick (provides the `convert` command), img2pdf
# multithreaded by Ishan
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor

import requests

from data import title, urls
# Upper bound on worker threads for the download and page-conversion pools.
NUM_OF_THREADS = 10


def downloadImage(folder, i, u, timeout=30):
    """Download one image chunk and save it as ``<folder>/<i>.png``.

    Args:
        folder: Existing directory that receives the file.
        i: Chunk index; zero-padded to at least two digits in the file name.
        u: URL of the chunk image; surrounding whitespace is stripped.
        timeout: Seconds to wait on the server before giving up, so a stalled
            connection cannot block a worker thread forever.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(u.strip(), timeout=timeout)
    # Fail loudly instead of saving an HTTP error page under a .png name.
    response.raise_for_status()
    # Fetch first, then open: a failed request leaves no empty file behind,
    # and the context manager closes the handle deterministically.
    with open('{}/{:0>2}.png'.format(folder, i), 'wb') as fh:
        fh.write(response.content)
# Working directory for the chunk and page images, named after the reading.
folder = title.replace(' ', '-')
# exist_ok avoids the check-then-create race and tolerates reruns.
os.makedirs(folder, exist_ok=True)

# Strip before testing so whitespace-only lines are skipped rather than
# submitted as an empty URL (which would also consume a chunk number).
chunk_urls = [line.strip() for line in urls.splitlines() if line.strip()]

with ThreadPoolExecutor(max_workers=NUM_OF_THREADS) as exe:
    pending = []
    for chunk_no, u in enumerate(chunk_urls):
        print('Downloading chunk', chunk_no, 'of', title)
        pending.append(exe.submit(downloadImage, folder, chunk_no, u))
    # An exception raised in a worker is stored on its future and is lost
    # unless result() is called; surface it here instead of failing later
    # on a missing chunk file.
    for job in pending:
        job.result()

# Total number of chunks downloaded; later steps use it as the upper bound
# when grouping chunk files into pages.
i = len(chunk_urls)
# Number assigned to the next page image; pages are numbered from 1.
pgno = 1


def convertPage(f, folder, pgno):
    """Join chunk images into ``<folder>/page_<pgno>.png`` with ImageMagick.

    Runs ``convert -append`` on the given inputs, in the order given.

    Args:
        f: Input image paths, either as one whitespace-separated string (the
            form the script passes) or as a list/tuple of paths.
        folder: Directory that receives the page image.
        pgno: Page number used in the output file name.

    Raises:
        subprocess.CalledProcessError: If ``convert`` exits with a non-zero
            status.
        FileNotFoundError: If the ``convert`` executable is not on PATH.
    """
    inputs = f.split() if isinstance(f, str) else list(f)
    # Pass an argv list instead of a shell string so characters such as
    # quotes, '&' or ';' in title-derived paths are never shell-interpreted.
    subprocess.run(
        ['convert', '-append', *inputs,
         '{}/page_{}.png'.format(folder, pgno)],
        check=True,
    )
# Number of consecutive chunk images stitched into one page image.
CHUNKS_PER_PAGE = 6

with ThreadPoolExecutor(max_workers=NUM_OF_THREADS) as exe:
    pending = []
    for j in range(0, i, CHUNKS_PER_PAGE):
        # The last page may hold fewer chunks, hence the min().
        f = ' '.join('{}/{:0>2}.png'.format(folder, k)
                     for k in range(j, min(i, j + CHUNKS_PER_PAGE)))
        print('Converting page', pgno)
        pending.append(exe.submit(convertPage, f, folder, pgno))
        pgno += 1
    # Retrieve every result so an exception raised inside a worker is
    # re-raised here instead of being silently dropped with its future.
    for job in pending:
        job.result()

# pgno is now one past the last page number that was scheduled.
page_files = ['{}/page_{}.png'.format(folder, k) for k in range(1, pgno)]
if not page_files:
    raise SystemExit('No pages were generated; check urls in data.py')

print('Converting to pdf')
# Use an argv list rather than a shell string: the title may contain spaces
# or shell metacharacters, which would split or corrupt an unquoted command.
subprocess.run(
    ['img2pdf', *page_files, '-o', '{}.pdf'.format(title)],
    check=True,
)
print('Done')
/*
 * Click on a reading in the Perusall web interface,
 * and run this script in the developer console.
 * Copy-and-paste the console.info output to data.py.
 *
 * Every 2 seconds the script scrolls the last "img.chunk" element into view
 * (presumably so the page loads further chunks) and counts the chunks.
 * Once the count has stopped growing for several consecutive polls, it
 * prints the collected image URLs and the last path segment as the title.
 */
(() => {
  // Wrapped in a function so re-running the script in the same console does
  // not collide with, or leak, global variables.
  const POLL_MS = 2000;
  const MAX_IDLE_POLLS = 3;
  let seen = 0;
  let idlePolls = 0;

  const timer = setInterval(() => {
    const chunks = document.querySelectorAll("img.chunk");
    // Guard the empty case: indexing [-1] yields undefined and would throw
    // on every tick, so the interval would never be cleared.
    if (chunks.length > 0) {
      chunks[chunks.length - 1].scrollIntoView();
    }

    if (seen < chunks.length) {
      seen = chunks.length;
      // New chunks appeared, so restart the idle count; only consecutive
      // polls without growth should end the scan.
      idlePolls = 0;
    } else if (idlePolls > MAX_IDLE_POLLS) {
      const urls = [];
      chunks.forEach((e) => urls.push(e.src));
      const spl = location.pathname.split('/');
      console.info('urls = """\n'+urls.join('\n')+'\n"""\n\ntitle="'+spl[spl.length-1]+'"\n');
      clearInterval(timer);
    } else {
      idlePolls++;
    }
  }, POLL_MS);
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment