Created
October 13, 2020 02:35
Download Perusall readings as PDF
title = "The title of the article"
urls = """
<image URLs scraped from the page>
"""
# dependencies: imagemagick, img2pdf
import os
import requests

from data import title, urls

folder = title.replace(' ', '-')
if not os.path.exists(folder):
    os.mkdir(folder)

# Download each image chunk listed in data.py
i = 0
for u in urls.splitlines():
    if u:
        print('Downloading chunk', i, 'of', title)
        with open('{}/{:0>2}.png'.format(folder, i), 'wb') as fh:
            fh.write(requests.get(u.strip()).content)
        i += 1

# Stack the chunks six at a time into full pages
pgno = 1
for j in range(0, i, 6):
    f = ' '.join(['{}/{:0>2}.png'.format(folder, k) for k in range(j, min(i, j + 6))])
    print('Converting page', pgno)
    os.system('convert -append %s %s/page_%s.png' % (f, folder, pgno))
    pgno += 1

print('Converting to pdf')
pages = ' '.join(['{}/page_{}.png'.format(folder, k) for k in range(1, pgno)])
# Quote the output name: the title may contain spaces
os.system('img2pdf %s -o "%s.pdf"' % (pages, title))
print('Done')
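The `os.system` calls above interpolate filenames and the title straight into a shell string, which breaks if the title contains quotes or other shell metacharacters. A safer sketch (hypothetical helper names, assuming the same `convert` and `img2pdf` tools are on `PATH`) builds argument lists for `subprocess.run` instead:

```python
import subprocess

def convert_cmd(chunks, folder, pgno):
    # Argument-list form: paths with spaces need no shell quoting.
    return ["convert", "-append", *chunks, "{}/page_{}.png".format(folder, pgno)]

def img2pdf_cmd(pages, title):
    return ["img2pdf", *pages, "-o", "{}.pdf".format(title)]

def run(cmd):
    # check=True raises CalledProcessError if the tool exits non-zero.
    subprocess.run(cmd, check=True)
```

With this, `run(convert_cmd(...))` replaces each `os.system('convert ...')` call and failures surface as exceptions instead of silently producing a broken PDF.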
/*
 * Click on a reading in the Perusall web interface,
 * and run this script in the developer console.
 * Copy-and-paste the console.info output to data.py.
 */
var len = 0;
var times = 0;
var i = setInterval(() => {
  var img = document.querySelectorAll("img.chunk");
  img[img.length - 1].scrollIntoView();
  if (len < img.length) {
    len = img.length;
    times = 0; // reset the stall counter whenever new chunks appear
  } else if (times > 3) {
    var urls = [];
    img.forEach((e) => urls.push(e.src));
    var spl = location.pathname.split('/');
    console.info('urls = """\n' + urls.join('\n') + '\n"""\n\ntitle = "' + spl[spl.length - 1] + '"\n');
    clearInterval(i);
  } else {
    times++;
  }
}, 2000);
Thank you
I've tried to run the script and it works, but not well. Let me explain my issue.
The problem is, I think, in the step where I scrape the reading on Perusall to get the URLs: the get_urls script runs and returns URLs, but not all of them, so when download_perusall.py runs it creates a PDF showing only the first and last pages.
Probably Perusall doesn't load the pages in time for them to be captured.
Any suggestion of what to do? Thanks.
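The console script stops scrolling once the chunk count has been stable for a few ticks, so on a slow connection it can give up before every page image has lazy-loaded; raising the `times > 3` threshold or the 2000 ms interval usually helps. As a quick local check before downloading, a hypothetical helper like this can count and de-duplicate the scraped URLs so a short list is obvious up front:

```python
def check_urls(urls_text):
    """Return the non-empty scraped URLs, dropping duplicates but keeping order."""
    urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
    unique = list(dict.fromkeys(urls))  # dict preserves insertion order
    if len(unique) < len(urls):
        print('Dropped', len(urls) - len(unique), 'duplicate chunk URLs')
    print('Found', len(unique), 'chunks; compare against the reading length')
    return unique
```

Calling `check_urls(urls)` on the `urls` string from data.py before the download loop makes a truncated scrape visible immediately instead of after the PDF is built.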
I would suggest adding magick to the convert call in download_perusall.py, since Windows often has a conflicting convert.exe command. Apart from that, great little script.
os.system('magick convert -append %s %s/page_%s.png' % (f, folder, pgno))
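Building on that suggestion, the script could pick the entry point at runtime. A hedged sketch using `shutil.which` (assuming ImageMagick 7's `magick` launcher when present, plain `convert` otherwise):

```python
import shutil

# Use ImageMagick 7's "magick convert" when the magick launcher exists
# (typical on Windows, where "convert" may clash with the system convert.exe).
MAGICK = 'magick convert' if shutil.which('magick') else 'convert'

def append_cmd(files, folder, pgno):
    # Same command line the script builds, with the detected entry point.
    return '%s -append %s %s/page_%s.png' % (MAGICK, files, folder, pgno)
```

This keeps the gist portable: Linux/macOS installs with only `convert` keep working, and Windows installs pick up `magick` automatically.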