ishanfdo18098/data.py

## data.py
# Article title. download_perusall.py derives the download folder name from it
# (spaces replaced by '-') and names the resulting PDF "<title>.pdf".
title = "The title of the article"
# Image chunk URLs, one per line. Chunks are numbered in the order listed and
# empty lines are skipped. Replace the placeholder with the console.info
# output of get_urls.py.
urls="""
<image URLs scraped from the page>
"""

## download_perusall.py
# dependencies: requests (Python package), ImageMagick (`convert` command), img2pdf
# multithreaded by Ishan
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor

import requests

from data import title, urls

# Maximum number of worker threads given to each ThreadPoolExecutor below
# (one pool for the downloads, one for the page conversions).
NUM_OF_THREADS = 10


def downloadImage(folder, i, u, timeout=60):
    """Download one image chunk and save it inside ``folder``.

    The file is named after the chunk index, zero-padded to at least two
    digits, with a ``.png`` extension (for example ``folder/07.png``).

    Args:
        folder: Existing directory the image is written into.
        i: Chunk index used to build the file name.
        u: Image URL; surrounding whitespace is stripped before the request.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(u.strip(), timeout=timeout)
    # Fail here rather than saving an error page under a .png name.
    response.raise_for_status()
    # Open the file only once the data is available, and let the context
    # manager close it even if the write fails.
    with open('{}/{:0>2}.png'.format(folder, i), 'wb') as out:
        out.write(response.content)


# The download folder is the title with every space replaced by a hyphen.
folder = title.replace(' ', '-')
# exist_ok avoids the check-then-create race of exists() followed by mkdir().
os.makedirs(folder, exist_ok=True)
i = 0  # chunks queued so far; also the index given to the next chunk

downloads = []
with ThreadPoolExecutor(max_workers=NUM_OF_THREADS) as exe:
    for u in urls.splitlines():
        # Skip blank and whitespace-only lines: they are not URLs, and
        # queuing them would consume a chunk index for a download that
        # can only fail.
        if u.strip():
            print('Downloading chunk', i, 'of', title)
            downloads.append(exe.submit(downloadImage, folder, i, u))
            i += 1

# An exception raised in a worker stays hidden inside its Future until
# result() is called; re-raise the first failure instead of carrying on
# with missing chunks.
for download in downloads:
    download.result()

pgno = 1  # 1-based number of the next page image to create


def convertPage(f, folder, pgno):
    """Join chunk images into one page image with ``convert -append``.

    The result is written to ``<folder>/page_<pgno>.png``.

    Args:
        f: Whitespace-separated paths of the chunk images, in order.
        folder: Directory that receives the page image.
        pgno: Page number used in the output file name.

    Raises:
        FileNotFoundError: If the ``convert`` executable cannot be found.
        subprocess.CalledProcessError: If ``convert`` exits with a non-zero
            status.
    """
    # An argument list bypasses the shell, so characters such as quotes,
    # '&' or '$' in the folder name are passed through literally.
    command = ['convert', '-append', *f.split(),
               '{}/page_{}.png'.format(folder, pgno)]
    subprocess.run(command, check=True)


conversions = []
with ThreadPoolExecutor(max_workers=NUM_OF_THREADS) as exe:
    # Each page is built from up to six consecutive chunk images.
    for j in range(0, i, 6):
        f = ' '.join(['{}/{:0>2}.png'.format(folder, k) for k in range(j, min(i, j+6))])
        print('Converting page', pgno)
        conversions.append(exe.submit(convertPage, f, folder, pgno))
        pgno += 1

# Surface any conversion failure; otherwise the exception would be dropped
# together with its Future.
for conversion in conversions:
    conversion.result()

print('Converting to pdf')
# Page images are numbered from 1 up to (but not including) pgno.
pages = ['{}/page_{}.png'.format(folder, k) for k in range(1, pgno)]
# An argument list (rather than a shell string) keeps a title that contains
# spaces as a single output file name. check=True stops the script before
# 'Done' is printed if img2pdf fails.
subprocess.run(['img2pdf', *pages, '-o', title + '.pdf'], check=True)
print('Done')

## get_urls.py
/*
 * Click on a reading in the Perusall web interface,
 * and run this script in the developer console.
 * Copy-and-paste the console.info output to data.py.
 *
 * Every 2 seconds the script scrolls the last "img.chunk" element into
 * view. Once the number of chunks has failed to grow on enough polls, it
 * prints the image URLs and the last segment of the page path (as the
 * title) in data.py format, then stops polling.
 */
var len = 0;    // largest number of chunk images seen so far
var times = 0;  // polls on which the chunk count did not grow
var i = setInterval(() => {
  var img = document.querySelectorAll("img.chunk");
  // With no chunks on the page, img[img.length-1] is undefined; calling
  // scrollIntoView() on it would throw on every tick and the interval
  // would never be cleared.
  if (img.length > 0) {
    img[img.length-1].scrollIntoView();
  }
  if (len < img.length) {
    len = img.length;
  } else if (times > 3) {
    var urls = Array.from(img, (e) => e.src);
    var spl = location.pathname.split('/');
    console.info('urls = """\n'+urls.join('\n')+'\n"""\n\ntitle="'+spl[spl.length-1]+'"\n');
    clearInterval(i);
  } else {
    times++;
  }
}, 2000);
	# Article title. The download script derives the folder name from it
	# (spaces replaced by '-') and names the resulting PDF "<title>.pdf".
	title = "The title of the article"
	# Image chunk URLs, one per line; empty lines are skipped. Replace the
	# placeholder with the console.info output of the URL-scraping script.
	urls="""
	<image URLs scraped from the page>
	"""
	# dependencies: requests (Python package), ImageMagick (`convert` command), img2pdf
	# multithreaded by Ishan
	import os
	import requests
	from data import title, urls
	from concurrent.futures import ThreadPoolExecutor

	# Maximum number of worker threads given to each ThreadPoolExecutor below
	# (one pool for the downloads, one for the page conversions).
	NUM_OF_THREADS = 10


	def downloadImage(folder, i, u, timeout=60):
		"""Download one image chunk and save it inside ``folder``.

		The file is named after the chunk index, zero-padded to at least two
		digits, with a ``.png`` extension (for example ``folder/07.png``).

		Args:
			folder: Existing directory the image is written into.
			i: Chunk index used to build the file name.
			u: Image URL; surrounding whitespace is stripped before the request.
			timeout: Seconds to wait for the server before giving up.

		Raises:
			requests.RequestException: If the request fails, times out, or
				the server answers with an HTTP error status.
		"""
		response = requests.get(u.strip(), timeout=timeout)
		# Fail here rather than saving an error page under a .png name.
		response.raise_for_status()
		# Open the file only once the data is available, and let the context
		# manager close it even if the write fails.
		with open('{}/{:0>2}.png'.format(folder, i), 'wb') as out:
			out.write(response.content)


	# The download folder is the title with every space replaced by a hyphen.
	folder = title.replace(' ', '-')
	# exist_ok avoids the check-then-create race of exists() followed by mkdir().
	os.makedirs(folder, exist_ok=True)
	i = 0  # chunks queued so far; also the index given to the next chunk

	downloads = []
	with ThreadPoolExecutor(max_workers=NUM_OF_THREADS) as exe:
		for u in urls.splitlines():
			# Skip blank and whitespace-only lines: they are not URLs, and
			# queuing them would consume a chunk index for a download that
			# can only fail.
			if u.strip():
				print('Downloading chunk', i, 'of', title)
				downloads.append(exe.submit(downloadImage, folder, i, u))
				i += 1

	# An exception raised in a worker stays hidden inside its Future until
	# result() is called; re-raise the first failure instead of carrying on
	# with missing chunks.
	for download in downloads:
		download.result()

	pgno = 1  # 1-based number of the next page image to create


	def convertPage(f, folder, pgno):
		"""Join chunk images into one page image with ``convert -append``.

		The result is written to ``<folder>/page_<pgno>.png``.

		Args:
			f: Whitespace-separated paths of the chunk images, in order.
			folder: Directory that receives the page image.
			pgno: Page number used in the output file name.

		Raises:
			FileNotFoundError: If the ``convert`` executable cannot be found.
			subprocess.CalledProcessError: If ``convert`` exits with a
				non-zero status.
		"""
		# An argument list bypasses the shell, so characters such as quotes,
		# '&' or '$' in the folder name are passed through literally.
		command = ['convert', '-append', *f.split(),
			'{}/page_{}.png'.format(folder, pgno)]
		subprocess.run(command, check=True)


	conversions = []
	with ThreadPoolExecutor(max_workers=NUM_OF_THREADS) as exe:
		# Each page is built from up to six consecutive chunk images.
		for j in range(0, i, 6):
			f = ' '.join(['{}/{:0>2}.png'.format(folder, k) for k in range(j, min(i, j+6))])
			print('Converting page', pgno)
			conversions.append(exe.submit(convertPage, f, folder, pgno))
			pgno += 1

	# Surface any conversion failure; otherwise the exception would be
	# dropped together with its Future.
	for conversion in conversions:
		conversion.result()

	print('Converting to pdf')
	# Page images are numbered from 1 up to (but not including) pgno.
	pages = ['{}/page_{}.png'.format(folder, k) for k in range(1, pgno)]
	# An argument list (rather than a shell string) keeps a title that contains
	# spaces as a single output file name. check=True stops the script before
	# 'Done' is printed if img2pdf fails.
	subprocess.run(['img2pdf', *pages, '-o', title + '.pdf'], check=True)
	print('Done')
	/*
	 * Click on a reading in the Perusall web interface,
	 * and run this script in the developer console.
	 * Copy-and-paste the console.info output to data.py.
	 *
	 * Every 2 seconds the script scrolls the last "img.chunk" element into
	 * view. Once the number of chunks has failed to grow on enough polls, it
	 * prints the image URLs and the last segment of the page path (as the
	 * title) in data.py format, then stops polling.
	 */
	var len = 0;    // largest number of chunk images seen so far
	var times = 0;  // polls on which the chunk count did not grow
	var i = setInterval(() => {
		var img = document.querySelectorAll("img.chunk");
		// With no chunks on the page, img[img.length-1] is undefined; calling
		// scrollIntoView() on it would throw on every tick and the interval
		// would never be cleared.
		if (img.length > 0) {
			img[img.length-1].scrollIntoView();
		}
		if (len < img.length) {
			len = img.length;
		} else if (times > 3) {
			var urls = Array.from(img, (e) => e.src);
			var spl = location.pathname.split('/');
			console.info('urls = """\n'+urls.join('\n')+'\n"""\n\ntitle="'+spl[spl.length-1]+'"\n');
			clearInterval(i);
		} else {
			times++;
		}
	}, 2000);