# sumeet-bansal/extractor.py

## extractor.py
from bs4 import BeautifulSoup   # for parsing HTML
import os                       # for managing files
import sys                      # for cleaner stdout
import urllib.request           # for downloading images
from fpdf import FPDF           # for generating PDFs

# Converts a saved reader page into a PDF: every linked page image hosted on
# cdn.alexanderstreet.com is downloaded into a scratch directory, placed on its
# own PDF page, and the scratch files are removed afterwards.

# page content saved as HTML file
inputfile = 'MUS-17-Tricia-Rose-reading.html'
# the PDF is named after the input, swapping the 5-character '.html' suffix
output = inputfile[:-5] + '.pdf'

# sets up BeautifulSoup HTML parser; the context manager closes the file
# handle as soon as the markup has been read
with open(inputfile, encoding='utf8') as page:
    body = page.read()
soup = BeautifulSoup(body, 'html.parser')

# retrieves all images
directory = 'images/'
images = []
os.makedirs(directory, exist_ok=True)
try:
    for link in soup.find_all('a'):
        href = link.get('href')
        # anchors without an href, or pointing elsewhere, are skipped
        if href is None or 'cdn.alexanderstreet.com' not in str(href):
            continue
        href = str(href)
        # The local name is cut from fixed offsets at the end of the URL:
        # six characters starting 24 from the end, plus the last four.
        # NOTE(review): presumably a page id and the file extension --
        # confirm against the CDN's URL format.
        filename = directory + href[-24:-18] + href[-4:]
        sys.stdout.write("\rRetrieving image: " + href)
        sys.stdout.flush()
        # registered before the download so that a partially written file
        # is still removed by the cleanup below
        images.append(filename)
        urllib.request.urlretrieve(href, filename)
    print("\nRetrieved %s images." % len(images))

    # writes PDF
    print()
    pdf = FPDF()
    for image in images:
        pdf.add_page()
        # 210 x 297 is A4 in millimetres, matching FPDF's default page
        # setup, so each image is stretched over the whole page
        pdf.image(image, 0, 0, 210, 297)
    print("Writing PDF...")
    pdf.output(output, 'F')
    print("Successfully wrote PDF: " + output)
finally:
    # cleans up, also when a download or the PDF write failed part-way
    for image in images:
        try:
            os.remove(image)
        except FileNotFoundError:
            # already removed (same image linked twice) or never created
            pass
    # raises OSError if the directory still holds files this run did not
    # create; those are deliberately left untouched
    os.rmdir(directory)
    print("Cleaned up.")
	from bs4 import BeautifulSoup # for parsing HTML
	import os # for managing files
	import sys # for cleaner stdout
	import urllib.request # for downloading images
	from fpdf import FPDF # for generating PDFs

	# NOTE(review): this section repeats the script above; its nested
	# indentation had been flattened, leaving the if/for statements without
	# bodies. The structure is restored here with tab indentation.

	# page content saved as HTML file
	inputfile = 'MUS-17-Tricia-Rose-reading.html'
	# the PDF is named after the input, swapping the 5-character '.html' suffix
	output = inputfile[:-5] + '.pdf'

	# sets up BeautifulSoup HTML parser; the context manager closes the file
	# handle as soon as the markup has been read
	with open(inputfile, encoding='utf8') as page:
		body = page.read()
	soup = BeautifulSoup(body, 'html.parser')

	# retrieves all images
	directory = 'images/'
	images = []
	os.makedirs(directory, exist_ok=True)
	try:
		for link in soup.find_all('a'):
			href = link.get('href')
			# anchors without an href, or pointing elsewhere, are skipped
			if href is None or 'cdn.alexanderstreet.com' not in str(href):
				continue
			href = str(href)
			# The local name is cut from fixed offsets at the end of the URL:
			# six characters starting 24 from the end, plus the last four.
			# NOTE(review): presumably a page id and the file extension --
			# confirm against the CDN's URL format.
			filename = directory + href[-24:-18] + href[-4:]
			sys.stdout.write("\rRetrieving image: " + href)
			sys.stdout.flush()
			# registered before the download so that a partially written file
			# is still removed by the cleanup below
			images.append(filename)
			urllib.request.urlretrieve(href, filename)
		print("\nRetrieved %s images." % len(images))

		# writes PDF
		print()
		pdf = FPDF()
		for image in images:
			pdf.add_page()
			# 210 x 297 is A4 in millimetres, matching FPDF's default page
			# setup, so each image is stretched over the whole page
			pdf.image(image, 0, 0, 210, 297)
		print("Writing PDF...")
		pdf.output(output, 'F')
		print("Successfully wrote PDF: " + output)
	finally:
		# cleans up, also when a download or the PDF write failed part-way
		for image in images:
			try:
				os.remove(image)
			except FileNotFoundError:
				# already removed (same image linked twice) or never created
				pass
		# raises OSError if the directory still holds files this run did not
		# create; those are deliberately left untouched
		os.rmdir(directory)
		print("Cleaned up.")