khaledkee/Springer Ebooks

## collector.py
from PyPDF2 import PdfFileReader
from bs4 import BeautifulSoup
from urllib.request import urlopen
import tqdm
import os
import re

# Parse the catalogue PDF into a mapping of book title -> landing-page URL.
#
# The extracted text is scanned page by page for the running entry number
# (1, 2, 3, ...) on a line of its own. The line right after that number is
# taken as the title, and the first following line that starts with
# URL_PREFIX is taken as the URL of that book.
with open('Springer Ebooks.pdf', 'rb') as PDFFile:
    PDF = PdfFileReader(PDFFile)
    pages = PDF.getNumPages()

    # constants
    BOOKS_NO = 408  # NOTE(review): never read anywhere in the visible code
    URL_PREFIX = "http"

    # dictionary of title -> url
    urls = dict()

    # Construct the dictionary
    bookIndex = 1
    # Title whose url has not been found yet. It deliberately survives a page
    # change so that an entry split across two pages is still completed
    # instead of running off the end of the page.
    title = None
    for page in range(pages):
        pageObject = PDF.getPage(page)
        lines = pageObject.extractText().split('\n')
        currentLine = 0
        while currentLine < len(lines):
            if title is None:
                # find line with book index
                # isdecimal() rather than isnumeric(): isnumeric() also accepts
                # characters such as superscripts and fractions, which int()
                # rejects with ValueError.
                while currentLine + 1 < len(lines) and (
                        not lines[currentLine].isdecimal() or int(lines[currentLine]) != bookIndex):
                    currentLine += 1
                if currentLine + 1 >= len(lines):
                    # no further entry starts on this page
                    break
                title = lines[currentLine + 1]
            # move to the url, without stepping past the last line of the page
            while currentLine < len(lines) and not lines[currentLine].startswith(URL_PREFIX):
                currentLine += 1
            if currentLine >= len(lines):
                # the url is not on this page; keep the title pending and
                # continue the search on the next page
                break
            url = lines[currentLine]
            urls[title] = url
            title = None
            bookIndex += 1

BOOKS_DIR = "books"
PDF_EXT = ".pdf"
PDF_LINK_ATTRIB = {'title': 'Download this book in PDF format'}
ANCHOR_TAG = 'a'
ANCHOR_LINK_ATTRIB = 'href'
SPRINGER_PATH = 'https://link.springer.com'
# Downloads are written under this temporary suffix and renamed once complete,
# so an interrupted run never leaves a truncated file that looks finished.
PARTIAL_EXT = ".part"
# Number of bytes copied per read while streaming a book to disk.
CHUNK_SIZE = 1 << 16

# Download the books
#
# For every title in `urls`: open the book's landing page, locate the anchor
# whose title attribute matches PDF_LINK_ATTRIB and save its target as
# BOOKS_DIR/<title>.pdf. Books that already exist on disk are skipped, so the
# script can simply be re-run to resume.
os.makedirs(BOOKS_DIR, exist_ok=True)
for title in tqdm.tqdm(urls):
    url = urls[title]
    # escape / to not make python think it's a directory
    file_name = title.replace("/", ",").strip()
    fullfilename = os.path.join(BOOKS_DIR, file_name + PDF_EXT)
    if os.path.exists(fullfilename):
        # already downloaded: skip before touching the network at all
        continue
    partialname = fullfilename + PARTIAL_EXT
    try:
        # the response is closed as soon as the page has been parsed
        with urlopen(url) as html_page:
            soup = BeautifulSoup(html_page, 'html.parser')
        for link in soup.findAll(ANCHOR_TAG, attrs=PDF_LINK_ATTRIB):
            href = link.get(ANCHOR_LINK_ATTRIB)
            if not href:
                # anchor without a target: nothing to download from it
                continue
            pdfLink = SPRINGER_PATH + href
            with urlopen(pdfLink) as file_resp, open(partialname, "wb") as binfile:
                # stream in chunks instead of holding the whole book in memory
                for chunk in iter(lambda: file_resp.read(CHUNK_SIZE), b""):
                    binfile.write(chunk)
            os.replace(partialname, fullfilename)
            # one file per title: further matching links would be skipped anyway
            break
    except OSError as err:
        # urllib's URLError/HTTPError derive from OSError; report the failure
        # and carry on so one bad book does not abort the whole batch.
        tqdm.tqdm.write("Skipping %r: %s" % (title, err))

## Springer Ebooks

      
    Raw
  

              Springer Ebooks
            
          
            View raw
	from PyPDF2 import PdfFileReader
	from bs4 import BeautifulSoup
	from urllib.request import urlopen
	import tqdm
	import os
	import re

	# Parse the catalogue PDF into a mapping of book title -> landing-page URL.
	#
	# The extracted text is scanned page by page for the running entry number
	# (1, 2, 3, ...) on a line of its own. The line right after that number is
	# taken as the title, and the first following line that starts with
	# URL_PREFIX is taken as the URL of that book.
	with open('Springer Ebooks.pdf', 'rb') as PDFFile:
	    PDF = PdfFileReader(PDFFile)
	    pages = PDF.getNumPages()

	    # constants
	    BOOKS_NO = 408  # NOTE(review): never read anywhere in the visible code
	    URL_PREFIX = "http"

	    # dictionary of title -> url
	    urls = dict()

	    # Construct the dictionary
	    bookIndex = 1
	    # Title whose url has not been found yet. It deliberately survives a page
	    # change so that an entry split across two pages is still completed
	    # instead of running off the end of the page.
	    title = None
	    for page in range(pages):
	        pageObject = PDF.getPage(page)
	        lines = pageObject.extractText().split('\n')
	        currentLine = 0
	        while currentLine < len(lines):
	            if title is None:
	                # find line with book index
	                # isdecimal() rather than isnumeric(): isnumeric() also accepts
	                # characters such as superscripts and fractions, which int()
	                # rejects with ValueError.
	                while currentLine + 1 < len(lines) and (
	                        not lines[currentLine].isdecimal() or int(lines[currentLine]) != bookIndex):
	                    currentLine += 1
	                if currentLine + 1 >= len(lines):
	                    # no further entry starts on this page
	                    break
	                title = lines[currentLine + 1]
	            # move to the url, without stepping past the last line of the page
	            while currentLine < len(lines) and not lines[currentLine].startswith(URL_PREFIX):
	                currentLine += 1
	            if currentLine >= len(lines):
	                # the url is not on this page; keep the title pending and
	                # continue the search on the next page
	                break
	            url = lines[currentLine]
	            urls[title] = url
	            title = None
	            bookIndex += 1

	BOOKS_DIR = "books"
	PDF_EXT = ".pdf"
	PDF_LINK_ATTRIB = {'title': 'Download this book in PDF format'}
	ANCHOR_TAG = 'a'
	ANCHOR_LINK_ATTRIB = 'href'
	SPRINGER_PATH = 'https://link.springer.com'
	# Downloads are written under this temporary suffix and renamed once complete,
	# so an interrupted run never leaves a truncated file that looks finished.
	PARTIAL_EXT = ".part"
	# Number of bytes copied per read while streaming a book to disk.
	CHUNK_SIZE = 1 << 16

	# Download the books
	#
	# For every title in `urls`: open the book's landing page, locate the anchor
	# whose title attribute matches PDF_LINK_ATTRIB and save its target as
	# BOOKS_DIR/<title>.pdf. Books that already exist on disk are skipped, so the
	# script can simply be re-run to resume.
	os.makedirs(BOOKS_DIR, exist_ok=True)
	for title in tqdm.tqdm(urls):
	    url = urls[title]
	    # escape / to not make python think it's a directory
	    file_name = title.replace("/", ",").strip()
	    fullfilename = os.path.join(BOOKS_DIR, file_name + PDF_EXT)
	    if os.path.exists(fullfilename):
	        # already downloaded: skip before touching the network at all
	        continue
	    partialname = fullfilename + PARTIAL_EXT
	    try:
	        # the response is closed as soon as the page has been parsed
	        with urlopen(url) as html_page:
	            soup = BeautifulSoup(html_page, 'html.parser')
	        for link in soup.findAll(ANCHOR_TAG, attrs=PDF_LINK_ATTRIB):
	            href = link.get(ANCHOR_LINK_ATTRIB)
	            if not href:
	                # anchor without a target: nothing to download from it
	                continue
	            pdfLink = SPRINGER_PATH + href
	            with urlopen(pdfLink) as file_resp, open(partialname, "wb") as binfile:
	                # stream in chunks instead of holding the whole book in memory
	                for chunk in iter(lambda: file_resp.read(CHUNK_SIZE), b""):
	                    binfile.write(chunk)
	            os.replace(partialname, fullfilename)
	            # one file per title: further matching links would be skipped anyway
	            break
	    except OSError as err:
	        # urllib's URLError/HTTPError derive from OSError; report the failure
	        # and carry on so one bad book does not abort the whole batch.
	        tqdm.tqdm.write("Skipping %r: %s" % (title, err))