Last active
December 23, 2015 05:19
-
-
Save oleeander/6586392 to your computer and use it in GitHub Desktop.
STealAllBooks from it-ebooks.info
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import requests | |
import atexit | |
import os | |
import signal | |
import sys | |
from pyquery import PyQuery | |
def latest_book_id():
    """Return, as an int, the book ID taken from the first matching link on the site's front page.

    Fetches http://it-ebooks.info/ and reads the ``href`` of the first anchor
    found inside a full-width table cell.
    """
    front_page = PyQuery(url='http://it-ebooks.info/')
    first_link = front_page('table[width="100%"] tr td a').eq(0)
    # The third '/'-separated piece of the href is used as the ID
    # (presumably hrefs look like "/book/<id>/" -- verify against the site).
    id_segment = first_link.attr.href.split('/')[2]
    return int(id_segment)
def download_ebook(directory_prefix, book_id):
    """Download the PDF of one book into ``directory_prefix``.

    Args:
        directory_prefix: Directory to save into; created if it is missing.
        book_id: Numeric ID of the book on it-ebooks.info.

    Returns:
        ``(book_id, filename)`` on success, or a human-readable error string
        when no download link is found or the server answers with an HTTP
        error status.

    Raises:
        NotADirectoryError: ``directory_prefix`` exists but is not a directory.
    """
    from urllib.parse import urljoin

    book_url = 'http://it-ebooks.info/book/{id}/'.format(id=book_id)
    book_page = PyQuery(url=book_url)
    book_pdf = book_page('table > tr').eq(15)('td').eq(1)('a').attr.href
    if book_pdf is None:
        return 'Unable to download #{book_id}. 404? {book_url}'.format(
            book_id=book_id,
            book_url=book_url
        )
    # urljoin resolves site-relative links ("/go.php?...") and leaves links
    # that are already absolute untouched.
    book_pdf = urljoin(book_url, book_pdf)

    if os.path.exists(directory_prefix) and not os.path.isdir(directory_prefix):
        raise NotADirectoryError('"{directory_prefix}" isn\'t a directory.'.format(
            directory_prefix=directory_prefix
        ))
    os.makedirs(directory_prefix, exist_ok=True)

    # One streamed GET serves both purposes: its headers (after redirects,
    # which a HEAD request would not follow by default) give the file name,
    # and its body is the file content.
    book_dl = requests.get(book_pdf, headers={'referer': book_url}, stream=True)
    try:
        if not book_dl.ok:
            # Do not save an HTML error page under a .pdf name.
            return 'Unable to download #{book_id}. HTTP {status}: {book_pdf}'.format(
                book_id=book_id,
                status=book_dl.status_code,
                book_pdf=book_pdf
            )
        filename = _ebook_filename(book_dl.headers, book_id, book_pdf)
        # The file is opened only once the response is known to be good, so a
        # failed request does not leave an empty file behind.
        with open(os.path.join(directory_prefix, filename), 'wb') as pdf:
            for block in book_dl.iter_content(1024):
                if block:  # skip empty keep-alive chunks instead of stopping early
                    pdf.write(block)
    finally:
        book_dl.close()
    return (book_id, filename)


def _ebook_filename(headers, book_id, book_pdf):
    """Choose the local file name for a download.

    Prefers the ``filename`` parameter of the Content-Disposition header and
    falls back to a name built from the book ID and the last path component
    of the download URL.
    """
    from email.message import Message

    disposition = headers.get('content-disposition')
    if disposition:
        # Let the stdlib header parser handle quoting and extra parameters.
        parsed = Message()
        parsed['content-disposition'] = disposition
        # basename() drops directory parts so the server-supplied name cannot
        # point outside directory_prefix.
        filename = os.path.basename(parsed.get_filename() or '')
        if filename:
            return filename
    return 'it-ebooks_info_{book_id}_{name}.pdf'.format(
        book_id=book_id,
        name=os.path.basename(book_pdf)
    )
def save_last_id(last_book_id=None):
    """Persist the ID of the book being processed so a later run can resume.

    The ID is written as text to the file ``STAB_last_id`` in the current
    working directory.

    Args:
        last_book_id: ID to store. When omitted, the module-level ``book_id``
            (the download loop's variable) is used.

    If no ID is available at all -- for example the download loop never
    started -- the file is left untouched, so previously saved progress is
    not wiped.
    """
    if last_book_id is None:
        last_book_id = globals().get('book_id')
    if last_book_id is None:
        return
    # Open only after an ID is known: opening with 'w' truncates the file.
    with open('STAB_last_id', 'w') as last_id:
        last_id.write(str(last_book_id))
if __name__ == '__main__':
    # Command-line entry point: download every book from start_id up to and
    # including end_id, remembering progress in 'STAB_last_id' on exit.
    parser = argparse.ArgumentParser(description='STealAllBooks from it-ebooks.info')
    parser.add_argument('start_id', nargs='?', metavar='start_id', type=int,
                        default=1,
                        help='ID of first ebook')
    # The default is resolved after parsing so that --help and an explicit
    # end_id do not trigger a network request.
    parser.add_argument('end_id', nargs='?', metavar='end_id', type=int,
                        default=None,
                        help='ID of last ebook')
    parser.add_argument('--directory_prefix', '-P', metavar='directory', type=str,
                        default='.',
                        help='The directory where all ebooks will be saved to.')
    args = parser.parse_args()
    args.directory_prefix = os.path.normpath(args.directory_prefix)
    if args.end_id is None:
        args.end_id = latest_book_id()

    # A start_id of 1 (the default) means "resume from the saved position if
    # there is one".
    if args.start_id == 1:
        try:
            with open('STAB_last_id') as last_id:
                args.start_id = int(last_id.read())
        except (OSError, ValueError):
            # No usable progress file: start from the beginning.
            pass

    atexit.register(save_last_id)
    # Turn SIGTERM into a normal interpreter exit so the atexit hook runs.
    signal.signal(signal.SIGTERM, lambda signum, frame: sys.exit(1))
    try:
        # end_id is documented as the ID of the last ebook, so include it.
        # 'book_id' must stay a module-level name: save_last_id() reads it.
        for book_id in range(args.start_id, args.end_id + 1):
            print('Downloading ebook #{id}...'.format(id=book_id))
            print(download_ebook(args.directory_prefix, book_id))
    except KeyboardInterrupt:
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment