Download all free ebooks (epub, mobi, pdf) from oreilly.com
#!/usr/bin/env python
#
# Matthew Rothfuss
# 10/13/2016
#
# Purpose:
# Download all free ebooks (epub, mobi, pdf) from oreilly.com
#
# Overview:
# Runs downloads in parallel worker processes, based on cpu count * 2 (e.g. 8 CPUs * 2 = 16 workers)
# Change "process_count" to the desired worker count if the cpu count causes issues
#
# Original:
# https://www.reddit.com/r/Python/comments/56syaa/7_oreilly_python_books_for_free/d8n6597
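#
# Example (illustrative title, not a real book): each catalog page URL is rewritten
# into direct file URLs, so
#   http://www.oreilly.com/programming/free/some-title.csp
# becomes
#   http://www.oreilly.com/programming/free/files/some-title.pdf (and .mobi, .epub)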
import multiprocessing
import requests
import re
import os
####################################################################################################
def is_same_file_size(filepath, url):
    """Check the local file size against the remote ebook's content size"""
    # Read the remote file size in bytes from the response headers
    r = requests.head(url)
    remote_size = float(r.headers['Content-Length'])
    # Local file size in bytes, or 0 if the file does not exist
    try:
        local_size = float(os.path.getsize(filepath))
    except OSError:
        local_size = 0.0
    return remote_size == local_size
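# NOTE: the size check assumes the server returns a Content-Length header for HEAD
# requests; if that header is missing, the lookup above raises a KeyError.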
####################################################################################################
def ensure_dir(path):
    """Create the directory (and any parents) if it does not already exist"""
    try:
        os.makedirs(path, exist_ok=True)
    except TypeError:
        # Python 2.x fallback: makedirs() has no exist_ok argument there
        if os.path.exists(path):
            if os.path.isfile(path):
                raise Exception("Error: Directory path already exists as a file. {}".format(path))
        else:
            os.makedirs(path)
####################################################################################################
def download_file(url):
    """
    Download an ebook.
    Ensures the download directory, based on category and book title.
    Downloads updated versions of existing ebooks by checking the
    local file size against the remote content size.
    """
    dl_dir = url.split('/', 4)[3]  # category segment of the URL
    # http://stackoverflow.com/a/16696317
    filename = url.split('/')[-1]
    file_dir = "".join(filename.split('.')[:-1])
    filepath = os.path.abspath("./{}/{}/{}".format(dl_dir, file_dir, filename))
    # Test if the file already exists
    if os.path.isfile(filepath):
        # Skip when the file sizes match
        if is_same_file_size(filepath, url):
            print("SKIP Existing {}".format(filepath))
            return
    else:
        print("NEW {}".format(filename))
    # Download and save the ebook
    try:
        r = requests.get(url, stream=True)  # NOTE the stream=True parameter
        r.raise_for_status()  # raise an error on a bad HTTP status
        ensure_dir(os.path.abspath("./{}/{}".format(dl_dir, file_dir)))  # ensure the download directory
        with open(filepath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
        print("SAVED '{}'".format(url))
        return filename
    except Exception as e:
        print(u"ERROR: Cannot handle '{}' >>> ({})".format(url, e))
        return
####################################################################################################
def download_all_from_urls(endings, urls):
    """
    Format catalog page URLs into direct ebook URLs.
    Spawn the downloads across a pool of worker processes.
    """
    combinations = (url.replace('.csp', ending)
                       .replace('/free/', '/free/files/')
                    for url in urls
                    for ending in endings)
    process_count = multiprocessing.cpu_count() * 2  # worker count based on cpu count
    #process_count = 4  # hard-coded worker count
    pool = multiprocessing.Pool(processes=process_count)
    pool.map(download_file, combinations)
    pool.close()  # no more tasks
    pool.join()   # wrap up current tasks
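# NOTE: cpu_count()*2 oversubscribes the pool on purpose, presumably because the
# workers spend most of their time blocked on network I/O rather than the CPU.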
####################################################################################################
def get_urls(dl_type):
    """Parse an oreilly.com category page for all ebook page URLs"""
    data = requests.get('http://www.oreilly.com/{}/free/'.format(dl_type))
    # Don't worry, I'm not _parsing_ html with regex. Merely scraping it. :)
    pattern = re.compile(r'http://www\.oreilly\.com/{}/free.*\.csp'.format(dl_type))
    return pattern.findall(data.text)
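# A matched URL looks like (hypothetical title):
#   http://www.oreilly.com/programming/free/some-title.csp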
####################################################################################################
def get_main_urls():
    """Get all the category names whose pages list free ebooks"""
    data = requests.get('http://www.oreilly.com/programming/free/')
    # Match just the category name, not the whole URL
    pattern = re.compile(r'(?<=oreilly\.com/)(.*)(?=/free/)')
    # Remove duplicates
    return list(set(pattern.findall(data.text)))
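# The lookbehind/lookahead pair captures only the category segment, e.g. the link
# http://www.oreilly.com/data/free/ yields 'data'.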
####################################################################################################
"""Main Program"""
file_types = ['.pdf', '.mobi', '.epub']
download_types = get_main_urls()
for download_type in download_types:
urls = get_urls(download_type)
download_all_from_urls(file_types, urls)
#!/usr/bin/env python3
#
# Matthew Rothfuss
# First: 10/13/2016
# Edit: 03/07/2018
#
# Purpose:
# Download all free ebooks (epub, mobi, pdf) from oreilly.com
#
# Overview:
# Runs downloads in parallel worker processes, based on cpu count * 2 (e.g. 8 CPUs * 2 = 16 workers)
# Change "processes" to the desired worker count if the cpu count causes issues
# Can specify custom "categories" and "etypes"
# Set "verbose" (the -v flag) to print all output
#
# Original:
# https://www.reddit.com/r/Python/comments/56syaa/7_oreilly_python_books_for_free/d8n6597
from argparse import ArgumentParser as AP
from lxml import html as HTML
import multiprocessing
import argcomplete
import requests
import re
import os
####################################################################################################
class OReillyEbookFreeScraper(object):

    def __init__(self, categories=None, etypes=None, processes=None, outdir=None, dryrun=False, verbose=0):
        self.base_url = 'http://www.oreilly.com'
        self.dryrun = dryrun
        self.verbose = verbose > 0
        self.outdir = outdir
        self.categories = list()
        if categories:
            self.categories = categories if isinstance(categories, list) else [categories]
        self.ebook_types = ['.pdf', '.mobi', '.epub']
        if etypes:
            self.ebook_types = etypes if isinstance(etypes, list) else [etypes]
            # Normalize any types given without a leading dot (e.g. 'pdf' -> '.pdf')
            if any(not ebt.startswith('.') for ebt in self.ebook_types):
                self.ebook_types = [('.' + t if not t.startswith('.') else t) for t in self.ebook_types]
        self.processes = processes if processes else multiprocessing.cpu_count() * 2
        self.verify_outdir()  # set up the output directory
        if self.dryrun:
            print("*" + " " * 78 + "*")
            print("******************************* Dry-Run Mode *********************************")
            print("*" + " " * 78 + "*")
        print("\nSaving all free ebooks ({}) from oreilly.com".format(', '.join(self.ebook_types)))
        print("*" * 80)
        self.get_all_free_ebook_urls()
        print("Done")
    def is_same_file_size(self, filepath, url):
        """Check the local file size against the remote ebook's content size"""
        # Read the remote file size in bytes from the response headers
        r = requests.head(url)
        remote_size = float(r.headers['Content-Length'])
        # Local file size in bytes, or 0 if the file does not exist
        try:
            local_size = float(os.path.getsize(filepath))
        except OSError:
            local_size = 0.0
        return remote_size == local_size
    def ensure_dir(self, path):
        """Create the directory (and any parents) if it does not already exist"""
        try:
            os.makedirs(path, exist_ok=True)
        except TypeError:
            # Python 2.x fallback: makedirs() has no exist_ok argument there
            if os.path.exists(path):
                if os.path.isfile(path):
                    raise Exception("Error: Directory path already exists as a file. {}".format(path))
            else:
                os.makedirs(path)

    def verify_outdir(self):
        """Create the output directory unless this is a dry run"""
        if not self.dryrun:
            self.ensure_dir(self.outdir)
    def download(self, url):
        """
        Download an ebook.
        Ensures the download directory, based on category and book title.
        Downloads updated versions of existing ebooks by checking the
        local file size against the remote content size.
        """
        dl_dir = url.split('/', 4)[3]  # category segment of the URL
        # http://stackoverflow.com/a/16696317
        filename = url.split('/')[-1]
        file_dir = "".join(filename.split('.')[:-1])
        filepath = os.path.abspath(self.outdir + "/{}/{}/{}".format(dl_dir, file_dir, filename))
        # Test if the file already exists
        if os.path.isfile(filepath):
            # Skip when the file sizes match
            if self.is_same_file_size(filepath, url):
                if self.verbose:
                    print("SKIP: Existing {}".format(filepath))
                return
        elif self.verbose:
            print("NEW: {}".format(url))
        # Skip the download on a dry run
        if self.dryrun:
            print("[dryrun]: {} --> {}".format(filename, filepath))
            return
        # Download and save the ebook
        try:
            r = requests.get(url, stream=True)  # NOTE the stream=True parameter
            r.raise_for_status()  # raise an error on a bad HTTP status
            self.ensure_dir(os.path.abspath(self.outdir + "/{}/{}".format(dl_dir, file_dir)))  # ensure the download directory
            with open(filepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
            print("SAVED: {}".format(url))
            return filename
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                if self.verbose:
                    print(u"WARN: {} file does NOT exist for '{}'".format(url.split('.')[-1], url))
            else:
                print(u"ERROR: <HTTP {}> Cannot handle '{}'".format(e.response.status_code, url))
        except Exception as e:
            print(u"ERROR: Cannot handle '{}' >>> ({})".format(url, e))
        return
    def expand_url(self, url):
        """Resolve a landing-page link to its real ebook page URL, keeping only '/free/' pages"""
        try:
            r = requests.head(url)
            r.raise_for_status()
            rurl = r.url
            if 'location' in r.headers:
                rurl = r.headers['location']
            if rurl.endswith('.do'):
                return
            elif rurl.split('/', 5)[4] != 'free':
                return
            return rurl
        except Exception:
            return
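    # Example (hypothetical): a shortened landing-page link that redirects via its
    # Location header to http://www.oreilly.com/programming/free/some-title.csp is
    # returned as that expanded URL; redirects outside a '/free/' path are dropped.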
    def pa(self, cat):
        """Map a landing-page category name to the category used in download paths"""
        if cat == "ai":
            return "data"
        elif cat == "software-engineering":
            return "programming"
        elif cat == "operations":
            return "webops-perf"
        elif cat == "web-programming":
            return "web-platform"
        else:
            return cat

    def category_from_url(self, url):
        """Extract the category segment from an ebook page URL"""
        surl = url.split('/', 5) if url else ''
        return surl[3] if surl else ''
    def get_dl_links(self, url):
        """Submit a page's free-ebook request form and build the direct download URLs"""
        try:
            req = requests.get(url)
            req.raise_for_status()
        except Exception as e:
            if self.verbose:
                print(u"WARN: Cannot handle '{}' >>> ({})".format(url, e))
            return list()
        try:
            html = HTML.fromstring(req.text)
        except Exception as e:
            if self.verbose:
                print(u"WARN: Cannot find HTML in '{}' >>> ({})".format(url, e))
            return list()
        # Collect the hidden input fields of the download-request form
        post_vals = dict()
        for item in html.xpath('//form[@method="post"]'):
            keys = item.xpath('./input/@name')
            values = item.xpath('./input/@value')
            post_vals = dict(zip(keys, values))
        # Fill in the user-facing form fields with placeholder values
        post_vals.update(
            {
                'first': 'a', 'last': 'b', 'email': 'a@b.com',
                'newsletter': 'nl_webops_perf', 'x-a': 'Get Your Free Ebook',
                'x-redirect': url + "?download=true"
            }
        )
        purl = self.base_url + "/cs/user/create/download_requests"
        try:
            req2 = requests.post(purl, data=post_vals)
            req2.raise_for_status()
        except Exception as e:
            if self.verbose:
                print(u"WARN: Cannot handle '{}' >>> ({})".format(purl, e))
            return list()
        # The response embeds the book's short link in a JavaScript variable
        r = re.search(r"var\s*?shortLink\s*?=\s*?['\"]([^'\"]+)['\"]", req2.text)
        if not r:
            return list()
        slink = r.group(1).strip()
        dlurl = self.base_url + '/' + self.pa(self.category_from_url(url)) + '/free/files/' + slink
        return [dlurl + t for t in self.ebook_types]
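    # Example (hypothetical shortLink 'some-title' in the 'programming' category):
    #   ['http://www.oreilly.com/programming/free/files/some-title.pdf',
    #    'http://www.oreilly.com/programming/free/files/some-title.mobi',
    #    'http://www.oreilly.com/programming/free/files/some-title.epub']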
    def get_all_free_ebook_urls(self):
        """Collect all free ebook URLs and download them"""
        furl = self.base_url + '/free/'
        try:
            req = requests.get(furl)
            req.raise_for_status()
        except Exception as e:
            print(u"ERROR: Cannot handle '{}' >>> ({})".format(furl, e))
            return list()
        html = HTML.fromstring(req.text)
        eurls = html.xpath('//div[@name="FreeEbooks"]//a[@class="item-title"]/@href')
        pool = multiprocessing.Pool(processes=self.processes)
        urls = pool.map(self.expand_url, eurls)
        urls = list(sorted([u for u in set(urls) if u]))
        if self.categories:
            urls = list(set([u for u in urls if self.category_from_url(u) in self.categories]))
            categories = self.categories
        else:
            categories = pool.map(self.category_from_url, urls)
            categories = list(sorted([c for c in set(categories) if c]))  # remove duplicates
        print("Downloading available free ebooks from the following categories:\n{}".format(', '.join(categories)))
        print("*" * 80)
        print("Saving to the following directory:\n{}".format(os.path.abspath(self.outdir)))
        print("*" * 80)
        dl_urls = pool.map(self.get_dl_links, urls)
        dl_flat_list = [item for sublist in dl_urls for item in sublist]
        pool.map(self.download, dl_flat_list)
        pool.close()  # no more tasks
        pool.join()   # wrap up current tasks
        if self.dryrun:
            print("\nINFO: {} possible ebooks in '{}' categories.".format(
                len(dl_flat_list) // len(self.ebook_types), ', '.join(categories)))
        return
####################################################################################################
if __name__ == "__main__":
"""
Main Program
Examples:
1 : oreillyurls_2.py -p 4
2 : oreillyurls_2.py -c 'data'
3 : oreillyurls_2.py -c ['programming', 'data'] -o ~/Downloads/ebooks-tmp
"""
parser = AP(description='Download all free ebooks (epub, mobi, pdf) from oreilly.com')
parser.add_argument('-c', '--categories', dest='categories', help='Specify custom "categories". Can be single or list.')
parser.add_argument('-t', '--types', dest='etypes', help='Specify custom "ebook types". Can be single or list.')
parser.add_argument('-p', '--processes', dest='processes', help='Specify custom processes count. Used in multithreading.')
parser.add_argument('-o', '--out', dest='outdir', default='ebooks-oreilly', help='Output directory')
parser.add_argument('-d', '--dryrun', dest='dryrun', default=False, action='store_true', help='Do a dry run without downloading.')
parser.add_argument('-v', '--verbose', dest='verbose', default=0, action="count", help='Enable/Disable verbose logging.')
argcomplete.autocomplete(parser)
args = parser.parse_args()
OReillyEbookFreeScraper(**vars(args))
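# A hypothetical invocation: `python3 oreillyurls_2.py -c programming data -t pdf -d`
# would list what the scraper intends to download without writing any files.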