Forked from bielfrontera/oreilly_free_ebooks.py
Last active
November 12, 2018 18:00
-
-
Save dubirajara/43c1ed4a340817ef93c4899b02c9ae5e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import os | |
import shutil | |
import requests | |
import re | |
# Topic slugs to crawl. Each one is used both as a path segment in the
# O'Reilly URLs and as the name of the local download directory.
TOPIC_LIST = [
    "programming",
    "web-platform",
    "security",
    "iot",
    "data",
    "business",
    "webops-perf",
]
def download_free_ebooks(ebook_topic, ebook_formats=('epub', 'mobi', 'pdf')):
    """Download every free ebook listed for a topic, in each requested format.

    Files are written into a directory named after the topic, relative to
    the current working directory; the directory is created when missing.

    Args:
        ebook_topic: Topic slug, used in the index URL and as the name of
            the local download directory.
        ebook_formats: Iterable of file extensions to fetch for each book.
            The default is an immutable tuple so it cannot be altered
            between calls.

    Raises:
        OSError: If the topic directory cannot be created and does not
            already exist as a directory.
    """
    # Try to create the directory and tolerate "already exists" instead of
    # checking first, which leaves a window between the check and the mkdir.
    try:
        os.mkdir(ebook_topic)
    except OSError:
        if not os.path.isdir(ebook_topic):
            raise

    for ebook in get_free_ebook_list(ebook_topic):
        for ebook_format in ebook_formats:
            download_ebook(ebook_topic, ebook, ebook_format)
def download_ebook(ebook_topic, ebook_slug, ebook_format):
    """Download a single ebook file and save it under the topic directory.

    The source URL comes from get_ebook_url and the destination path from
    get_ebook_filename. When the server answers with a non-OK response the
    download is skipped and no file is written.

    Args:
        ebook_topic: Topic slug the book belongs to.
        ebook_slug: Book identifier as found on the topic index page.
        ebook_format: File extension to download (e.g. 'pdf').
    """
    ebook_url = get_ebook_url(ebook_topic, ebook_slug, ebook_format)
    ebook_filename = get_ebook_filename(ebook_topic, ebook_slug, ebook_format)
    response = requests.get(ebook_url, stream=True)
    try:
        if response.ok:
            with open(ebook_filename, 'wb') as out_file:
                # iter_content applies content-encoding decoding (gzip,
                # deflate); copying response.raw would store the bytes
                # exactly as they came over the wire.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    out_file.write(chunk)
    finally:
        # A streamed response holds its connection until closed explicitly.
        response.close()
def get_free_ebook_list(ebook_topic):
    """Fetch a topic's free-ebook index page and return the book slugs on it.

    Args:
        ebook_topic: Topic slug inserted into the index page URL.

    Returns:
        The list produced by get_ebook_list_from_content for the page body,
        or an empty list when the index request does not succeed.
    """
    index_url = 'http://www.oreilly.com/{topic}/free/'.format(topic=ebook_topic)
    response = requests.get(index_url)
    if not response.ok:
        # No usable index page: report no books rather than failing.
        return []
    return get_ebook_list_from_content(ebook_topic, response.content)
def get_ebook_list_from_content(ebook_topic, html_content):
    """Extract ebook slugs from the HTML of a topic's free-ebook index page.

    A slug is the ``<slug>`` part of any link of the form
    ``http://www.oreilly.com/<topic>/free/<slug>.csp`` found in the page.

    Args:
        ebook_topic: Topic slug that appears in the link URLs.
        html_content: Page body as UTF-8 encoded bytes.

    Returns:
        List of distinct slugs, in order of first appearance.

    Raises:
        UnicodeDecodeError: If html_content is not valid UTF-8.
    """
    # Escape the literal URL prefix (including the topic) and the ".csp"
    # suffix so that "." only matches a real dot and nothing in the topic
    # is interpreted as regex syntax.
    url_prefix = 'http://www.oreilly.com/{topic}/free/'.format(topic=ebook_topic)
    pattern = re.escape(url_prefix) + r'[\'"]?([^\'" >]+)\.csp'

    # The same book is often linked more than once on a page; keep only the
    # first occurrence so each book is downloaded a single time.
    book_slugs = []
    seen = set()
    for slug in re.findall(pattern, html_content.decode("utf-8")):
        if slug not in seen:
            seen.add(slug)
            book_slugs.append(slug)
    return book_slugs
def get_ebook_url(ebook_topic, ebook_slug, ebook_format):
    """Build the download URL of one ebook file.

    Args:
        ebook_topic: Topic slug the book is listed under.
        ebook_slug: Book identifier.
        ebook_format: File extension of the wanted format.

    Returns:
        The URL string ``http://www.oreilly.com/<topic>/free/files/<slug>.<ext>``.
    """
    url_template = "http://www.oreilly.com/{topic}/free/files/{slug}.{ext}"
    url_fields = {
        "topic": ebook_topic,
        "slug": ebook_slug,
        "ext": ebook_format,
    }
    return url_template.format(**url_fields)
def get_ebook_filename(ebook_topic, ebook_slug, ebook_format):
    """Build the local path an ebook file is saved to.

    Args:
        ebook_topic: Topic slug; becomes the directory part of the path.
        ebook_slug: Book identifier; becomes the file's base name.
        ebook_format: File extension.

    Returns:
        The relative path string ``<topic>/<slug>.<ext>``.
    """
    path_template = "{topic}/{slug}.{ext}"
    return path_template.format(
        ext=ebook_format, slug=ebook_slug, topic=ebook_topic
    )
if __name__ == "__main__":
    # Script entry point: fetch the free ebooks of every configured topic,
    # one topic after another, in the order given by TOPIC_LIST.
    for topic_name in TOPIC_LIST:
        download_free_ebooks(topic_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment