Skip to content

Instantly share code, notes, and snippets.

@bielfrontera
Last active September 19, 2017 09:33
Show Gist options
  • Save bielfrontera/88e55eb65e3f97ff3e60015a4aca2e1c to your computer and use it in GitHub Desktop.
Save bielfrontera/88e55eb65e3f97ff3e60015a4aca2e1c to your computer and use it in GitHub Desktop.
# coding: utf-8
import os
import shutil
import requests
import re
# Topic slugs processed by the script entry point, in download order.
TOPIC_LIST = [
    'programming',
    'web-platform',
    'security',
    'iot',
    'data',
    'business',
    'webops-perf',
]
def download_free_ebooks(ebook_topic, ebook_formats=None):
    """Download every free ebook listed for one topic, in each requested format.

    A directory named after ``ebook_topic`` is created (relative to the
    current working directory) when it does not exist yet; the files are
    written inside it.

    Args:
        ebook_topic: Topic slug, e.g. ``'programming'``.
        ebook_formats: Iterable of file extensions to fetch for each book.
            Defaults to ``['epub', 'mobi', 'pdf']`` when omitted or ``None``.
    """
    if ebook_formats is None:
        # Build the default per call: a list literal in the signature would be
        # one shared object reused (and mutable) across all calls.
        ebook_formats = ['epub', 'mobi', 'pdf']
    if not os.path.exists(ebook_topic):
        os.mkdir(ebook_topic)
    for ebook in get_free_ebook_list(ebook_topic):
        for ebook_format in ebook_formats:
            download_ebook(ebook_topic, ebook, ebook_format)
def download_ebook(ebook_topic, ebook_slug, ebook_format):
    """Download a single ebook file and store it under the topic directory.

    Nothing is written when the server does not answer with a successful
    status (``r.ok`` is false), so missing formats are skipped silently.

    Args:
        ebook_topic: Topic slug the book belongs to.
        ebook_slug: Book identifier as it appears in the site's URLs.
        ebook_format: File extension to fetch, e.g. ``'pdf'``.
    """
    ebook_url = get_ebook_url(ebook_topic, ebook_slug, ebook_format)
    ebook_filename = get_ebook_filename(ebook_topic, ebook_slug, ebook_format)
    r = requests.get(ebook_url, stream=True)
    try:
        if r.ok:
            # r.raw hands out the bytes exactly as sent on the wire; without
            # this flag a gzip/deflate-encoded response would be saved still
            # compressed and the resulting file would be unreadable.
            r.raw.decode_content = True
            with open(ebook_filename, 'wb') as out_file:
                shutil.copyfileobj(r.raw, out_file)
    finally:
        # A streamed response keeps its connection checked out until it is
        # fully read or closed; `del r` alone does not release it.
        r.close()
def get_free_ebook_list(ebook_topic):
    """Return the slugs of the free ebooks listed on a topic's index page.

    Args:
        ebook_topic: Topic slug, e.g. ``'programming'``.

    Returns:
        A list of book slugs, or an empty list when the index page request
        does not succeed.
    """
    index_url = 'http://www.oreilly.com/{topic}/free/'.format(topic=ebook_topic)
    r = requests.get(index_url)
    if not r.ok:
        return []
    # Use the decoded text, not r.content: on Python 3 r.content is bytes and
    # matching a str regex pattern against bytes raises TypeError.
    return get_ebook_list_from_content(ebook_topic, r.text)
def get_ebook_list_from_content(ebook_topic, html_content):
    """Extract the book slugs linked from a topic index page.

    A slug is the part of a ``http://www.oreilly.com/<topic>/free/<slug>.csp``
    link between ``free/`` and the ``.csp`` extension.

    Args:
        ebook_topic: Topic slug whose links should be collected.
        html_content: Page markup, as text or as raw bytes.

    Returns:
        The slugs in order of first appearance, each listed once.
    """
    if isinstance(html_content, bytes):
        # A str pattern cannot be applied to bytes on Python 3; decode
        # leniently because only the ASCII link targets matter here.
        html_content = html_content.decode('utf-8', 'replace')
    # Dots are escaped so they only match a literal '.', and the topic is
    # escaped so it is always treated as plain text inside the pattern.
    pattern = r'http://www\.oreilly\.com/{topic}/free/[\'"]?([^\'" >]+)\.csp'.format(
        topic=re.escape(ebook_topic)
    )
    book_slugs = []
    seen = set()
    # The same book may be linked several times on a page; keep the first
    # occurrence only so that callers do not process it repeatedly.
    for slug in re.findall(pattern, html_content):
        if slug not in seen:
            seen.add(slug)
            book_slugs.append(slug)
    return book_slugs
def get_ebook_url(ebook_topic, ebook_slug, ebook_format):
    """Build the download URL for one ebook in one file format.

    Args:
        ebook_topic: Topic slug the book is published under.
        ebook_slug: Book identifier used in the site's URLs.
        ebook_format: File extension, e.g. ``'epub'``.

    Returns:
        The URL string of the file under the topic's ``free/files/`` path.
    """
    url_template = "http://www.oreilly.com/{topic}/free/files/{slug}.{ext}"
    fields = {'topic': ebook_topic, 'slug': ebook_slug, 'ext': ebook_format}
    return url_template.format(**fields)
def get_ebook_filename(ebook_topic, ebook_slug, ebook_format):
    """Build the relative local path an ebook file is saved to.

    Args:
        ebook_topic: Topic slug; used as the directory component.
        ebook_slug: Book identifier; used as the file's base name.
        ebook_format: File extension, e.g. ``'mobi'``.

    Returns:
        A path string of the form ``<topic>/<slug>.<ext>``.
    """
    path_template = "{topic}/{slug}.{ext}"
    fields = {'topic': ebook_topic, 'slug': ebook_slug, 'ext': ebook_format}
    return path_template.format(**fields)
if __name__ == "__main__":
    # Script entry point: process the configured topics one after another,
    # downloading all free ebooks of each topic in the default formats.
    for topic in TOPIC_LIST:
        download_free_ebooks(topic)
@dtrillo
Copy link

dtrillo commented Oct 25, 2016

Si necesitáis usar un proxy, tenéis que cambiar "requests" por "sesion", y arriba del todo poner:

# Create a Session object to carry the proxy configuration.
sesion = requests.Session()
# Placeholder value: replace with the real proxy host and port.
proxy = "_proxy_server:puerto_proxy"
# Use the same proxy for both the http and https schemes.
sesion.proxies = {"http": proxy, "https": proxy}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment