@toyg
Last active August 26, 2017 09:40
IT books downloader
# original source: https://www.reddit.com/r/opendirectories/comments/6vysrh/lots_of_italian_books_is_there_any_way_to/dm46nig/
# plus a few tweaks from me.
# This is a Python 2.7 script; you will also need Requests and BeautifulSoup 3
# (the legacy release that provides the `BeautifulSoup` module imported below).
# If you have virtualenv installed:
# $> virtualenv env
# $> source env/bin/activate
# $> pip install requests beautifulsoup
# $> python download.py
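# On systems where plain "python" is Python 3, pin the interpreter when
# creating the env with virtualenv's -p flag:
# $> virtualenv -p python2.7 env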
import codecs
import os
from time import sleep

import requests
from BeautifulSoup import BeautifulSoup
import HTMLParser

curr_dir = os.path.dirname(os.path.abspath(__file__))
dldir = os.path.join(curr_dir, "downloads")
if not os.path.exists(dldir):
    os.mkdir(dldir)

def down_filescdn(url, backoff=None):
    """Post the free-download form for one file page, then fetch the file.

    On a throttled response ("You have to wait..."), retries with a
    doubling backoff.
    """
    if not url:
        return False
    link = None
    # The file id is the last path segment of the URL; "rand" is the download
    # form token, hard-coded from the page the original poster scraped.
    rand = "hmtr5wcosqa5m55xmlf7ax2xfzl4loqi2m6rrry"
    id = url[-12:]
    myheaders = {'Pragma': 'no-cache',
                 'Origin': 'https://filescdn.com',
                 'Accept-Encoding': 'gzip, deflate, br',
                 'Accept-Language': 'en-US,en;q=0.8,et;q=0.6,it;q=0.4,nb;q=0.2',
                 'Upgrade-Insecure-Requests': '1',
                 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                 'Cache-Control': 'no-cache',
                 'Referer': 'https://filescdn.com/fe9qupy2n90u',
                 'DNT': '1',
                 'Connection': 'keep-alive'}
    mycookies = {'t_pop': '1', 'lang': 'english'}
    datadict = {'op': 'download2', 'id': id, 'rand': rand, 'referer': '',
                'method_free': '', 'method_premium': ''}
    url = 'https://filescdn.com/' + id
    resp = requests.post(url, data=datadict, cookies=mycookies, headers=myheaders)
    # Dump the response to a temp file, then parse it back (use the same
    # absolute path for both steps, so the cwd doesn't matter).
    tmp_path = os.path.join(curr_dir, "tmp.txt")
    with codecs.open(tmp_path, 'w', 'utf-8') as tmpf:
        tmpf.write(resp.text)
    with codecs.open(tmp_path, 'r', 'utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html)
    try:
        name = soup.findAll('h6')[0].text
        name = HTMLParser.HTMLParser().unescape(name)
    except IndexError:
        if 'You have to wait ' in html:
            backoff = backoff + backoff if backoff else 2
            print "!! ERROR, possible throttling, trying again in {} seconds.".format(str(backoff))
            sleep(backoff)
            return down_filescdn(url, backoff)
        return False
    for l in soup.findAll('a'):
        myurl = l.get('href')
        if not myurl:
            continue
        if myurl.endswith(('.epub', '.pdf', '.rar', '.mobi', '.zip', '.azw3', '.azw4', '.lit')):
            link = myurl
            break
    print link
    # download
    if link is not None:
        response = requests.get(link, stream=True)
        response.raise_for_status()
        with open(os.path.join(dldir, name), 'wb') as handle:
            for block in response.iter_content(1024):
                handle.write(block)
        print "* File Downloaded"
    else:
        print "* File Skipped : " + url
    status_save(name)
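
# A minimal single-file usage sketch (the id below is the one already used as
# the Referer header above; any filescdn file page URL should work the same):
#
#   down_filescdn("https://filescdn.com/fe9qupy2n90u")
#
# On a throttled response it sleeps 2s, then 4s, 8s, ... between retries.
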
def build_list(root, start, end, interval):
    """Walk a paginated folder listing (pages start..end inclusive),
    downloading every file and pausing `interval` seconds between files.

    e.g. https://filescdn.com/f/l1tn8a2wt0xn/317/ is page 317 of the folder.
    """
    mystatus = status_load()
    print "* Downloading from list"
    while start <= end:  # <= so the last page (e.g. 317) is not skipped
        print "* Getting page: " + str(start)
        html = requests.get(root.rstrip("/") + "/" + str(start)).content
        soup = BeautifulSoup(html)
        for d in soup.findAll("div"):
            if str(d.get('class')) == 'text-semibold':
                name = d.findAll('a')[0].text
                name = HTMLParser.HTMLParser().unescape(name)
                # Resume support: skip entries until we hit the last saved name.
                if mystatus and name[0:100] != mystatus[0:100]:
                    print "- Skipping, already downloaded"
                    continue
                if mystatus and name[0:100] == mystatus[0:100]:
                    print "- Resuming download"
                    mystatus = None  # everything from here on is new
                    continue
                link = "http:" + str(d.findAll('a')[0].get('href'))
                print "{}\t\t{}".format(link, name.encode("utf-8"))
                down_filescdn(link)
                sleep(interval)
        start += 1
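
# For reference: each listing page lives at <root>/<n>, and file titles sit in
# <div class="text-semibold"> elements whose first <a> links to the individual
# file page that down_filescdn() then posts the download form to.
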
def status_save(name):
    # Record the last file handled, next to the script rather than the cwd.
    with open(os.path.join(curr_dir, "status.ini"), "w") as f:
        f.write(name.encode("utf-8"))


def status_load():
    try:
        with open(os.path.join(curr_dir, "status.ini"), "r") as f:
            data = f.read().replace("\n", "").replace("\r", "")
        print "* Found place for resuming"
        return HTMLParser.HTMLParser().unescape(data.decode("utf-8"))
    except IOError:
        return None
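
# Resume protocol: status_save() runs after every file, so if the script dies
# mid-run, the next invocation loads that name via status_load() and
# build_list() fast-forwards past everything already downloaded.
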
if __name__ == '__main__':
    start_page = 1
    end_page = 317
    sleep_interval = 5
    build_list("https://filescdn.com/f/l1tn8a2wt0xn/", start_page, end_page, sleep_interval)