Last active
August 26, 2017 09:40
-
-
Save toyg/09ef7acae2ee97c6fd4c5016ef4ab8e0 to your computer and use it in GitHub Desktop.
IT books downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently from what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# original source: https://www.reddit.com/r/opendirectories/comments/6vysrh/lots_of_italian_books_is_there_any_way_to/dm46nig/ | |
# plus a few tweaks from me. | |
# This is a Python 2.7 script; you will also need Requests and BeautifulSoup. | |
# If you have virtualenv installed: | |
# $> virtualenv env | |
# $> source env/bin/activate | |
# $> pip install requests beautifulsoup | |
# $> python download.py | |
import codecs | |
from time import sleep | |
import requests | |
import sys | |
import os | |
from subprocess import Popen, STDOUT, PIPE | |
from BeautifulSoup import BeautifulSoup | |
import HTMLParser | |
# Directory containing this script; downloads are stored in a
# "downloads" subdirectory created next to it on first run.
curr_dir = os.path.dirname(os.path.abspath(__file__))
dldir = os.path.join(curr_dir, "downloads")
if not os.path.exists(dldir):
    os.mkdir(dldir)
def down_filescdn(url, backoff=None): | |
link = None | |
if url: | |
# Getting id and rand | |
rand = "hmtr5wcosqa5m55xmlf7ax2xfzl4loqi2m6rrry" | |
id = url[-12:] | |
myheaders = {'Pragma':'no-cache', | |
'Origin': 'https://filescdn.com', | |
'Accept-Encoding': 'gzip, deflate, br', | |
'Accept-Language' : 'en-US,en;q=0.8,et;q=0.6,it;q=0.4,nb;q=0.2', | |
'Upgrade-Insecure-Requests':'1', | |
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36', | |
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', | |
'Cache-Control':'no-cache', | |
'Referer':'https://filescdn.com/fe9qupy2n90u' , | |
'DNT':'1', | |
'Connection':'keep-alive'} | |
mycookies = {'t_pop':'1', 'lang': 'english'} | |
datadict = {'op':'download2','id':id, 'rand' : rand, 'referer': '', 'method_free':'', 'method_premium':''} | |
url = 'https://filescdn.com/'+ id | |
with requests.post(url, data=datadict, cookies=mycookies, headers=myheaders) as resp: | |
with codecs.open(os.path.join(curr_dir, "tmp.txt"), 'w', 'utf-8') as tmpf: | |
tmpf.write(resp.text) | |
with codecs.open("tmp.txt", 'r', 'utf-8') as f: | |
html = f.read() | |
soup = BeautifulSoup(html) | |
try: | |
name = soup.findAll('h6')[0].text | |
name = HTMLParser.HTMLParser().unescape(name) | |
except: | |
if 'You have to wait ' in html: | |
if backoff: | |
backoff = backoff + backoff | |
else: | |
backoff = 2 | |
print "!! ERROR, possible throttling, trying again in {} seconds.".format(str(backoff)) | |
sleep(backoff) | |
down_filescdn(url, backoff) | |
return False | |
links = soup.findAll('a') | |
for l in links: | |
myurl = l.get('href') | |
if not myurl: | |
continue | |
if myurl.endswith(('.epub', '.pdf', '.rar', '.mobi', '.zip', '.azw3', '.azw4', ".lit")): | |
link = myurl | |
break | |
print link | |
# download | |
if link is not None: | |
response = requests.get(link, stream=True) | |
response.raise_for_status() | |
with open(os.path.join(dldir,name), 'wb') as handle: | |
for block in response.iter_content(1024): | |
handle.write(block) | |
print "* File Downloaded" | |
else: | |
print "* File Skipped : " + myurl | |
status_save(name) | |
def build_list(root, start, end, interval): | |
# eg https://filescdn.com/f/l1tn8a2wt0xn/317/ | |
mystatus = status_load() | |
print "* Downloading from list" | |
while (start < end): | |
print "* Getting page: " + str(start) | |
html = requests.get(root + "/" + str(start)).content | |
soup = BeautifulSoup(html) | |
divs = soup.findAll("div") | |
for d in divs: | |
if str(d.get('class')) == 'text-semibold': | |
name = d.findAll('a')[0].text | |
name = HTMLParser.HTMLParser().unescape(name) | |
if mystatus and name[0:100] != mystatus[0:100]: | |
print "- Skipping, already downloaded" | |
continue | |
if mystatus and name[0:100] == mystatus[0:100]: | |
print "- Resuming download" | |
mystatus = None | |
continue | |
link = "http:" + str(d.findAll('a')[0].get('href')) | |
print "{}\t\t{}".format(link, name.encode("utf-8")) | |
down_filescdn(link) | |
sleep(interval) | |
start += 1 | |
def status_save(name):
    """Persist `name` (unicode) to status.ini so a later run can resume."""
    encoded = name.encode("utf-8")
    with open("status.ini", "w") as f:
        f.write(encoded)
def status_load(): | |
try: | |
with open("status.ini", "r") as f: | |
data = f.read().replace("\n", "").replace("\r", "") | |
print "* Found place for resuming" | |
name = HTMLParser.HTMLParser().unescape(data.decode("utf-8")) | |
return name | |
except IOError: | |
return None | |
if __name__ == '__main__':
    # Crawl the listing pages (build_list stops before `end_page`),
    # pausing 5 seconds between downloads to limit throttling.
    start_page, end_page, sleep_interval = 1, 317, 5
    build_list("https://filescdn.com/f/l1tn8a2wt0xn/",
               start_page, end_page, sleep_interval)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment