Last active
August 26, 2017 09:40
-
-
Save toyg/09ef7acae2ee97c6fd4c5016ef4ab8e0 to your computer and use it in GitHub Desktop.
IT books downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently from what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# original source: https://www.reddit.com/r/opendirectories/comments/6vysrh/lots_of_italian_books_is_there_any_way_to/dm46nig/ | |
# plus a few tweaks from me. | |
# This is a Python 2.7 script; you will also need Requests and BeautifulSoup. | |
# If you have virtualenv installed: | |
# $> virtualenv env | |
# $> source env/bin/activate | |
# $> pip install requests beautifulsoup | |
# $> python download.py | |
import codecs | |
from time import sleep | |
import requests | |
import sys | |
import os | |
from subprocess import Popen, STDOUT, PIPE | |
from BeautifulSoup import BeautifulSoup | |
import HTMLParser | |
# Directory containing this script; downloads are stored in a
# "downloads" subdirectory created next to it on first run.
curr_dir = os.path.dirname(os.path.abspath(__file__))
dldir = os.path.join(curr_dir, "downloads")
if not os.path.exists(dldir):
    os.mkdir(dldir)
def down_filescdn(url, backoff=None): | |
link = None | |
if url: | |
# Getting id and rand | |
rand = "hmtr5wcosqa5m55xmlf7ax2xfzl4loqi2m6rrry" | |
id = url[-12:] | |
myheaders = {'Pragma':'no-cache', | |
'Origin': 'https://filescdn.com', | |
'Accept-Encoding': 'gzip, deflate, br', | |
'Accept-Language' : 'en-US,en;q=0.8,et;q=0.6,it;q=0.4,nb;q=0.2', | |
'Upgrade-Insecure-Requests':'1', | |
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36', | |
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', | |
'Cache-Control':'no-cache', | |
'Referer':'https://filescdn.com/fe9qupy2n90u' , | |
'DNT':'1', | |
'Connection':'keep-alive'} | |
mycookies = {'t_pop':'1', 'lang': 'english'} | |
datadict = {'op':'download2','id':id, 'rand' : rand, 'referer': '', 'method_free':'', 'method_premium':''} | |
url = 'https://filescdn.com/'+ id | |
with requests.post(url, data=datadict, cookies=mycookies, headers=myheaders) as resp: | |
with codecs.open(os.path.join(curr_dir, "tmp.txt"), 'w', 'utf-8') as tmpf: | |
tmpf.write(resp.text) | |
with codecs.open("tmp.txt", 'r', 'utf-8') as f: | |
html = f.read() | |
soup = BeautifulSoup(html) | |
try: | |
name = soup.findAll('h6')[0].text | |
name = HTMLParser.HTMLParser().unescape(name) | |
except: | |
if 'You have to wait ' in html: | |
if backoff: | |
backoff = backoff + backoff | |
else: | |
backoff = 2 | |
print "!! ERROR, possible throttling, trying again in {} seconds.".format(str(backoff)) | |
sleep(backoff) | |
down_filescdn(url, backoff) | |
return False | |
links = soup.findAll('a') | |
for l in links: | |
myurl = l.get('href') | |
if not myurl: | |
continue | |
if myurl.endswith(('.epub', '.pdf', '.rar', '.mobi', '.zip', '.azw3', '.azw4', ".lit")): | |
link = myurl | |
break | |
print link | |
# download | |
if link is not None: | |
response = requests.get(link, stream=True) | |
response.raise_for_status() | |
with open(os.path.join(dldir,name), 'wb') as handle: | |
for block in response.iter_content(1024): | |
handle.write(block) | |
print "* File Downloaded" | |
else: | |
print "* File Skipped : " + myurl | |
status_save(name) | |
def build_list(root, start, end, interval): | |
# eg https://filescdn.com/f/l1tn8a2wt0xn/317/ | |
mystatus = status_load() | |
print "* Downloading from list" | |
while (start < end): | |
print "* Getting page: " + str(start) | |
html = requests.get(root + "/" + str(start)).content | |
soup = BeautifulSoup(html) | |
divs = soup.findAll("div") | |
for d in divs: | |
if str(d.get('class')) == 'text-semibold': | |
name = d.findAll('a')[0].text | |
name = HTMLParser.HTMLParser().unescape(name) | |
if mystatus and name[0:100] != mystatus[0:100]: | |
print "- Skipping, already downloaded" | |
continue | |
if mystatus and name[0:100] == mystatus[0:100]: | |
print "- Resuming download" | |
mystatus = None | |
continue | |
link = "http:" + str(d.findAll('a')[0].get('href')) | |
print "{}\t\t{}".format(link, name.encode("utf-8")) | |
down_filescdn(link) | |
sleep(interval) | |
start += 1 | |
def status_save(name):
    """Persist `name` (unicode) to status.ini so a later run can resume."""
    encoded = name.encode("utf-8")
    with open("status.ini", "w") as f:
        f.write(encoded)
def status_load(): | |
try: | |
with open("status.ini", "r") as f: | |
data = f.read().replace("\n", "").replace("\r", "") | |
print "* Found place for resuming" | |
name = HTMLParser.HTMLParser().unescape(data.decode("utf-8")) | |
return name | |
except IOError: | |
return None | |
if __name__ == '__main__':
    # Crawl the listing pages (build_list stops before `end_page`),
    # pausing 5 seconds between downloads to limit throttling.
    start_page, end_page, sleep_interval = 1, 317, 5
    build_list("https://filescdn.com/f/l1tn8a2wt0xn/",
               start_page, end_page, sleep_interval)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment