# chocolatkey/scribdrip.py

## scribdrip.py
# -*- coding: utf-8 -*-
# Scribd Ripper 1.0
# chocolatkey 2017

import requests
import re
import http.cookiejar
import sys
import time
from bs4 import BeautifulSoup
# Module-wide HTTP session; urlr() and main() send every request through it.
sess = requests.Session()

def urlr(url):
    """Fetch *url* through the shared session, retrying until it succeeds.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The response object returned by ``sess.get(url)``.

    Any exception raised by the request is reported on stdout and followed
    by a 3-second pause before the next attempt. There is no retry limit,
    so this call only returns once a request goes through.
    """
    # Retry in a loop rather than by recursion: the recursive form added a
    # stack frame per failure and ended in RecursionError during a long
    # outage instead of continuing to retry.
    while True:
        try:
            return sess.get(url)
        except Exception as e:
            print("Error getting page, cooling down: " + str(e) + "\n")
            time.sleep(3)

def main():
    """Save every page image of the Scribd document given on the command line.

    The document URL is read from ``sys.argv[1]``. Each ``contentUrl`` found
    in the document page is fetched with ``urlr``; the image address and the
    element id found in that page payload give the download URL and the
    ``<id>.jpg`` file name, which is written relative to the current working
    directory.

    Returns:
        ``1`` if no document URL was supplied, otherwise ``None``; the
        script entry point forwards this value to ``sys.exit``.
    """
    if len(sys.argv) < 2:
        print("Usage: scribdrip.py <scribd document url>", file=sys.stderr)
        return 1

    doc = urlr(sys.argv[1]).text # e.g. https://www.scribd.com/doc/37359544/Donald-Fagen-the-Nightfly-Book
    for match in re.findall(r'contentUrl = "(.+)";', doc):
        print("Page: " + match)
        page = urlr(match).text

        # The patterns below look for quotes preceded by a literal backslash,
        # i.e. the page payload is expected to carry escaped quotes.
        image_found = re.search(r'orig=\\"(.+)\\"', page)
        ident_found = re.search(r'id=\\"([\w\d]+)\\"', page)
        host_found = re.search(r'ttps:\/\/([\w\d\-\.]+)\/', match)
        if not (image_found and ident_found and host_found):
            # One malformed page should not abort the remaining downloads.
            print("Skipping page, unexpected format: " + match + "\n", file=sys.stderr)
            continue

        # Point the image URL at the host that served this page instead of
        # the html.scribd.com placeholder.
        image = image_found.group(1).replace('html.scribd.com', host_found.group(1))
        fileloc = ident_found.group(1) + '.jpg'
        print("Image: " + fileloc + "\n")

        # The session already attaches its own cookies; the context manager
        # releases the streamed connection even if writing fails.
        with sess.get(image, stream=True) as r:
            if r.status_code != 200:
                print("Skipping image, HTTP status " + str(r.status_code) + ": " + image + "\n", file=sys.stderr)
                continue
            with open(fileloc, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)


if __name__ == '__main__':
    # Run the ripper and use main()'s result as the process exit status.
    raise SystemExit(main())
	# -*- coding: utf-8 -*-
	# Scribd Ripper 1.0
	# chocolatkey 2017

	import requests
	import re
	import http.cookiejar
	import sys
	import time
	from bs4 import BeautifulSoup
	# Module-wide HTTP session; urlr() and main() send every request through it.
	sess = requests.Session()

	def urlr(url):
		"""Fetch *url* through the shared session, retrying until it succeeds.

		Args:
			url: Address to request with HTTP GET.

		Returns:
			The response object returned by ``sess.get(url)``.

		Any exception raised by the request is reported on stdout and followed
		by a 3-second pause before the next attempt. There is no retry limit,
		so this call only returns once a request goes through.
		"""
		# Retry in a loop rather than by recursion: the recursive form added a
		# stack frame per failure and ended in RecursionError during a long
		# outage instead of continuing to retry.
		while True:
			try:
				return sess.get(url)
			except Exception as e:
				print("Error getting page, cooling down: " + str(e) + "\n")
				time.sleep(3)

	def main():
		"""Save every page image of the Scribd document given on the command line.

		The document URL is read from ``sys.argv[1]``. Each ``contentUrl`` found
		in the document page is fetched with ``urlr``; the image address and the
		element id found in that page payload give the download URL and the
		``<id>.jpg`` file name, which is written relative to the current working
		directory.

		Returns:
			``1`` if no document URL was supplied, otherwise ``None``.
		"""
		if len(sys.argv) < 2:
			print("Usage: scribdrip.py <scribd document url>", file=sys.stderr)
			return 1

		doc = urlr(sys.argv[1]).text # e.g. https://www.scribd.com/doc/37359544/Donald-Fagen-the-Nightfly-Book
		for match in re.findall(r'contentUrl = "(.+)";', doc):
			print("Page: " + match)
			page = urlr(match).text

			# The patterns below look for quotes preceded by a literal backslash,
			# i.e. the page payload is expected to carry escaped quotes.
			image_found = re.search(r'orig=\\"(.+)\\"', page)
			ident_found = re.search(r'id=\\"([\w\d]+)\\"', page)
			host_found = re.search(r'ttps:\/\/([\w\d\-\.]+)\/', match)
			if not (image_found and ident_found and host_found):
				# One malformed page should not abort the remaining downloads.
				print("Skipping page, unexpected format: " + match + "\n", file=sys.stderr)
				continue

			# Point the image URL at the host that served this page instead of
			# the html.scribd.com placeholder.
			image = image_found.group(1).replace('html.scribd.com', host_found.group(1))
			fileloc = ident_found.group(1) + '.jpg'
			print("Image: " + fileloc + "\n")

			# The session already attaches its own cookies; the context manager
			# releases the streamed connection even if writing fails.
			with sess.get(image, stream=True) as r:
				if r.status_code != 200:
					print("Skipping image, HTTP status " + str(r.status_code) + ": " + image + "\n", file=sys.stderr)
					continue
				with open(fileloc, 'wb') as f:
					for chunk in r.iter_content(chunk_size=8192):
						f.write(chunk)


	if __name__ == '__main__':
		# Run the ripper and use main()'s result as the process exit status.
		sys.exit(main())