Scribd document ripper
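A small Python script that walks the page list of a Scribd document and saves each page's image to disk.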
# -*- coding: utf-8 -*-
# Scribd Ripper 1.0
# chocolatkey 2017
import re
import sys
import time

import requests

# One shared session so cookies persist across the page and image requests
sess = requests.Session()

def urlr(url):
    """Fetch a URL through the shared session, retrying after a short cooldown on any error."""
    try:
        return sess.get(url)
    except Exception as e:
        print("Error getting page, cooling down: " + str(e) + "\n")
        time.sleep(3)
        return urlr(url)
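

# A safer retry sketch, not part of the original gist: the recursive helper
# above never gives up and can exhaust the stack if a URL keeps failing.
# The function name and retry count here are illustrative assumptions.
def urlr_bounded(url, retries=5):
    for _ in range(retries):
        try:
            return sess.get(url)
        except requests.RequestException as e:
            print("Error getting page, cooling down: " + str(e) + "\n")
            time.sleep(3)
    raise RuntimeError("Giving up on " + url)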


def main():
    # Fetch the document page, e.g. https://www.scribd.com/doc/37359544/Donald-Fagen-the-Nightfly-Book
    doc = urlr(sys.argv[1]).text
    # Each page of the document appears as a contentUrl in the page source
    for match in re.findall(r'contentUrl = "(.+)";', doc):
        print("Page: " + match)
        page = urlr(match).text
        # The page payload escapes its quotes, hence the \\" in these patterns;
        # orig=... holds the original-resolution image URL
        image = re.findall(r'orig=\\"(.+)\\"', page)[0]
        # Point the image URL at the host that served this page instead of html.scribd.com
        # (the 'ttps' pattern matches both http:// and https:// URLs)
        image = re.sub(r'html\.scribd\.com', re.findall(r'ttps://([\w\d\-.]+)/', match)[0], image)
        # Name the output file after the page's id attribute
        fileloc = re.findall(r'id=\\"([\w\d]+)\\"', page)[0] + '.jpg'
        print("Image: " + fileloc + "\n")
        # Stream the image to disk; the session already carries the cookies
        r = sess.get(image, stream=True)
        if r.status_code == 200:
            with open(fileloc, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)


if __name__ == '__main__':
    sys.exit(main())
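
A usage sketch, assuming the script is saved as scribd_ripper.py (the filename is an assumption; the URL is the example from the script's own comment). Page images are written to the working directory as <page-id>.jpg:

    python scribd_ripper.py https://www.scribd.com/doc/37359544/Donald-Fagen-the-Nightfly-Book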