Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scribd document ripper
# -*- coding: utf-8 -*-
# Scribd Ripper 1.0
# chocolatkey 2017
import requests
import re
import http.cookiejar
import sys
import time
from bs4 import BeautifulSoup
# Shared HTTP session: keeps Scribd's cookies across the page and image
# requests so the image CDN accepts the download.
sess = requests.Session()
def urlr(url):
    """GET *url* with the shared session, retrying forever on failure.

    Any exception raised by the request (connection error, timeout, ...)
    is reported, followed by a 3-second cool-down before retrying.

    Returns the requests.Response of the first successful attempt.
    """
    global sess
    # Retry in a loop rather than recursing: the original recursive retry
    # would exhaust Python's recursion limit during an extended outage.
    while True:
        try:
            return sess.get(url)
        except Exception as e:  # requests raises many subclasses; stay broad
            print("Error getting page, cooling down: " + str(e) + "\n")
            time.sleep(3)
def main():
    """Download every page of the Scribd document given as argv[1] as JPEGs.

    Scrapes the document page for per-page content URLs, then for each page
    extracts the original image URL and saves it as '<page id>.jpg' in the
    current directory.
    """
    global sess
    if len(sys.argv) < 2:
        # Original crashed with IndexError when no URL was supplied.
        print("Usage: scribd_ripper.py <scribd document url>")
        return 1
    doc = urlr(sys.argv[1]).text  # e.g. https://www.scribd.com/doc/37359544/Donald-Fagen-the-Nightfly-Book
    for match in re.findall(r'contentUrl = "(.+)";', doc):
        print("Page: " + match)
        page = urlr(match).text
        origs = re.findall(r'orig=\\"(.+)\\"', page)
        ids = re.findall(r'id=\\"([\w\d]+)\\"', page)
        hosts = re.findall(r'ttps:\/\/([\w\d\-\.]+)\/', match)
        if not (origs and ids and hosts):
            # Markup changed or the page failed to render: skip this page
            # instead of crashing with IndexError as the original did.
            print("Could not parse page, skipping: " + match + "\n")
            continue
        # Point the image URL at the same CDN host the page came from.
        image = re.sub(r'html\.scribd\.com', hosts[0], origs[0])
        fileloc = ids[0] + '.jpg'
        print("Image: " + fileloc + "\n")
        # Session.get already sends the session cookies; the original's
        # explicit cookies=sess.cookies was redundant.
        r = sess.get(image, stream=True)
        if r.status_code == 200:
            with open(fileloc, 'wb') as f:
                # iter_content with an explicit chunk size instead of
                # iterating the response (which yields 128-byte chunks).
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
# Script entry point: exit with main()'s return value (None -> status 0).
if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.