Created
January 29, 2020 11:49
-
-
Save svenk/e35c9bc48eb8feb2d28d91459fb79d6b to your computer and use it in GitHub Desktop.
Scribd JPG scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Scraping non-downloadable books/documents
# by downloading them as JPG files
# Public Domain.
#
# For every chapter in `chapters` this script fetches the chapter's
# contents.json, stores it as chapter<N>.json, and then downloads every entry
# of its "blocks" list as chapter<N>-image<i>.jpg into the current directory.
import json
import sys

import requests

# Attention, the token changes regularly... You should copy a fresh one
# roughly every ~10 minutes from Firebug.
params = {
    'token': "12345ALSKDJASDLKJASLKDJALSKDJASDXXXXXXX===_xxxxxxxxxxxxxx",
}

# Cookies, Referer, UA don't seem to be checked
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0",
    "Accept": "*/*",
    "Referer": "https://de.scribd.com/read/123456/Probably-replace",
}

pubnum = 123456789  # the publication number
#chapters = range(1,82)
chapters = range(78, 82)  # the chapters you want to have

for chapter_id in chapters:
    print(f"Slurping chapter {chapter_id}...")
    imglist_url = f"https://de.scribd.com/scepub/{pubnum}/chapters/{chapter_id}/contents.json"
    R = requests.get(imglist_url, params=params, headers=headers)
    print(f"Asked {imglist_url=} -- Got {R=}")
    if R.status_code >= 400:
        # Show the server's answer before aborting; an error here often
        # means the token above has expired.
        print("Response: ", R.text)
        # Raise on the chapter request itself (not on a previous image
        # response, which may not even exist yet).
        R.raise_for_status()
    imglist = R.json()

    # store chapter metadata for later use
    # (explicit UTF-8 so the result does not depend on the platform default)
    with open(f'chapter{chapter_id}.json', 'w', encoding='utf-8') as fh:
        fh.write(R.text)

    for i, block in enumerate(imglist["blocks"]):
        target = f"chapter{chapter_id}-image{i}.jpg"
        print(f"Downloading {target=}")
        image = f"https://de.scribd.com/scepub/{pubnum}/chapters/{chapter_id}/{block['src']}"
        # stream=True: write the image to disk piece by piece instead of
        # loading the whole body into memory first.
        response = requests.get(image, params=params, headers=headers, stream=True)
        response.raise_for_status()
        with open(target, 'wb') as fh:
            # `chunk` (not `block`) so the outer loop variable is not shadowed.
            for chunk in response.iter_content(1024):
                fh.write(chunk)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I don't claim authorship because of the grey-hat nature of this code. I have no idea who wrote this marvellous piece of code.