svenk/scripd-scraping.py

## scripd-scraping.py
#!/usr/bin/env python3

# Scraping non-downloadable books/documents
# by downloading them as JPG files
# Public Domain.

import requests, json, sys

# Attention, token changes regularly... You should copy all ~10 minutes from Firebug

# Query parameters sent with every request below (chapter list and images).
params = {
    'token': "12345ALSKDJASDLKJASLKDJALSKDJASDXXXXXXX===_xxxxxxxxxxxxxx"
}

# Cookies, Referer and UA don't seem to be checked
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0",
    "Accept": "*/*",
    "Referer": "https://de.scribd.com/read/123456/Probably-replace",
}


pubnum = 123456789  # the publication number
#chapters = range(1,82)
chapters = range(78, 82)  # the chapters you want to have

for chapter_id in chapters:
    print(f"Slurping chapter {chapter_id}...")
    imglist_url = f"https://de.scribd.com/scepub/{pubnum}/chapters/{chapter_id}/contents.json"

    # Fetch the chapter's content listing (JSON with a "blocks" list).
    R = requests.get(imglist_url, params=params, headers=headers)
    print(f"Asked {imglist_url=} -- Got {R=}")
    if R.status_code >= 400:
        print("Response: ",R.text)
        # Raise on the listing response itself. The name `response` is only
        # bound further down by the image download, so using it here would
        # either be undefined or point at an unrelated earlier request.
        R.raise_for_status()

    imglist = R.json()

    # store chapter metadata for later use
    with open(f'chapter{chapter_id}.json', 'w') as fh:
        fh.write(R.text)

    # Every block's 'src' is resolved relative to the chapter URL and saved
    # as chapter<id>-image<index>.jpg in the current directory.
    for i, block in enumerate(imglist["blocks"]):
        target = f"chapter{chapter_id}-image{i}.jpg"
        print(f"Downloading {target=}")
        image = f"https://de.scribd.com/scepub/{pubnum}/chapters/{chapter_id}/{block['src']}"
        response = requests.get(image, params=params, headers=headers, stream=True)
        response.raise_for_status()

        # Stream the body to disk in 1 KiB pieces instead of loading it whole.
        with open(target, 'wb') as fh:
            for chunk in response.iter_content(1024):
                fh.write(chunk)
	#!/usr/bin/env python3

	# Scraping non-downloadable books/documents
	# by downloading them as JPG files
	# Public Domain.

	import requests, json, sys

	# Attention, token changes regularly... You should copy all ~10 minutes from Firebug

	# Query parameters sent with every request below (chapter list and images).
	params = {
		'token': "12345ALSKDJASDLKJASLKDJALSKDJASDXXXXXXX===_xxxxxxxxxxxxxx"
	}

	# Cookies, Referer and UA don't seem to be checked
	headers = {
		"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0",
		# "*/*" is the wildcard media range; a bare "/" is not a valid Accept value.
		"Accept": "*/*",
		"Referer": "https://de.scribd.com/read/123456/Probably-replace",
	}


	pubnum = 123456789  # the publication number
	#chapters = range(1,82)
	chapters = range(78, 82)  # the chapters you want to have

	for chapter_id in chapters:
		print(f"Slurping chapter {chapter_id}...")
		imglist_url = f"https://de.scribd.com/scepub/{pubnum}/chapters/{chapter_id}/contents.json"

		# Fetch the chapter's content listing (JSON with a "blocks" list).
		R = requests.get(imglist_url, params=params, headers=headers)
		print(f"Asked {imglist_url=} -- Got {R=}")
		if R.status_code >= 400:
			print("Response: ",R.text)
			# Raise on the listing response itself. The name `response` is only
			# bound further down by the image download, so using it here would
			# either be undefined or point at an unrelated earlier request.
			R.raise_for_status()

		imglist = R.json()

		# store chapter metadata for later use
		with open(f'chapter{chapter_id}.json', 'w') as fh:
			fh.write(R.text)

		# Every block's 'src' is resolved relative to the chapter URL and saved
		# as chapter<id>-image<index>.jpg in the current directory.
		for i, block in enumerate(imglist["blocks"]):
			target = f"chapter{chapter_id}-image{i}.jpg"
			print(f"Downloading {target=}")
			image = f"https://de.scribd.com/scepub/{pubnum}/chapters/{chapter_id}/{block['src']}"
			response = requests.get(image, params=params, headers=headers, stream=True)
			response.raise_for_status()

			# Stream the body to disk in 1 KiB pieces instead of loading it whole.
			with open(target, 'wb') as fh:
				for chunk in response.iter_content(1024):
					fh.write(chunk)