Substack JSON Downloader Python Script (downloads all posts and downloads images too)

SUBSTACK JSON DOWNLOADER

This code is free to use and modify by anyone who loves the truth. If you don't love the truth, then you are an enemy of humanity, and therefore you are suicidal and need to seek mental help for being a crazy person who hates life.

The script is kept simple so that it is easier to update later on. It could be OOP, but what's the point? For something as simple as scraping content, it's better to keep it as short as possible.

# one-time setup
python3 -m venv .venv
source .venv/bin/activate
pip3 install -r requirements.txt

# download a single Substack
./ssjl.py <SUBSTACK-URL> <DIRECTORY>

# or batch-download every site listed in list.txt
echo -e "karenkingston\nanamihalceamdphd" > list.txt
./run.sh
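
The --archive flag points at a text file that records each downloaded post, so interrupted runs can pick up where they left off. A hypothetical single-site invocation (the subdomain "example" is just a placeholder):

./ssjl.py https://example.substack.com example --archive example.txt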
list.txt (one Substack subdomain per line):

anamihalceamdphd
karenkingston
gregreese
palexander
tlavagabond
petermcculloughmd
corbettreport
merylnass
drtenpenny
requirements.txt:

beautifulsoup4==4.12.2
certifi==2023.7.22
charset-normalizer==3.2.0
gazpacho==1.1
idna==3.4
markdownify==0.11.6
Pillow==10.0.0
requests==2.31.0
six==1.16.0
soupsieve==2.4.1
urllib3==2.0.4
run.sh:

#!/usr/bin/env bash
# Loops forever over the subdomains in list.txt (shuffled) and runs
# ssjl.py on each one. running.txt acts as a simple lock file so that
# a site is never downloaded by two processes at once.

function kill_it() {
    echo "Killed";
    KILLED=true;
    # Drop our in-progress entry from the lock file before exiting.
    [ "$RUNNING" != "" ] && grep -v "$RUNNING" running.txt > running.tmp && mv running.tmp running.txt;
    deactivate;
    exit 1;
}

trap kill_it TERM # 15 - Termination signal
trap kill_it PIPE # 13 - Broken pipe: write to pipe with no readers
trap kill_it SEGV # 11 - Invalid memory reference
trap kill_it FPE  #  8 - Floating point exception
trap kill_it ABRT #  6 - Abort signal from abort(3)
trap kill_it ILL  #  4 - Illegal instruction
trap kill_it QUIT #  3 - Quit from keyboard
trap kill_it INT  #  2 - Interrupt from keyboard
trap kill_it HUP  #  1 - Hangup detected on controlling terminal or death of controlling process
# SIGKILL (9) cannot be trapped, so no handler is installed for it.

source .venv/bin/activate;
touch running.txt;
RUNNING="";
KILLED=false;
while [ "$KILLED" = false ]; do
    for l in $(cat list.txt | shuf); do
        echo ">>> $l";
        if [ "$(grep "$l" running.txt)" == "" ]; then
            echo "$l" >> running.txt;
            RUNNING="$l";
            ./ssjl.py "https://$l.substack.com" "$l" --archive "$l.txt";
            grep -v "$l" running.txt > running.tmp;
            mv running.tmp running.txt;
            RUNNING="";
            echo "waiting 60 seconds";
            sleep 60;
        fi
    done
done
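
Because run.sh skips any subdomain already present in running.txt, it should be possible to start several copies side by side and have each one pick up sites the others are not working on, e.g.:

./run.sh &
./run.sh &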
ssjl.py:

#!/usr/bin/env python3
# python3 -m venv .venv
# . .venv/bin/activate
# pip3 install requests markdownify pillow gazpacho
# ./ssjl.py https://karenkingston.substack.com ~/Downloads/karenkingston
import os
import sys
import argparse
import requests
from requests.exceptions import HTTPError
import markdownify
from PIL import Image  # Pillow is installed via requirements.txt, but Image is not used anywhere below
from gazpacho import Soup
from time import sleep, perf_counter
from random import randrange
import asyncio
import json
from pathlib import Path
def create_dir(directory):
    if not os.path.isdir(directory):
        if os.path.exists(directory):
            raise ValueError('Path exists: %s' % directory)
        else:
            os.makedirs(directory)
def fetch_json(url, params):
    # Accept either a bare site URL or a full API endpoint.
    if '/api/v1' in url:
        endpoint = url
    else:
        endpoint = "%s/api/v1/archive" % url
    try:
        response = requests.get(endpoint, params=params)
        response.raise_for_status()
        return response.json()
    except HTTPError as err:
        print(f'HTTP error occurred: {err}')
        raise ValueError(err)
    except Exception as err:
        print(f'Other error occurred: {err}')
        # Re-raise so the caller can wait and retry instead of iterating None.
        raise ValueError(err)
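# The archive endpoint returns a JSON array of post objects. A trimmed,
# illustrative sketch of the fields this script actually reads (real
# responses carry many more keys):
#
# [
#   {
#     "title": "Post title",
#     "subtitle": "Post subtitle",
#     "type": "newsletter",
#     "slug": "post-title",
#     "canonical_url": "https://example.substack.com/p/post-title",
#     "cover_image": "https://substackcdn.com/image/...",
#     "post_date": "2023-01-01T12:00:00.000Z"
#   },
#   ...
# ]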
def fetch_html(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # without this, HTTPError would never be raised
        return response.text
    except HTTPError as err:
        print(f'HTTP error occurred: {err}')
        raise ValueError(err)
    except Exception as err:
        print(f'Other error occurred: {err}')
        raise ValueError(err)
def fetch_and_parse(url, archive=None):
    try:
        limit = 12
        offset = 0
        results_len = 1
        if archive:
            # The archive file keeps one post slug per line so that
            # already-downloaded posts are skipped on later runs.
            if not os.path.exists(archive):
                Path(archive).touch()
            with open(archive, 'r') as f:
                files = f.readlines()
        else:
            files = []
        # Page through the archive endpoint until it returns an empty list.
        while results_len != 0:
            params = {'limit': limit, 'offset': offset}
            try:
                entries = fetch_json(url, params=params)
            except ValueError as err:
                print('Waiting 5 minutes')
                sleep(300)
                entries = fetch_json(url, params=params)
            for item in entries:
                Link = item['canonical_url']
                if '%s\n' % os.path.basename(Link) not in files:
                    Title = item['title']
                    Type = item['type']
                    Slug = item['slug']
                    Subtitle = item['subtitle']
                    Thumb = item['cover_image']
                    Date = item['post_date']
                    try:
                        Html = fetch_html(Link)
                    except ValueError as err:
                        print('Waiting 3 minutes')
                        sleep(180)
                        Html = fetch_html(Link)
                    soup = Soup(Html)
                    content = soup.find('div', {'class': 'markup'})
                    if content:
                        md = html2md(content.html)
                        images = content.find('img')
                        if Type == 'video':
                            videos = content.find('div', {'id': 'media-'}, partial=True)
                        else:
                            videos = []
                        yield {
                            'title': Title,
                            'subtitle': Subtitle,
                            'type': Type,
                            'link': Link,
                            'thumb': Thumb,
                            'md': md,
                            'images': images,
                            'videos': videos,
                            'date': Date,
                        }
                    # Random delay between post fetches to avoid hammering the site.
                    timeout = randrange(5, 60)
                    print('Waiting: %s' % timeout)
                    sleep(timeout)
                else:
                    # Post is already in the archive; pause briefly and move on.
                    timeout = randrange(5, 10)
                    print('Waiting %s seconds: %s' % (timeout, Link))
                    sleep(timeout)
            offset = limit + offset
            results_len = len(entries)
    except KeyboardInterrupt:
        sys.exit()
def html2md(html):
    return markdownify.markdownify(html)
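# A rough illustration of the conversion (exact whitespace and heading
# style depend on the markdownify version and its defaults):
#
#   html2md('<p>Some <em>emphasis</em> and a <a href="https://example.com">link</a>.</p>')
#   # -> 'Some *emphasis* and a [link](https://example.com).\n\n'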
def save_files(directory, items, archive=None):
    try:
        create_dir(directory)
        start = perf_counter()
        for item in items:
            print(item['title'])
            file_path = os.path.basename(item['link'])
            # Markdown-only output is disabled for now; the Markdown ends up
            # inside the JSON file instead.
            # with open('%s%s%s.md' % (directory, os.path.sep, file_path), 'w') as file:
            #     file.write(item['md'])
            #     print('File saved: %s%s%s.md' % (directory, os.path.sep, file_path))
            with open('%s%s%s.json' % (directory, os.path.sep, file_path), 'w') as file:
                file.write(json.dumps({
                    'title': item['title'],
                    'subtitle': item['subtitle'],
                    'type': item['type'],
                    'link': item['link'],
                    'date': item['date'],
                    'md': item['md'],
                }))
            if archive:
                # Record the post slug so later runs skip it.
                with open(archive, 'a') as saved:
                    saved.write('%s\n' % file_path)
            print('File saved: %s.json' % file_path)
            save_article_thumb(directory, item)
            asyncio.run(save_article_images(directory, item))
        end = perf_counter()
        print(f'It took {round(end-start, 0)} second(s) to complete.')
    except KeyboardInterrupt:
        sys.exit()
def save_image(url, file_path):
    if url:
        data = requests.get(url).content
        # Note: URLs with query strings will produce an odd or empty
        # extension here; images without an extension are silently skipped.
        ext = os.path.splitext(url)[1]
        if ext:
            with open('%s%s' % (file_path, ext), 'wb') as file:
                file.write(data)
            print('Image saved: %s%s' % (file_path, ext))
def save_article_thumb(directory, item):
    url = item['thumb']
    if url:
        file_path = '%s%s%s' % (directory, os.path.sep, os.path.basename(item['link']))
        save_image(url, file_path)
async def save_article_images(directory, item):
    # Note: save_image() uses blocking requests.get, so despite asyncio
    # the downloads effectively run one after another.
    async def download_image(url):
        if url:
            ext = os.path.splitext(url)[1]
            file_path = '%s%s%s%s%s' % (directory, os.path.sep,
                                        os.path.basename(item['link']),
                                        os.path.sep,
                                        os.path.basename(url).replace(ext, ''))
            d = os.path.dirname(file_path)
            if not os.path.isdir(d):
                os.makedirs(d)
            save_image(url, file_path)
    if item['images']:
        # gazpacho's find() returns a list for multiple matches and a
        # single Soup object for exactly one match.
        if isinstance(item['images'], list):
            urls = [img.attrs['src'] for img in item['images']]
        else:
            urls = [item['images'].attrs['src']]
        imgs = []
        for img in urls:
            imgs.append(asyncio.create_task(download_image(img)))
        await asyncio.gather(*imgs)
def arguments():
    parser = argparse.ArgumentParser(description='Substack Downloader')
    parser.add_argument('url', help='Substack URL to download')
    parser.add_argument('dir', help='Directory where to download')
    parser.add_argument('--archive', required=False, help='Archive file that keeps a list of downloaded posts')
    args = parser.parse_args()
    return args
if __name__ == '__main__':
    args = arguments()
    save_files(args.dir, fetch_and_parse(args.url, args.archive), args.archive)
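
Each post ends up as <slug>.json in the target directory, with its cover image saved alongside it and inline images under a <slug>/ subdirectory. An illustrative example of the JSON written per post (the values are made up; the keys come from save_files above):

{"title": "Post title", "subtitle": "Post subtitle", "type": "newsletter", "link": "https://example.substack.com/p/post-title", "date": "2023-01-01T12:00:00.000Z", "md": "The post body converted to Markdown..."}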