Script for downloading all images posted on a soup (soup.io)
#!/usr/bin/env bash
# Author: Mike Herwig
# Description: Script for downloading all images posted on a soup (soup.io)
# Which soup (host) to download from
HOST="fadenb.soup.io"
# Regex for matching image URLs; you may need to update it when soup.io changes its URL scheme
# 2017-02-14: Updated regex to catch new and old URLs
REGEX="http://asset-[a-z0-9]{1}\.(soup|soupcdn)\.(io|com)/asset/[0-9]{4}/[0-9]{4}_[a-z0-9]{4}(_[0-9]{2,3})?\.(jpeg|jpg|png|gif)"
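# For reference, a URL of the shape this pattern matches (made-up asset id, not a
# real file): http://asset-b.soup.io/asset/1234/5678_ab3c_500.jpeg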
BASE_URL="http://$HOST"
USER_AGENT="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"
COOKIE_FILE=".$HOST.cookie"
CACHE_CTRL="Cache-control: max-age=0"
OUTPUT_FILE="$HOST.html"
function browse {
    # Change -s to -v for verbose output
    curl -s -L -A "$USER_AGENT" -b "$COOKIE_FILE" -c "$COOKIE_FILE" -H "$CACHE_CTRL" -o "$OUTPUT_FILE" "$1"
    downloadImages
    # Pull the pagination URL of the next (older) page out of the saved HTML
    NEXT_URL=$(grep 'SOUP.Endless.next_url' "$OUTPUT_FILE" | sed -n "s/.*'\(.*\)';/\1/p")
    return 0
}
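# Note on pagination: the sed in browse expects the saved HTML to contain a line
# roughly like (hypothetical path):
#   SOUP.Endless.next_url = '/since/123456789';
# and captures the quoted path into NEXT_URL.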
function downloadImages {
    URL_ARR=($(grep -Eo "$REGEX" "$OUTPUT_FILE"))
    for IMG_URL in "${URL_ARR[@]}"; do
        # Skip consecutive size variants of the same asset and the 16px icons
        ALREADY_DOWNLOADED=$(echo "$PREV_URL" | grep "${IMG_URL%_*}")
        IS_ICON=$(echo "$IMG_URL" | grep "_16.png")
        if [ -z "$ALREADY_DOWNLOADED" ] && [ -z "$IS_ICON" ]; then
            # If you want the actual remote timestamps, remove '--no-use-server-timestamps'
            # -4: Cloudflare IPv6 was having issues :/
            # --timeout=10s: avoid waiting too long for a single file
            # --tries=2: avoid waiting too long for a single file; run the script multiple times to fetch everything
            # &: continue crawling while the download runs in the background
            wget -4 -nc --no-use-server-timestamps --timeout=10s --tries=2 -P "images.$HOST" "$IMG_URL" &
        fi
        PREV_URL=$IMG_URL
    done
}
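# Dedup example (same made-up asset id as above): for
#   IMG_URL=http://asset-b.soup.io/asset/1234/5678_ab3c_500.jpeg
# ${IMG_URL%_*} strips the size suffix, leaving .../5678_ab3c, which also
# prefixes the other size variants of that image, so a directly following
# variant is treated as already downloaded.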
while true; do
    URL="$BASE_URL$NEXT_URL"
    echo "$URL"
    browse "$URL"
    # browse sets NEXT_URL; an empty value means the last page was reached
    if [ -z "$NEXT_URL" ]; then
        break
    fi
done
echo "done."
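Usage (assuming the script is saved as soup_dl.sh; the filename is arbitrary):

    chmod +x soup_dl.sh
    ./soup_dl.sh

Set HOST at the top of the script to the soup you want to mirror. Images are
saved to images.$HOST; the fetched pages and cookies are written next to the
script as $HOST.html and .$HOST.cookie.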