Skip to content

Instantly share code, notes, and snippets.

@neingeist
Created September 18, 2018 11:29
Show Gist options
  • Save neingeist/74097939d45ef97572f62b627da95417 to your computer and use it in GitHub Desktop.
Save neingeist/74097939d45ef97572f62b627da95417 to your computer and use it in GitHub Desktop.
Script for downloading all images posted on a soup (soup.io)
#!/bin/bash
# Author: Mike Herwig
# Description: Script for downloading all images posted on a soup (soup.io)
# Usage: pass the soup's host name as the first argument; when it is omitted
#        the default below is used.

# Where to download from (the first script argument overrides the default).
HOST="${1:-neingeist.soup.io}"
# Extended regex (used with grep -E) matching the image URLs; you might want to
# update it when soup.io changes their URLs.
REGEX="http://asset-[a-z0-9]{1}\.(soupcdn\.com|soup\.io)/asset/[0-9]{4}/[0-9]{4}_[a-z0-9]{4}(_[0-9]{2,3})?\.(jpeg|jpg|png|gif)"
# Page URLs are built by appending the "next" path to this base.
BASE_URL="http://$HOST"
# User agent string sent with every page request.
USER_AGENT="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"
# Cookie jar that is both read and written by curl, one per host.
COOKIE_FILE=".$HOST.cookie"
# Extra request header sent with every page request.
CACHE_CTRL="Cache-control: max-age=0"
# File the most recently fetched page is stored in.
OUTPUT_FILE="$HOST.html"
#######################################
# Fetch one page, download the images it references, and extract the path of
# the next page.
# Globals:   USER_AGENT, COOKIE_FILE, CACHE_CTRL, OUTPUT_FILE (read)
#            NEXT_URL (written; empty when no next page was found or the
#            fetch failed)
# Arguments: $1 - URL of the page to fetch
# Returns:   0 on success, 1 when curl could not fetch the page
#######################################
browse() {
  # change -s to -v for verbose
  if ! curl -s -L -A "$USER_AGENT" -b "$COOKIE_FILE" -c "$COOKIE_FILE" \
    -H "$CACHE_CTRL" -o "$OUTPUT_FILE" "$1"; then
    # A failed transfer may leave the previously fetched page in $OUTPUT_FILE.
    # Parsing it again would yield the same NEXT_URL and the caller would
    # request the same page forever, so stop paging instead.
    printf 'error: could not fetch %s\n' "$1" >&2
    NEXT_URL=""
    return 1
  fi

  downloadImages

  # The page carries a line such as  SOUP.Endless.next_url = '/since/...';
  # keep only the text between the quotes.
  NEXT_URL=$(sed -n "/SOUP.Endless.next_url/s/.*'\(.*\)';/\1/p" "$OUTPUT_FILE")
  return 0
}
#######################################
# Download every image URL found in the most recently fetched page.
# A URL is skipped when the previously seen URL contains it (minus its last
# "_..." part), which filters out other variants of the same image, or when
# it contains "_16.png" (icons).
# Globals:   REGEX, OUTPUT_FILE, HOST (read)
#            PREV_URL (read and written; deliberately global so the
#            comparison carries over from one page to the next)
# Arguments: none
# Outputs:   files written by wget into the directory "images.$HOST"
#######################################
downloadImages() {
  local -a urls=()
  local url

  # Collect all matches first so wget does not share stdin with the reader.
  while IFS= read -r url; do
    urls+=("$url")
  done < <(grep -Eo -- "$REGEX" "$OUTPUT_FILE")

  for url in "${urls[@]}"; do
    # Literal substring tests: the URLs are data, not regular expressions.
    if [[ "$PREV_URL" != *"${url%_*}"* && "$url" != *_16.png* ]]; then
      # if you want the actual remote timestamps remove '--no-use-server-timestamps'
      wget -nc --no-use-server-timestamps -P "images.$HOST" "$url"
    fi
    PREV_URL=$url
  done
}
# Walk the soup page by page: start at BASE_URL (NEXT_URL is empty on the
# first pass) and follow the path browse stores in NEXT_URL until a page
# yields none.
while true; do
  URL="${BASE_URL}${NEXT_URL}"
  # Quoted: the URL may contain '?', which the shell would otherwise treat
  # as a glob character.
  printf '%s\n' "$URL"
  browse "$URL"
  if [[ -z "$NEXT_URL" ]]; then
    break
  fi
done
echo "done."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment