Script for downloading all images posted on a soup (soup.io)
#!/usr/bin/env bash
# Author: Mike Herwig
# Description: Script for downloading all images posted on a soup (soup.io)
# Which soup (host) to download from
HOST="fadenb.soup.io"
# Regex for matching image URLs; you may need to update it when soup.io changes its URL scheme
# 2017-02-14: Updated regex to catch new and old URLs
REGEX="http://asset-[a-z0-9]{1}\.(soup|soupcdn)\.(io|com)/asset/[0-9]{4}/[0-9]{4}_[a-z0-9]{4}(_[0-9]{2,3})?\.(jpeg|jpg|png|gif)"
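# For reference, a URL of the shape this pattern matches (made-up asset id, not a
# real file): http://asset-b.soup.io/asset/1234/5678_ab3c_500.jpeg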
BASE_URL="http://$HOST"
USER_AGENT="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"
COOKIE_FILE=".$HOST.cookie"
CACHE_CTRL="Cache-control: max-age=0"
OUTPUT_FILE="$HOST.html"
function browse {
    # Change -s to -v for verbose output
    curl -s -L -A "$USER_AGENT" -b "$COOKIE_FILE" -c "$COOKIE_FILE" -H "$CACHE_CTRL" -o "$OUTPUT_FILE" "$1"
    downloadImages
    # Pull the pagination URL of the next (older) page out of the saved HTML
    NEXT_URL=$(grep 'SOUP.Endless.next_url' "$OUTPUT_FILE" | sed -n "s/.*'\(.*\)';/\1/p")
    return 0
}
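# Note on pagination: the sed in browse expects the saved HTML to contain a line
# roughly like (hypothetical path):
#   SOUP.Endless.next_url = '/since/123456789';
# and captures the quoted path into NEXT_URL.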
function downloadImages {
    URL_ARR=($(grep -Eo "$REGEX" "$OUTPUT_FILE"))
    for IMG_URL in "${URL_ARR[@]}"; do
        # Skip consecutive size variants of the same asset and the 16px icons
        ALREADY_DOWNLOADED=$(echo "$PREV_URL" | grep "${IMG_URL%_*}")
        IS_ICON=$(echo "$IMG_URL" | grep "_16.png")
        if [ -z "$ALREADY_DOWNLOADED" ] && [ -z "$IS_ICON" ]; then
            # If you want the actual remote timestamps, remove '--no-use-server-timestamps'
            # -4: Cloudflare IPv6 was having issues :/
            # --timeout=10s: avoid waiting too long for a single file
            # --tries=2: avoid waiting too long for a single file; run the script multiple times to fetch everything
            # &: continue crawling while the download runs in the background
            wget -4 -nc --no-use-server-timestamps --timeout=10s --tries=2 -P "images.$HOST" "$IMG_URL" &
        fi
        PREV_URL=$IMG_URL
    done
}
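# Dedup example (same made-up asset id as above): for
#   IMG_URL=http://asset-b.soup.io/asset/1234/5678_ab3c_500.jpeg
# ${IMG_URL%_*} strips the size suffix, leaving .../5678_ab3c, which also
# prefixes the other size variants of that image, so a directly following
# variant is treated as already downloaded.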
while true; do
    URL="$BASE_URL$NEXT_URL"
    echo "$URL"
    browse "$URL"
    # browse sets NEXT_URL; an empty value means the last page was reached
    if [ -z "$NEXT_URL" ]; then
        break
    fi
done
echo "done."
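Usage (assuming the script is saved as soup_dl.sh; the filename is arbitrary):

    chmod +x soup_dl.sh
    ./soup_dl.sh

Set HOST at the top of the script to the soup you want to mirror. Images are
saved to images.$HOST; the fetched pages and cookies are written next to the
script as $HOST.html and .$HOST.cookie.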