ddnomad/scrape_fakku_authors.sh

## scrape_fakku_authors.sh
#!/usr/bin/env bash
#
# Scrape the names of Fakku authors who have works under a given tag, or
# print the list of tags that can be passed to this script.
#
# Abort on failing commands, on unset variables and on a failure in any
# stage of a pipeline.
set -euo pipefail

# URL template of one tag listing page; main fills in the @tag and
# @page_num placeholders.
readonly BASE_URL_TPL='https://www.fakku.net/tags/@tag/page/@page_num'
# URL of the page that lists every tag.
readonly TAGS_URL='https://www.fakku.net/tags'
# Value of the User-Agent header sent with every curl request.
readonly USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0'

# Help text that main prints to STDERR when the argument count is wrong.
# The value is assigned first and marked readonly afterwards so that the
# readonly builtin cannot mask a failing command substitution.
SCRIPT_USAGE="$(cat <<EOF
Usage: $0 (--print-all-tags | TAG)

Scrape a list of Fakku authors who have works tagged with TAG.
Pipe STDOUT to sort -u to deduplicate the results (if needed).

If --print-all-tags specified, print a list of all available
tags in 'TAG_PRETTY_NAME: TAG' format. Do not use pretty names
as arguments to this script as that will not work.

Example:
    # Print a list of available tags to STDOUT
    $0 --print-all-tags

    # Scrape all author names who have works tagged with 'ecchi'
    # tag. Deduplicate and save to ./fakku_ecchi_authors.txt file
    # in the current directory.
    $0 ecchi | sort -u | tee ./fakku_ecchi_authors.txt
EOF
)"
readonly SCRIPT_USAGE


# Entry point: validate the arguments, then either list all tags or scrape
# the author names for one tag.
#
# Arguments:
#   $1 - either the literal flag --print-all-tags or the tag to scrape.
# Globals (read only):
#   SCRIPT_USAGE, USER_AGENT, TAGS_URL, BASE_URL_TPL
# Outputs:
#   STDOUT - 'TAG_PRETTY_NAME: TAG' lines (sorted, deduplicated) for
#            --print-all-tags, otherwise one author name per line.
#   STDERR - usage text, progress messages and error messages.
# Exit status (every path ends the script with exit):
#   0 - the tag list was printed, or at least one page of authors was
#       scraped before a page produced nothing.
#   1 - wrong number of arguments, curl is not available, or the very
#       first page of the requested tag produced nothing.
#   With errexit enabled (as at the top of this script) a failing tag-list
#   pipeline aborts the script with that pipeline's status instead.
# Relies on 'set -o pipefail' (enabled at the top of this script) so that a
# failed download or an empty match fails the whole scraping pipeline.
function main {
    if [[ "$#" -ne 1 ]]; then
        >&2 echo "${SCRIPT_USAGE}"
        exit 1
    fi

    if ! command -v curl &> /dev/null; then
        >&2 echo 'Error: Failed to locate curl command'
        exit 1
    fi

    local tag
    tag="$1"

    if [[ "${tag}" == '--print-all-tags' ]]; then
        >&2 echo '---(i) INFO: Scraping a list of all Fakku tags'

        # Turn every <a href="/tags/TAG">PRETTY NAME</a> line into
        # 'PRETTY NAME: TAG'.
        curl --fail --show-error --silent --header "User-Agent: ${USER_AGENT}" "${TAGS_URL}" | \
            sed -ne 's@^.*href="/tags/\(..*\)">\(..*\)</.*$@\2: \1@p' | \
            sort -u

        exit 0
    fi

    local page_num url
    page_num=1

    while true; do
        >&2 echo "---(i) INFO: Scraping Fakku author names: tag=\"${tag}\", page=${page_num}"

        url="${BASE_URL_TPL/@tag/"${tag}"}"
        url="${url/@page_num/"${page_num}"}"

        # The pipeline keeps the line that follows each artist link, cuts it
        # at the first '<' and lets awk trim the surrounding whitespace.
        # It is tested in a condition so that its failure (curl error or no
        # matching line) ends the loop deliberately instead of letting
        # errexit abort the script with a non-zero status after a
        # successful scrape.
        if ! curl --fail --show-error --silent --header "User-Agent: ${USER_AGENT}" "${url}" | \
            grep -A 1 'href="/artists/' | \
            grep '</a>' | \
            cut -f1 -d'<' | \
            awk '{$1=$1};1'; then

            if [[ "${page_num}" -eq 1 ]]; then
                >&2 echo "Error: Failed to scrape any author names: tag=\"${tag}\""
                exit 1
            fi

            >&2 echo "---(i) INFO: Finished scraping Fakku author names: tag=\"${tag}\", pages=$((page_num - 1))"
            exit 0
        fi

        page_num=$((page_num + 1))
    done
}

# Run main with the script's command-line arguments, passed through unchanged.
main "$@"
	#!/usr/bin/env bash
	#
	# Scrape the names of Fakku authors who have works under a given tag, or
	# print the list of tags that can be passed to this script.
	#
	# Abort on failing commands, on unset variables and on a failure in any
	# stage of a pipeline.
	set -euo pipefail

	# URL template of one tag listing page; main fills in the @tag and
	# @page_num placeholders.
	readonly BASE_URL_TPL='https://www.fakku.net/tags/@tag/page/@page_num'
	# URL of the page that lists every tag.
	readonly TAGS_URL='https://www.fakku.net/tags'
	# Value of the User-Agent header sent with every curl request.
	readonly USER_AGENT='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0'

	# Help text that main prints to STDERR when the argument count is wrong.
	# '<<-' strips the leading tabs, so the tab-indented text and terminator
	# below form a valid here-document; the spaces after the tab survive.
	# The value is assigned first and marked readonly afterwards so that the
	# readonly builtin cannot mask a failing command substitution.
	SCRIPT_USAGE="$(cat <<-EOF
	Usage: $0 (--print-all-tags | TAG)

	Scrape a list of Fakku authors who have works tagged with TAG.
	Pipe STDOUT to sort -u to deduplicate the results (if needed).

	If --print-all-tags specified, print a list of all available
	tags in 'TAG_PRETTY_NAME: TAG' format. Do not use pretty names
	as arguments to this script as that will not work.

	Example:
	    # Print a list of available tags to STDOUT
	    $0 --print-all-tags

	    # Scrape all author names who have works tagged with 'ecchi'
	    # tag. Deduplicate and save to ./fakku_ecchi_authors.txt file
	    # in the current directory.
	    $0 ecchi | sort -u | tee ./fakku_ecchi_authors.txt
	EOF
	)"
	readonly SCRIPT_USAGE


	# Entry point: validate the arguments, then either list all tags or scrape
	# the author names for one tag.
	#
	# Arguments:
	#   $1 - either the literal flag --print-all-tags or the tag to scrape.
	# Globals (read only):
	#   SCRIPT_USAGE, USER_AGENT, TAGS_URL, BASE_URL_TPL
	# Outputs:
	#   STDOUT - 'TAG_PRETTY_NAME: TAG' lines (sorted, deduplicated) for
	#            --print-all-tags, otherwise one author name per line.
	#   STDERR - usage text, progress messages and error messages.
	# Exit status (every path ends the script with exit):
	#   0 - the tag list was printed, or at least one page of authors was
	#       scraped before a page produced nothing.
	#   1 - wrong number of arguments, curl is not available, or the very
	#       first page of the requested tag produced nothing.
	#   With errexit enabled, a failing tag-list pipeline aborts the script
	#   with that pipeline's status instead.
	# Relies on 'set -o pipefail' being enabled so that a failed download or
	# an empty match fails the whole scraping pipeline.
	function main {
	    if [[ "$#" -ne 1 ]]; then
	        >&2 echo "${SCRIPT_USAGE}"
	        exit 1
	    fi

	    if ! command -v curl &> /dev/null; then
	        >&2 echo 'Error: Failed to locate curl command'
	        exit 1
	    fi

	    local tag
	    tag="$1"

	    if [[ "${tag}" == '--print-all-tags' ]]; then
	        >&2 echo '---(i) INFO: Scraping a list of all Fakku tags'

	        # Turn every <a href="/tags/TAG">PRETTY NAME</a> line into
	        # 'PRETTY NAME: TAG'.
	        curl --fail --show-error --silent --header "User-Agent: ${USER_AGENT}" "${TAGS_URL}" | \
	            sed -ne 's@^.*href="/tags/\(..*\)">\(..*\)</.*$@\2: \1@p' | \
	            sort -u

	        exit 0
	    fi

	    local page_num url
	    page_num=1

	    while true; do
	        >&2 echo "---(i) INFO: Scraping Fakku author names: tag=\"${tag}\", page=${page_num}"

	        url="${BASE_URL_TPL/@tag/"${tag}"}"
	        url="${url/@page_num/"${page_num}"}"

	        # The pipeline keeps the line that follows each artist link, cuts
	        # it at the first '<' and lets awk trim the surrounding whitespace.
	        # It is tested in a condition so that its failure (curl error or
	        # no matching line) ends the loop deliberately instead of letting
	        # errexit abort the script with a non-zero status after a
	        # successful scrape.
	        if ! curl --fail --show-error --silent --header "User-Agent: ${USER_AGENT}" "${url}" | \
	            grep -A 1 'href="/artists/' | \
	            grep '</a>' | \
	            cut -f1 -d'<' | \
	            awk '{$1=$1};1'; then

	            if [[ "${page_num}" -eq 1 ]]; then
	                >&2 echo "Error: Failed to scrape any author names: tag=\"${tag}\""
	                exit 1
	            fi

	            >&2 echo "---(i) INFO: Finished scraping Fakku author names: tag=\"${tag}\", pages=$((page_num - 1))"
	            exit 0
	        fi

	        page_num=$((page_num + 1))
	    done
	}

	# Run main with the script's command-line arguments, passed through unchanged.
	main "$@"