fonic/cinemassacre-avgn-episode-scraper.sh

## cinemassacre-avgn-episode-scraper.sh
#!/usr/bin/env bash

# -------------------------------------------------------------------------
#                                                                         -
#  Cinemassacre.com Angry Video Game Nerd (AVGN) Episode Scraper          -
#                                                                         -
#  Created by Fonic <https://github.com/fonic>                            -
#  Date: 02/18/23 - 02/19/23                                              -
#                                                                         -
#  Why use cinemassacre.com instead of Youtube?:                          -
#  1) cinemassacre.com is THE reference source for AVGN episodes          -
#  2) video titles on cinemassacre.com contain episode numbers            -
#                                                                         -
#  Caveats:                                                               -
#  Most videos listed on cinemassacre.com are stored on odysee.com.       -
#  Downloads from there are slow and unstable. Thus, one might be         -
#  better off downloading videos from YouTube instead and just using      -
#  episode and title information from this scraper.                       -
#                                                                         -
# -------------------------------------------------------------------------

# --------------------------------------
#  Globals                             -
# --------------------------------------

# Options passed to every curl invocation
CURL_OPTS=(
	--fail
	--location
	--silent
	--show-error
	--retry 2
	--connect-timeout 60
)

# Video listing pages: printf-style URL template (%d = page number) and number
# of pages to process. The template also works for page 1, which redirects to
# the URL without trailing '/page/x/'
LIST_PAGE_URL='https://cinemassacre.com/category/angry-video-game-nerd/page/%d/'
LIST_PAGE_COUNT=6

# Regex for a video link on a listing page (group 1: video URL, group 2: video title)
VIDEO_PAGE_URL_RE='<a href="([^"]+)" class="video-title">([^<]+)</a>'

# Regex for episode/part within a video title (group 1: episode number; the
# part is optional and, if present, its number ends up in group 3, not group 2!)
VIDEO_TITLE_EPISODE_PART_RE='Episode ([0-9]+)( Part ([0-9]+))?'

# Parsing of a listing page stops at this line to skip unrelated videos at the
# bottom of the page
VIDEO_PAGE_STOP_MARKER='<div class="search-box-wrapper">'

# Result files for video page URLs and video titles
VIDEO_PAGE_URLS_FILE='video-page-urls.txt'
VIDEO_TITLES_FILE='video-titles.txt'

# Regex for the embedded player on a video page (group 1: video download URL)
# and result file for video download URLs
VIDEO_DOWNLOAD_URL_RE='<div class="posts-video"><iframe.*src="([^"]+)".*</iframe>'
VIDEO_DOWNOAD_URLS_FILE='video-download-urls.txt'


# --------------------------------------
#  Functions                           -
# --------------------------------------

# Message helpers. Each one joins its arguments into a single message ($*) and
# prints it using 'echo -e', i.e. backslash escapes in the message are expanded:
#   printn - normal message,            written to stdout
#   printh - hilite message (bold),     written to stdout
#   printg - good message (bold green), written to stdout
#   printw - warning (bold yellow),     written to stderr
#   printe - error (bold red),          written to stderr
printn() {
	echo -e "$*"
}
printh() {
	echo -e "\e[1m$*\e[0m"
}
printg() {
	echo -e "\e[1;32m$*\e[0m"
}
printw() {
	echo -e "\e[1;33m$*\e[0m" >&2
}
printe() {
	echo -e "\e[1;31m$*\e[0m" >&2
}


# --------------------------------------
#  Main                                -
# --------------------------------------

# Set up error handling: exit on failing commands (-e) and on expansion of
# unset variables (-u); the ERR trap reports the line number before exiting
set -e -u
trap 'printe "Error: an unhandled error occurred on line ${LINENO}, aborting."; exit 1' ERR

# Set up storages (associative arrays, keyed by episode/part identifier)
declare -A video_page_urls=() video_titles=() video_download_urls=()

# Process video listing pages and extract video page URLs (both on cinemassacre.com)
for ((list_page_index=1; list_page_index <= LIST_PAGE_COUNT; list_page_index++)); do
	printh "Processing video listing page ${list_page_index}:"
	printf -v list_page_url "${LIST_PAGE_URL}" "${list_page_index}"
	printn "Video listing page '${list_page_url}'..."

	# Download the listing page before parsing it. When curl is run via process
	# substitution ('done < <(curl ...)'), its exit status is discarded, so a
	# failed download would go unnoticed and the result files would be written
	# from incomplete data
	if ! list_page_html="$(curl "${CURL_OPTS[@]}" "${list_page_url}")"; then
		printe "Error: failed to download video listing page '${list_page_url}', aborting."
		exit 1
	fi

	# Parse listing page, extract video page URLs
	# ('read' with default IFS trims leading/trailing whitespace, so the stop
	# marker is recognized regardless of how the line is indented)
	url_title_count=0
	while read -r line; do
		if [[ "${line}" =~ ${VIDEO_PAGE_URL_RE} ]]; then
			video_page_url="${BASH_REMATCH[1]}"
			video_title="${BASH_REMATCH[2]}"

			# Determine episode and part
			if [[ "${video_title}" =~ ${VIDEO_TITLE_EPISODE_PART_RE} ]]; then
				if [[ -z "${BASH_REMATCH[3]}" ]]; then # got part?
					episode_part="E${BASH_REMATCH[1]}" # episode only
				else
					episode_part="E${BASH_REMATCH[1]}_P${BASH_REMATCH[3]}" # episode + part
				fi
			else
				# NOTE: all titles without episode number share this key,
				# i.e. only the last one of them is kept in the storages
				printw "No episode/part match for title '${video_title}'"
				episode_part="E???"
			fi

			# Store video URL and video title
			video_page_urls["${episode_part}"]="${video_page_url}"
			video_titles["${episode_part}"]="${video_title}"
			url_title_count=$((url_title_count + 1))

			#printd "[DEBUG] Video URL: '${video_page_url}', video title: '${video_title}'"
		elif [[ "${line}" == "${VIDEO_PAGE_STOP_MARKER}" ]]; then
			break # stop parsing
		#else
		#	printd "[DEBUG] No match for line: '${line}'"
		fi
	done <<< "${list_page_html}"

	printn "Got ${url_title_count} video page URLs and video titles from video listing page."
done
printn
printg "Got ${#video_page_urls[@]} video page URLs and video titles from video listing pages in total."
printn

# Save video page URLs and video titles to file
printh "Saving video page URLs and video titles to files:"

# Video page URLs: keep a copy of an existing file as '<file>.old', then write
# one '<episode/part>: <URL>' line per entry, version-sorted
if [[ -f "${VIDEO_PAGE_URLS_FILE}" ]]; then
	printn "Backing up existing '${VIDEO_PAGE_URLS_FILE}'..."
	cp "${VIDEO_PAGE_URLS_FILE}" "${VIDEO_PAGE_URLS_FILE}.old"
fi
printn "Writing '${VIDEO_PAGE_URLS_FILE}'..."
for key in "${!video_page_urls[@]}"; do
	printf '%s: %s\n' "${key}" "${video_page_urls["${key}"]}"
done | sort -V | uniq > "${VIDEO_PAGE_URLS_FILE}"

# Video titles: same procedure, but with the HTML entities occurring in titles
# replaced by plain characters
if [[ -f "${VIDEO_TITLES_FILE}" ]]; then
	printn "Backing up existing '${VIDEO_TITLES_FILE}'..."
	cp "${VIDEO_TITLES_FILE}" "${VIDEO_TITLES_FILE}.old"
fi
printn "Writing '${VIDEO_TITLES_FILE}'..."
for key in "${!video_titles[@]}"; do
	title="${video_titles["${key}"]}"

	# Replacement strings are quoted to have '&' taken literally
	title="${title//"&#038;"/"&"}"  # ampersand                -> '&'
	title="${title//"&#8211;"/"-"}" # en dash                  -> '-' (alternative: ':')
	title="${title//"&#8217;"/"'"}" # right single quote mark  -> "'"
	title="${title//"&#8220;"/"'"}" # left double quote mark   -> "'"
	title="${title//"&#8221;"/"'"}" # right double quote mark  -> "'"
	#title="${title//"&;"/""}" # &; == '' -> ''

	printf '%s: %s\n' "${key}" "${title}"
done | sort -V | uniq > "${VIDEO_TITLES_FILE}"
printn

# Process video pages (on cinemassacre.com) to extract video download URLs (videos are stored on odysee.com, lbry.tv, and Youtube)
printh "Processing video pages to extract video download URLs..."
for episode_part in "${!video_page_urls[@]}"; do
	video_page_url="${video_page_urls["${episode_part}"]}"
	printn "Video page '${video_page_url}'..."

	# Scan the page line by line; every line matching the embedded player
	# regex yields a download URL (a later match replaces an earlier one)
	url_count=0
	while read -r line; do
		[[ "${line}" =~ ${VIDEO_DOWNLOAD_URL_RE} ]] || continue
		video_download_url="${BASH_REMATCH[1]}"
		video_download_urls["${episode_part}"]="${video_download_url}"
		url_count=$((url_count + 1))
		#printd "[DEBUG] Video download URL: '${video_download_url}'"
	done < <(curl "${CURL_OPTS[@]}" "${video_page_url}")

	# Exactly one URL per video page is expected; report anything else (could
	# be 0 or more than 1)
	(( url_count == 1 )) || printw "Got ${url_count} video download URLs from video page."
done
printn
printg "Got ${#video_download_urls[@]} video download URLs from video pages in total."
printn

# Save video download URLs to file: keep a copy of an existing file as
# '<file>.old', then write one '<episode/part>: <URL>' line per entry,
# version-sorted
printh "Saving video download URLs to file..."
if [[ -f "${VIDEO_DOWNOAD_URLS_FILE}" ]]; then
	printn "Backing up existing '${VIDEO_DOWNOAD_URLS_FILE}'..."
	cp "${VIDEO_DOWNOAD_URLS_FILE}" "${VIDEO_DOWNOAD_URLS_FILE}.old"
fi
printn "Writing '${VIDEO_DOWNOAD_URLS_FILE}'..."
for key in "${!video_download_urls[@]}"; do
	printf '%s: %s\n' "${key}" "${video_download_urls["${key}"]}"
done | sort -V | uniq > "${VIDEO_DOWNOAD_URLS_FILE}"
	#!/usr/bin/env bash

	# -------------------------------------------------------------------------
	# -
	# Cinemassacre.com Angry Video Game Nerd (AVGN) Episode Scraper -
	# -
	# Created by Fonic <https://github.com/fonic> -
	# Date: 02/18/23 - 02/19/23 -
	# -
	# Why use cinemassacre.com instead of Youtube?: -
	# 1) cinemassacre.com is THE reference source for AVGN episodes -
	# 2) video titles on cinemassacre.com contain episode numbers -
	# -
	# Caveats: -
	# Most videos listed on cinemassacre.com are stored on odysee.com. -
	# Downloads from there are slow and unstable. Thus, one might be -
	# better off to download videos from Youtube instead and just use -
	# episode and title information from this scraper. -
	# -
	# -------------------------------------------------------------------------

	# --------------------------------------
	# Globals -
	# --------------------------------------

	# Options passed to every curl invocation
	CURL_OPTS=("--fail" "--location" "--silent" "--show-error" "--retry" "2" "--connect-timeout" "60")

	# Video listing pages: URL template (%d = page number), number of pages,
	# regexes for video links and episode/part, and the line at which parsing stops
	LIST_PAGE_URL="https://cinemassacre.com/category/angry-video-game-nerd/page/%d/" # also works for page 1 which redirects to URL without trailing '/page/x/'
	LIST_PAGE_COUNT=6
	VIDEO_PAGE_URL_RE='<a href="([^"]+)" class="video-title">([^<]+)</a>' # video URL, video title
	VIDEO_TITLE_EPISODE_PART_RE="Episode ([0-9]+)( Part ([0-9]+))?" # part is optional (part number will be in BASH_REMATCH[3]!)
	VIDEO_PAGE_STOP_MARKER='<div class="search-box-wrapper">' # stop parsing here to skip unrelated videos at bottom of listing page

	# Result files for video page URLs and video titles
	VIDEO_PAGE_URLS_FILE="video-page-urls.txt"
	VIDEO_TITLES_FILE="video-titles.txt"

	# Regex for the embedded player on a video page (group 1: download URL).
	# The '.*' parts are required to skip other iframe attributes located
	# before and after 'src'; with a single '.' the regex only matches when
	# exactly one character separates '<iframe' from 'src' and the closing
	# quote from '</iframe>'
	VIDEO_DOWNLOAD_URL_RE='<div class="posts-video"><iframe.*src="([^"]+)".*</iframe>'
	VIDEO_DOWNOAD_URLS_FILE="video-download-urls.txt"


	# --------------------------------------
	# Functions -
	# --------------------------------------

	# Message helpers. Each one joins its arguments into a single message ($*)
	# and prints it using 'echo -e', i.e. backslash escapes are expanded:
	#   printn - normal message,            written to stdout
	#   printh - hilite message (bold),     written to stdout
	#   printg - good message (bold green), written to stdout
	#   printw - warning (bold yellow),     written to stderr
	#   printe - error (bold red),          written to stderr
	printn() {
		echo -e "$*"
	}
	printh() {
		echo -e "\e[1m$*\e[0m"
	}
	printg() {
		echo -e "\e[1;32m$*\e[0m"
	}
	printw() {
		echo -e "\e[1;33m$*\e[0m" >&2
	}
	printe() {
		echo -e "\e[1;31m$*\e[0m" >&2
	}


	# --------------------------------------
	# Main -
	# --------------------------------------

	# Set up error handling: exit on failing commands (-e) and on expansion of
	# unset variables (-u); the ERR trap reports the line number before exiting
	set -e -u
	trap 'printe "Error: an unhandled error occurred on line ${LINENO}, aborting."; exit 1' ERR

	# Set up storages (associative arrays, keyed by episode/part identifier)
	declare -A video_page_urls=() video_titles=() video_download_urls=()

	# Process video listing pages and extract video page URLs (both on cinemassacre.com)
	for ((list_page_index=1; list_page_index <= LIST_PAGE_COUNT; list_page_index++)); do
		printh "Processing video listing page ${list_page_index}:"
		printf -v list_page_url "${LIST_PAGE_URL}" "${list_page_index}"
		printn "Video listing page '${list_page_url}'..."

		# Download the listing page before parsing it. When curl is run via
		# process substitution ('done < <(curl ...)'), its exit status is
		# discarded, so a failed download would go unnoticed and the result
		# files would be written from incomplete data
		if ! list_page_html="$(curl "${CURL_OPTS[@]}" "${list_page_url}")"; then
			printe "Error: failed to download video listing page '${list_page_url}', aborting."
			exit 1
		fi

		# Parse listing page, extract video page URLs
		# ('read' with default IFS trims leading/trailing whitespace, so the
		# stop marker is recognized regardless of how the line is indented)
		url_title_count=0
		while read -r line; do
			if [[ "${line}" =~ ${VIDEO_PAGE_URL_RE} ]]; then
				video_page_url="${BASH_REMATCH[1]}"
				video_title="${BASH_REMATCH[2]}"

				# Determine episode and part
				if [[ "${video_title}" =~ ${VIDEO_TITLE_EPISODE_PART_RE} ]]; then
					if [[ -z "${BASH_REMATCH[3]}" ]]; then # got part?
						episode_part="E${BASH_REMATCH[1]}" # episode only
					else
						episode_part="E${BASH_REMATCH[1]}_P${BASH_REMATCH[3]}" # episode + part
					fi
				else
					# NOTE: all titles without episode number share this key,
					# i.e. only the last one of them is kept in the storages
					printw "No episode/part match for title '${video_title}'"
					episode_part="E???"
				fi

				# Store video URL and video title
				video_page_urls["${episode_part}"]="${video_page_url}"
				video_titles["${episode_part}"]="${video_title}"
				url_title_count=$((url_title_count + 1))

				#printd "[DEBUG] Video URL: '${video_page_url}', video title: '${video_title}'"
			elif [[ "${line}" == "${VIDEO_PAGE_STOP_MARKER}" ]]; then
				break # stop parsing
			#else
			#	printd "[DEBUG] No match for line: '${line}'"
			fi
		done <<< "${list_page_html}"

		printn "Got ${url_title_count} video page URLs and video titles from video listing page."
	done
	printn
	printg "Got ${#video_page_urls[@]} video page URLs and video titles from video listing pages in total."
	printn

	# Save video page URLs and video titles to file
	printh "Saving video page URLs and video titles to files:"

	# Video page URLs: keep a copy of an existing file as '<file>.old', then
	# write one '<episode/part>: <URL>' line per entry, version-sorted
	if [[ -f "${VIDEO_PAGE_URLS_FILE}" ]]; then
		printn "Backing up existing '${VIDEO_PAGE_URLS_FILE}'..."
		cp "${VIDEO_PAGE_URLS_FILE}" "${VIDEO_PAGE_URLS_FILE}.old"
	fi
	printn "Writing '${VIDEO_PAGE_URLS_FILE}'..."
	for episode_part in "${!video_page_urls[@]}"; do
		echo "${episode_part}: ${video_page_urls["${episode_part}"]}"
	done | sort -V | uniq > "${VIDEO_PAGE_URLS_FILE}"

	# Video titles: same procedure, but with the HTML entities occurring in
	# titles replaced by plain characters
	if [[ -f "${VIDEO_TITLES_FILE}" ]]; then
		printn "Backing up existing '${VIDEO_TITLES_FILE}'..."
		cp "${VIDEO_TITLES_FILE}" "${VIDEO_TITLES_FILE}.old"
	fi
	printn "Writing '${VIDEO_TITLES_FILE}'..."
	for episode_part in "${!video_titles[@]}"; do
		video_title="${video_titles["${episode_part}"]}"

		# The patterns must be the numeric HTML entities as they appear in the
		# page source, not the decoded characters; replacement strings are
		# quoted to have '&' taken literally
		video_title="${video_title//"&#038;"/"&"}"  # ampersand                -> '&'
		video_title="${video_title//"&#8211;"/"-"}" # en dash                  -> '-' (alternative: ':')
		video_title="${video_title//"&#8217;"/"'"}" # right single quote mark  -> "'"
		video_title="${video_title//"&#8220;"/"'"}" # left double quote mark   -> "'"
		video_title="${video_title//"&#8221;"/"'"}" # right double quote mark  -> "'"
		#video_title="${video_title//"&;"/""}" # &; == '' -> ''

		echo "${episode_part}: ${video_title}"
	done | sort -V | uniq > "${VIDEO_TITLES_FILE}"
	printn

	# Process video pages (on cinemassacre.com) to extract video download URLs (videos are stored on odysee.com, lbry.tv, and Youtube)
	printh "Processing video pages to extract video download URLs..."
	for episode_part in "${!video_page_urls[@]}"; do
		video_page_url="${video_page_urls["${episode_part}"]}"
		printn "Video page '${video_page_url}'..."

		# Scan the page line by line; every line matching the embedded player
		# regex yields a download URL (a later match replaces an earlier one)
		url_count=0
		while read -r line; do
			[[ "${line}" =~ ${VIDEO_DOWNLOAD_URL_RE} ]] || continue
			video_download_url="${BASH_REMATCH[1]}"
			video_download_urls["${episode_part}"]="${video_download_url}"
			url_count=$((url_count + 1))
			#printd "[DEBUG] Video download URL: '${video_download_url}'"
		done < <(curl "${CURL_OPTS[@]}" "${video_page_url}")

		# Exactly one URL per video page is expected; report anything else
		# (could be 0 or more than 1)
		(( url_count == 1 )) || printw "Got ${url_count} video download URLs from video page."
	done
	printn
	printg "Got ${#video_download_urls[@]} video download URLs from video pages in total."
	printn

	# Save video download URLs to file: keep a copy of an existing file as
	# '<file>.old', then write one '<episode/part>: <URL>' line per entry,
	# version-sorted
	printh "Saving video download URLs to file..."
	if [[ -f "${VIDEO_DOWNOAD_URLS_FILE}" ]]; then
		printn "Backing up existing '${VIDEO_DOWNOAD_URLS_FILE}'..."
		cp "${VIDEO_DOWNOAD_URLS_FILE}" "${VIDEO_DOWNOAD_URLS_FILE}.old"
	fi
	printn "Writing '${VIDEO_DOWNOAD_URLS_FILE}'..."
	for episode_part in "${!video_download_urls[@]}"; do
		echo "${episode_part}: ${video_download_urls["${episode_part}"]}"
	done | sort -V | uniq > "${VIDEO_DOWNOAD_URLS_FILE}"