Cinemassacre.com Angry Video Game Nerd (AVGN) Episode Scraper
#!/usr/bin/env bash
# -------------------------------------------------------------------------
#
#  Cinemassacre.com Angry Video Game Nerd (AVGN) Episode Scraper
#
#  Created by Fonic <https://github.com/fonic>
#  Date: 02/18/23 - 02/19/23
#
#  Why use cinemassacre.com instead of Youtube?
#  1) cinemassacre.com is THE reference source for AVGN episodes
#  2) video titles on cinemassacre.com contain episode numbers
#
#  Caveats:
#  Most videos listed on cinemassacre.com are stored on odysee.com.
#  Downloads from there are slow and unstable, so one might be better
#  off downloading the videos from Youtube instead and only using the
#  episode and title information gathered by this scraper (see the
#  yt-dlp sketch below).
#
# -------------------------------------------------------------------------
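# Example of the Youtube route mentioned above (a sketch, not part of this
# script): use the episode/title map written to video-titles.txt to search
# and download via yt-dlp. The 'ytsearch1:' query and the assumption that
# the first search hit is the right video are unverified, so check results:
#   while IFS=': ' read -r episode_part title; do
#       yt-dlp --output "${episode_part} - ${title}.%(ext)s" "ytsearch1:AVGN ${title}"
#   done < video-titles.txt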
# --------------------------------------
# Globals
# --------------------------------------
CURL_OPTS=("--fail" "--location" "--silent" "--show-error" "--retry" "2" "--connect-timeout" "60")
LIST_PAGE_URL="https://cinemassacre.com/category/angry-video-game-nerd/page/%d/" # also works for page 1 which redirects to URL without trailing '/page/x/'
LIST_PAGE_COUNT=6
VIDEO_PAGE_URL_RE='<a href="([^"]+)" class="video-title">([^<]+)</a>' # video URL, video title
VIDEO_TITLE_EPISODE_PART_RE="Episode ([0-9]+)( Part ([0-9]+))?" # part is optional (part number will be in BASH_REMATCH[3]!)
VIDEO_PAGE_STOP_MARKER='<div class="search-box-wrapper">' # stop parsing here to skip unrelated videos at bottom of listing page
VIDEO_PAGE_URLS_FILE="video-page-urls.txt"
VIDEO_TITLES_FILE="video-titles.txt"
VIDEO_DOWNLOAD_URL_RE='<div class="posts-video"><iframe.*src="([^"]+)".*</iframe>'
VIDEO_DOWNLOAD_URLS_FILE="video-download-urls.txt"
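# Note: all three output files share the line format "E<episode>[_P<part>]: <value>";
# titles without a recognizable episode number are filed under the key "E???"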
# --------------------------------------
# Functions
# --------------------------------------
# Print normal/hilite/good/warn/error message [$*: message]
function printn() { echo -e "$*"; }
function printh() { echo -e "\e[1m$*\e[0m"; }
function printg() { echo -e "\e[1;32m$*\e[0m"; }
function printw() { echo -e "\e[1;33m$*\e[0m" >&2; }
function printe() { echo -e "\e[1;31m$*\e[0m" >&2; }
# --------------------------------------
# Main
# --------------------------------------
# Set up error handling
set -ue; trap "printe \"Error: an unhandled error occurred on line \${LINENO}, aborting.\"; exit 1" ERR
# Set up storages
declare -A video_page_urls=()
declare -A video_titles=()
declare -A video_download_urls=()
# Process video listing pages and extract video page URLs (both on cinemassacre.com)
for ((list_page_index=1; list_page_index <= ${LIST_PAGE_COUNT}; list_page_index++)); do
    printh "Processing video listing page ${list_page_index}:"
    printf -v list_page_url "${LIST_PAGE_URL}" "${list_page_index}"
    printn "Video listing page '${list_page_url}'..."
    # Parse listing page, extract video page URLs
    url_title_count=0
    while read -r line; do
        if [[ "${line}" =~ ${VIDEO_PAGE_URL_RE} ]]; then
            video_page_url="${BASH_REMATCH[1]}"
            video_title="${BASH_REMATCH[2]}"
            # Determine episode and part
            if [[ "${video_title}" =~ ${VIDEO_TITLE_EPISODE_PART_RE} ]]; then
                if [[ -z "${BASH_REMATCH[3]}" ]]; then # part absent?
                    episode_part="E${BASH_REMATCH[1]}" # episode only
                else
                    episode_part="E${BASH_REMATCH[1]}_P${BASH_REMATCH[3]}" # episode + part
                fi
            else
                printw "No episode/part match for title '${video_title}'"
                episode_part="E???" # note: multiple unmatched titles share this key and overwrite each other
            fi
            # Store video URL and video title
            video_page_urls["${episode_part}"]="${video_page_url}"
            video_titles["${episode_part}"]="${video_title}"
            url_title_count=$((url_title_count + 1))
            #printd "[DEBUG] Video URL: '${video_page_url}', video title: '${video_title}'"
        elif [[ "${line}" == "${VIDEO_PAGE_STOP_MARKER}" ]]; then
            break # stop parsing
        #else
        #    printd "[DEBUG] No match for line: '${line}'"
        fi
    done < <(curl "${CURL_OPTS[@]}" "${list_page_url}")
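    # (process substitution instead of a pipe keeps the while loop in the
    # current shell, so the arrays populated above persist past the loop)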
printn "Got ${url_title_count} video page URLs and video titles from video listing page."
done
printn
printg "Got ${#video_page_urls[@]} video page URLs and video titles from video listing pages in total."
printn
# Save video page URLs and video titles to file
printh "Saving video page URLs and video titles to files:"
if [[ -f "${VIDEO_PAGE_URLS_FILE}" ]]; then
printn "Backing up existing '${VIDEO_PAGE_URLS_FILE}'..."
cp "${VIDEO_PAGE_URLS_FILE}" "${VIDEO_PAGE_URLS_FILE}.old"
fi
printn "Writing '${VIDEO_PAGE_URLS_FILE}'..."
for episode_part in "${!video_page_urls[@]}"; do
echo "${episode_part}: ${video_page_urls["${episode_part}"]}"
done | sort -V | uniq > "${VIDEO_PAGE_URLS_FILE}"
if [[ -f "${VIDEO_TITLES_FILE}" ]]; then
printn "Backing up existing '${VIDEO_TITLES_FILE}'..."
cp "${VIDEO_TITLES_FILE}" "${VIDEO_TITLES_FILE}.old"
fi
printn "Writing '${VIDEO_TITLES_FILE}'..."
for episode_part in "${!video_titles[@]}"; do
video_title="${video_titles["${episode_part}"]}"
video_title="${video_title//"&#038;"/"&"}" # &#038; == '&' -> '&'
#video_title="${video_title//"&#8211;"/":"}" # &#8211; == ':' -> ':'
video_title="${video_title//"&#8211;"/"-"}" # &#8211; == ':' -> '-' (better)
video_title="${video_title//"&#8217;"/"'"}" # &#8217; == "'" -> "'"
video_title="${video_title//"&#8220;"/"'"}" # &#8220; == left double quote -> "'"
video_title="${video_title//"&#8221;"/"'"}" # &#8221; == right double quote -> "'"
#video_title="${video_title//"&;"/""}" # &; == '' -> ''
echo "${episode_part}: ${video_title}"
done | sort -V | uniq > "${VIDEO_TITLES_FILE}"
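# Note: the replacements above only cover entities actually observed in the
# titles; a general alternative (assuming python3 is available) would be:
#   video_title="$(python3 -c 'import html,sys; print(html.unescape(sys.argv[1]))' "${video_title}")"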
printn
# Process video pages (on cinemassacre.com) to extract video download URLs (videos are stored on odysee.com, lbry.tv, and Youtube)
printh "Processing video pages to extract video download URLs..."
for episode_part in "${!video_page_urls[@]}"; do
video_page_url="${video_page_urls["${episode_part}"]}"
printn "Video page '${video_page_url}'..."
url_count=0
while read -r line; do
if [[ "${line}" =~ ${VIDEO_DOWNLOAD_URL_RE} ]]; then
video_download_url="${BASH_REMATCH[1]}"
video_download_urls["${episode_part}"]="${video_download_url}"
url_count=$((url_count + 1))
#printd "[DEBUG] Video download URL: '${video_download_url}'"
#else
# printd "[DEBUG] No match for line: '${line}'"
fi
done < <(curl "${CURL_OPTS[@]}" "${video_page_url}")
if (( ${url_count} != 1 )); then # could be 0 or more than 1
printw "Got ${url_count} video download URLs from video page."
fi
done
printn
printg "Got ${#video_download_urls[@]} video download URLs from video pages in total."
printn
# Save video download URLs to file
printh "Saving video download URLs to file..."
if [[ -f "${VIDEO_DOWNOAD_URLS_FILE}" ]]; then
printn "Backing up existing '${VIDEO_DOWNOAD_URLS_FILE}'..."
cp "${VIDEO_DOWNOAD_URLS_FILE}" "${VIDEO_DOWNOAD_URLS_FILE}.old"
fi
printn "Writing '${VIDEO_DOWNOAD_URLS_FILE}'..."
for episode_part in "${!video_download_urls[@]}"; do
echo "${episode_part}: ${video_download_urls["${episode_part}"]}"
done | sort -V | uniq > "${VIDEO_DOWNOAD_URLS_FILE}"
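# Follow-up step (a sketch, not part of this script): feeding the extracted
# URLs to a downloader. Whether yt-dlp handles the odysee.com/lbry.tv embed
# URLs collected above is an assumption and may require adjustments:
#   while IFS=': ' read -r episode_part url; do
#       yt-dlp --output "${episode_part}.%(ext)s" "${url}"
#   done < video-download-urls.txt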