Cinemassacre.com Angry Video Game Nerd (AVGN) Episode Scraper
#!/usr/bin/env bash
# -------------------------------------------------------------------------
#                                                                         -
# Cinemassacre.com Angry Video Game Nerd (AVGN) Episode Scraper           -
#                                                                         -
# Created by Fonic <https://github.com/fonic>                             -
# Date: 02/18/23 - 02/19/23                                               -
#                                                                         -
# Why use cinemassacre.com instead of Youtube?:                           -
# 1) cinemassacre.com is THE reference source for AVGN episodes           -
# 2) video titles on cinemassacre.com contain episode numbers             -
#                                                                         -
# Caveats:                                                                -
# Most videos listed on cinemassacre.com are stored on odysee.com.        -
# Downloads from there are slow and unstable. Thus, one might be          -
# better off downloading videos from Youtube instead and using only       -
# the episode and title information provided by this scraper.             -
#                                                                         -
# -------------------------------------------------------------------------
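
# Example post-processing (untested suggestion; assumes yt-dlp has extractor
# support for the linked hosts): strip the 'E<num>:' keys from the download
# URL list written by this script and feed the bare URLs to the downloader:
#   cut -d' ' -f2- video-download-urls.txt | yt-dlp --batch-file -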

# --------------------------------------
# Globals                              -
# --------------------------------------

CURL_OPTS=("--fail" "--location" "--silent" "--show-error" "--retry" "2" "--connect-timeout" "60")
LIST_PAGE_URL="https://cinemassacre.com/category/angry-video-game-nerd/page/%d/" # also works for page 1, which redirects to the URL without trailing '/page/x/'
LIST_PAGE_COUNT=6
VIDEO_PAGE_URL_RE='<a href="([^"]+)" class="video-title">([^<]+)</a>' # video URL, video title
VIDEO_TITLE_EPISODE_PART_RE="Episode ([0-9]+)( Part ([0-9]+))?" # part is optional (part number will be in BASH_REMATCH[3]!)
VIDEO_PAGE_STOP_MARKER='<div class="search-box-wrapper">' # stop parsing here to skip unrelated videos at bottom of listing page
VIDEO_PAGE_URLS_FILE="video-page-urls.txt"
VIDEO_TITLES_FILE="video-titles.txt"
VIDEO_DOWNLOAD_URL_RE='<div class="posts-video"><iframe.*src="([^"]+)".*</iframe>'
VIDEO_DOWNLOAD_URLS_FILE="video-download-urls.txt"

# --------------------------------------
# Functions                            -
# --------------------------------------
# Print normal/hilite/good/warn/error message [$*: message]
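# (colors: ANSI SGR escapes; '1' bold, '32' green, '33' yellow, '31' red; warnings and errors go to stderr)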
function printn() { echo -e "$*"; }
function printh() { echo -e "\e[1m$*\e[0m"; }
function printg() { echo -e "\e[1;32m$*\e[0m"; }
function printw() { echo -e "\e[1;33m$*\e[0m" >&2; }
function printe() { echo -e "\e[1;31m$*\e[0m" >&2; }

# --------------------------------------
# Main                                 -
# --------------------------------------

# Set up error handling
set -ue; trap "printe \"Error: an unhandled error occurred on line \${LINENO}, aborting.\"; exit 1" ERR
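# NOTE: 'set -e' aborts on any failing command, 'set -u' on unset variables;
# \${LINENO} is escaped so it expands only when the ERR trap fires, reporting
# the offending line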

# Set up storages
declare -A video_page_urls=()
declare -A video_titles=()
declare -A video_download_urls=()

# Process video listing pages and extract video page URLs (both on cinemassacre.com)
for ((list_page_index=1; list_page_index <= ${LIST_PAGE_COUNT}; list_page_index++)); do
    printh "Processing video listing page ${list_page_index}:"
    printf -v list_page_url "${LIST_PAGE_URL}" ${list_page_index}
    printn "Video listing page '${list_page_url}'..."

    # Parse listing page, extract video page URLs
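    # NOTE: process substitution ('done < <(curl ...)') is used instead of a
    # pipe so the loop runs in the current shell and the arrays filled here
    # remain visible afterwards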
    url_title_count=0
    while read -r line; do
        if [[ "${line}" =~ ${VIDEO_PAGE_URL_RE} ]]; then
            video_page_url="${BASH_REMATCH[1]}"
            video_title="${BASH_REMATCH[2]}"

            # Determine episode and part
            if [[ "${video_title}" =~ ${VIDEO_TITLE_EPISODE_PART_RE} ]]; then
                if [[ -z "${BASH_REMATCH[3]}" ]]; then # got part?
                    episode_part="E${BASH_REMATCH[1]}" # episode only
                else
                    episode_part="E${BASH_REMATCH[1]}_P${BASH_REMATCH[3]}" # episode + part
                fi
            else
                printw "No episode/part match for title '${video_title}'"
                episode_part="E???"
            fi

            # Store video URL and video title
            video_page_urls["${episode_part}"]="${video_page_url}"
            video_titles["${episode_part}"]="${video_title}"
            url_title_count=$((url_title_count + 1))
            #printd "[DEBUG] Video URL: '${video_page_url}', video title: '${video_title}'"
        elif [[ "${line}" == "${VIDEO_PAGE_STOP_MARKER}" ]]; then
            break # stop parsing
        #else
        #    printd "[DEBUG] No match for line: '${line}'"
        fi
    done < <(curl "${CURL_OPTS[@]}" "${list_page_url}")
printn "Got ${url_title_count} video page URLs and video titles from video listing page." | |
done | |
printn | |
printg "Got ${#video_page_urls[@]} video page URLs and video titles from video listing pages in total." | |
printn | |
# Save video page URLs and video titles to file | |
printh "Saving video page URLs and video titles to files:" | |
if [[ -f "${VIDEO_PAGE_URLS_FILE}" ]]; then | |
printn "Backing up existing '${VIDEO_PAGE_URLS_FILE}'..." | |
cp "${VIDEO_PAGE_URLS_FILE}" "${VIDEO_PAGE_URLS_FILE}.old" | |
fi | |
printn "Writing '${VIDEO_PAGE_URLS_FILE}'..." | |
for episode_part in "${!video_page_urls[@]}"; do
    echo "${episode_part}: ${video_page_urls["${episode_part}"]}"
done | sort -V | uniq > "${VIDEO_PAGE_URLS_FILE}"
if [[ -f "${VIDEO_TITLES_FILE}" ]]; then
    printn "Backing up existing '${VIDEO_TITLES_FILE}'..."
    cp "${VIDEO_TITLES_FILE}" "${VIDEO_TITLES_FILE}.old"
fi
printn "Writing '${VIDEO_TITLES_FILE}'..."
for episode_part in "${!video_titles[@]}"; do
    video_title="${video_titles["${episode_part}"]}"
    # Replace HTML entities (WordPress emits these in raw page titles)
    video_title="${video_title//"&#038;"/"&"}"   # &#038;  == '&' -> '&'
    #video_title="${video_title//"&#8211;"/":"}" # &#8211; == ':' -> ':'
    video_title="${video_title//"&#8211;"/"-"}"  # &#8211; == ':' -> '-' (better)
    video_title="${video_title//"&#8217;"/"'"}"  # &#8217; == "'" -> "'"
    video_title="${video_title//"&#8220;"/"'"}"  # &#8220; == left double quote -> "'"
    video_title="${video_title//"&#8221;"/"'"}"  # &#8221; == right double quote -> "'"
    #video_title="${video_title//"&;"/""}" # &; == '' -> ''
    echo "${episode_part}: ${video_title}"
done | sort -V | uniq > "${VIDEO_TITLES_FILE}"
printn

# Process video pages (on cinemassacre.com) to extract video download URLs (videos are stored on odysee.com, lbry.tv, and Youtube)
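# NOTE: if a page yields more than one matching iframe, later matches
# overwrite earlier ones in the array; the count check below flags pages
# with zero or multiple matches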
printh "Processing video pages to extract video download URLs..." | |
for episode_part in "${!video_page_urls[@]}"; do | |
video_page_url="${video_page_urls["${episode_part}"]}" | |
printn "Video page '${video_page_url}'..." | |
url_count=0 | |
while read -r line; do | |
if [[ "${line}" =~ ${VIDEO_DOWNLOAD_URL_RE} ]]; then | |
video_download_url="${BASH_REMATCH[1]}" | |
video_download_urls["${episode_part}"]="${video_download_url}" | |
url_count=$((url_count + 1)) | |
#printd "[DEBUG] Video download URL: '${video_download_url}'" | |
#else | |
# printd "[DEBUG] No match for line: '${line}'" | |
fi | |
done < <(curl "${CURL_OPTS[@]}" "${video_page_url}") | |
if (( ${url_count} != 1 )); then # could be 0 or more than 1 | |
printw "Got ${url_count} video download URLs from video page." | |
fi | |
done | |
printn
printg "Got ${#video_download_urls[@]} video download URLs from video pages in total."
printn

# Save video download URLs to file
printh "Saving video download URLs to file..."
if [[ -f "${VIDEO_DOWNLOAD_URLS_FILE}" ]]; then
    printn "Backing up existing '${VIDEO_DOWNLOAD_URLS_FILE}'..."
    cp "${VIDEO_DOWNLOAD_URLS_FILE}" "${VIDEO_DOWNLOAD_URLS_FILE}.old"
fi
printn "Writing '${VIDEO_DOWNLOAD_URLS_FILE}'..."
for episode_part in "${!video_download_urls[@]}"; do
    echo "${episode_part}: ${video_download_urls["${episode_part}"]}"
done | sort -V | uniq > "${VIDEO_DOWNLOAD_URLS_FILE}"