Skip to content

Instantly share code, notes, and snippets.

@nabijaczleweli
Created January 18, 2018 17:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nabijaczleweli/1147dfd931be77e776cc386b6d361d35 to your computer and use it in GitHub Desktop.
Save nabijaczleweli/1147dfd931be77e776cc386b6d361d35 to your computer and use it in GitHub Desktop.
Chapterwise scraper for archiveofourown, w/image support and normalisation.
#!/bin/bash
# Choose the directory that caches downloaded pages between runs.
# TEMP and TMP keep their original priority; TMPDIR and /tmp are fallbacks for
# systems that define neither of them.
tempdir=""
for tmp_base in "$TEMP" "$TMP" "$TMPDIR"; do
  if [[ -n "$tmp_base" ]]; then
    tempdir="$tmp_base/archiveofourown-scrapper"
    break
  fi
done
unset tmp_base
if [[ -z "$tempdir" && -d /tmp ]]; then
  tempdir="/tmp/archiveofourown-scrapper"
fi
if [[ -z "$tempdir" ]]; then
  # Diagnostics belong on stderr (">&2"), not in a file called "1".
  echo "Couldn't find temporary directory" >&2
  exit 1
fi
# Nothing below can work without the cache directory, so stop if it can't be made.
mkdir -p "$tempdir" || exit 1
# Print the numeric work id contained in $1.
#   $1 - a bare id ("12345") or any URL containing "/works/<id>"
# Prints nothing when no id can be found, so callers can detect bad input
# instead of receiving a half-substituted URL.
extract_work_id() {
  local work_ref=$1
  if [[ "$work_ref" =~ ^([0-9]+) ]] || [[ "$work_ref" =~ /works/([0-9]+) ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  fi
}

raw_url="$1"
work_id="$(extract_work_id "$raw_url")"
# Chapter index page of the work, its cached copy, and the list of lines that
# contain images collected while the chapters are processed.
nav_url="https://archiveofourown.org/works/$work_id/navigate"
nav_out="$tempdir/nav-$work_id.html"
image_list="$tempdir/nav-$work_id-images"
# Fetch the chapter index of the work unless a usable cached copy exists.
# Refuse to touch the network when the work id is not a plain number.
if [[ ! "$work_id" =~ ^[0-9]+$ ]]; then
  echo "Couldn't determine work ID from '$raw_url'" >&2
  exit 1
fi
# -s rather than -f: an empty file left by an interrupted run is not a cache hit.
if [[ ! -s "$nav_out" ]]; then
  echo "Downloading chapter index from $nav_url to $nav_out"
  # -f makes curl fail on HTTP errors instead of saving the error page, and the
  # download goes to a ".part" file so a failure never becomes a cached index.
  if curl -fSL --compressed "$nav_url" -o "$nav_out.part" \
    || { echo "Retrying w/o compression" && curl -fSL "$nav_url" -o "$nav_out.part"; }; then
    mv -f -- "$nav_out.part" "$nav_out"
  else
    rm -f -- "$nav_out.part"
    echo "Failed to download chapter index from $nav_url" >&2
    exit 1
  fi
else
  echo "Using cached chapter index in $nav_out"
fi
# Print one field taken from the line of a saved navigate page that links to
# "/works/<id>".
#   $1 - numeric work id
#   $2 - path to the saved navigate page
#   $3 - "title"  (text of the link whose target ends in <id>) or
#        "author" (text of the rel="author" link)
# Only the first matching line is used, and lines where the field pattern does
# not match are skipped rather than printed verbatim.
# Returns 1 without output for a non-numeric id, an unreadable file or an
# unknown field name.
nav_heading_field() {
  local id=$1 file=$2 field=$3 pattern
  [[ "$id" =~ ^[0-9]+$ && -r "$file" ]] || return 1
  case "$field" in
    title) pattern='.*'"$id"'">\([^<][^<]*\)</a>.*' ;;
    author) pattern='.*<a rel="author" [^>]*>\([^<][^<]*\)</a>.*' ;;
    *) return 1 ;;
  esac
  # The id was checked to be digits only, so it is safe inside the sed script.
  sed -n '\;"/works/'"$id"'";{ s;'"$pattern"';\1;p; }' "$file" | head -n 1
}

title="$(nav_heading_field "$work_id" "$nav_out" title)"
# Variant of the title without double quotes, for use inside a quoted comment.
title_noquote="${title//\"/ʹ}"
author="$(nav_heading_field "$work_id" "$nav_out" author)"
# Print the output file name for one chapter: "<title> ch<NNN>.html".
#   $1 - work title
#   $2 - chapter number (unpadded)
# A "/" in the title would be read as a directory separator and make the
# output redirect fail, so it is replaced with "_".
chapter_filename() {
  local work_title=$1 number=$2
  printf '%s ch%03d.html\n' "${work_title//\//_}" "$number"
}

# Start a fresh image list; the per-chapter awk program appends to it.
echo > "$image_list"
chapter_num=1
# Walk the chapter list of the navigate page: with "/" as separator the 5th
# field of a chapter link line starts with the chapter id.
awk -F '/' '
  /<ol class="chapter index group" role="navigation">/,/<\/ol>/ {
    if($0 !~ /<ol class="chapter index group" role="navigation">/ && $0 !~ /<\/ol>/)
      print(gensub(/".*/, "", "g", $5));
  }
' < "$nav_out" | while read -r chapter_id; do
  # Lines inside the list that carry no chapter link yield an empty or
  # non-numeric id; skipping them keeps the chapter numbering correct.
  [[ "$chapter_id" =~ ^[0-9]+$ ]] || continue

  chapter_num_wide="$(printf "%03d" "$chapter_num")"
  chapter_url="https://archiveofourown.org/works/$work_id/chapters/$chapter_id?view_adult=true"
  chapter_out="$tempdir/ch-$work_id-$chapter_num_wide-$chapter_id.html"
  chapter_final_out="$(chapter_filename "$title" "$chapter_num")"

  # -s rather than -f: an empty file from an interrupted run is not a cache hit.
  if [[ ! -s "$chapter_out" ]]; then
    echo "Downloading chapter $chapter_num from $chapter_url to $chapter_out"
    # -f rejects HTTP error pages; the ".part" file keeps failures out of the cache.
    if curl -fSL --compressed "$chapter_url" -o "$chapter_out.part" \
      || { echo "Retrying w/o compression" && curl -fSL "$chapter_url" -o "$chapter_out.part"; }; then
      mv -f -- "$chapter_out.part" "$chapter_out"
    else
      rm -f -- "$chapter_out.part"
      echo "Failed to download chapter $chapter_num from $chapter_url" >&2
      # Keep the numbering of the remaining chapters stable.
      ((++chapter_num))
      continue
    fi
  else
    echo "Using cached chapter $chapter_num in $chapter_out"
  fi

  # Wrap the extracted chapter markup in a minimal standalone HTML document.
  {
    echo '<!DOCTYPE html>'
    echo '<html lang="en">'
    echo ' <head>'
    echo ' <meta charset="utf-8" />'
    echo ' <title>'"$title"' - Chapter '"$chapter_num"'</title>'
    echo ' <meta name="author" content="'"$author"'" />'
    echo ' </head>'
    echo ' <body>'
    echo ' <!-- ePub title: "'"$title_noquote"' - Chapter '"$chapter_num"'" -->'
    echo
    # The awk program below:
    #  - opens a link to the work at the first title heading and closes it at
    #    the first </h2> seen after that;
    #  - prints everything between the "BEGIN work skin" and "END work skin"
    #    comments, making site-relative hrefs absolute;
    #  - records lines with remote images in the image list and rewrites their
    #    src to the part after the last "/" so they resolve to local files.
    awk '
      BEGIN {
        title_open=0;
        title_close=0;
      }
      /<h2 class="title heading">/ {
        if(!title_open) {
          print(" <a href=\"https://archiveofourown.org/works/'"$work_id"'\">");
          title_open=1;
        }
      }
      /<!--.*BEGIN.*work skin.*-->/,/<!--.*END.*work skin.*-->/ {
        result=gensub(/href="\//, "href=\"https://archiveofourown.org/", "g");
        if(result ~ /(<img.*)src="http(s?):\/\/([^"/]+\/)+(.*".*\/>)/)
          print(result) >> "'"$image_list"'";
        while(result ~ /(<img.*)src="http(s?):\/\/([^"/]+\/)+(.*".*\/>)/)
          result = gensub(/(<img.*)src="http(s?):\/\/([^"/]+\/)+(.*".*\/>)/, "\\1 src=\"\\4", "g", result);
        print(result);
      }
      /<\/h2>/ {
        if(title_open && !title_close) {
          print(" </a>");
          title_close=1;
        }
      }
    ' "$chapter_out" | html-beautify -qm 3 -te "\n" -n
    echo
    echo ' </body>'
    echo '</html>'
  } > "$chapter_final_out"

  ((++chapter_num))
done
# Print the local file name for an image URL: everything after the last "/".
#   $1 - image URL
image_filename() {
  local url=$1
  printf '%s\n' "${url##*/}"
}

# Download every image recorded in the image list into the current directory.
# The list holds whole lines of markup: put each tag on its own line, then
# reduce each <img> tag to its whitespace-stripped src URL.
sed 's;<;\n<;g' "$image_list" | awk '/img/ {print(gensub(/[[:space:] ]/, "", "g", gensub(/<img.*src="(http(s?):\/\/([^"/]+\/)+.*\.[[:alnum:]]*)".*\/>/, "\\1", "g")));}' | while read -r image_url; do
  # Tags the pattern did not match come through as squashed markup, not as a
  # URL; don't hand those to curl.
  [[ "$image_url" =~ ^https?:// ]] || continue
  image_out="$(image_filename "$image_url")"
  # A URL ending in "/" has no file name to save under.
  [[ -n "$image_out" ]] || continue

  # -s rather than -f: an empty file from an interrupted run is not a cache hit.
  if [[ ! -s "$image_out" ]]; then
    echo "Downloading image from $image_url to $image_out"
    # -f rejects HTTP error pages so they are never stored as the image; the
    # ".part" file keeps a failed transfer from looking like a cached image.
    if curl -fSL --compressed "$image_url" -o "$image_out.part" \
      || { echo "Retrying w/o compression" && curl -fSL "$image_url" -o "$image_out.part"; }; then
      mv -f -- "$image_out.part" "$image_out"
    else
      rm -f -- "$image_out.part"
      echo "Failed to download image from $image_url" >&2
    fi
  else
    echo "Using cached image in $image_out"
  fi
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment