Last active
January 12, 2021 23:40
-
-
Save troyp/4f25a1df892f465227b3f540e5d89618 to your computer and use it in GitHub Desktop.
One Piece scripts for scraping and caching anime and manga data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
onepiece-episode-title: Get an episode title in English, romaji and/or Japanese, or print a title according to a format string | |
prerequisites: | |
gnu tools: bash, sed, tr | |
curl | |
pup (https://github.com/ericchiang/pup) | |
xmlstarlet (unesc) | |
onepiece-episode-chapters: List the manga chapters adapted by an episode | |
prerequisites: | |
gnu tools: bash, sed, tr | |
curl | |
perl | |
pup (https://github.com/ericchiang/pup) (needed for --chapters-only) | |
HTML-XML-utils | |
html2text | |
onepiece-list-volume: List the manga chapters in a tankōbon volume | |
prerequisites: | |
gnu tools: bash, sed, tr | |
curl | |
HTML-XML-utils | |
html2text | |
onepiece-find-volume: Find the tankōbon volume containing a manga chapter | |
prerequisites: | |
gnu tools: bash, sed, tr | |
curl | |
HTML-XML-utils | |
html2text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash | |
# ,------, | |
# | Help | | |
# '------' | |
# Print the usage text and stop when the first argument is -h or --help.
# The here-doc delimiter is quoted so that the text names
# $ONEPIECE_EPISODE_DATA literally; with an unquoted delimiter the variable
# was expanded (to nothing when unset), as the sibling onepiece-episode-title
# script already avoids.
if [[ "$1" =~ ^-h$|^--help$ ]]; then
  cat <<'EOF'
Usage: onepiece-episode-chapters [OPTION...] N
Print manga chapters corresponding to a One Piece anime episode
Data scraped from the One Piece Wiki at https://onepiece.fandom.com/wiki/
Options:
-h, --help show help
-b, --bypass-cache ignore cache files in $ONEPIECE_EPISODE_DATA
-c, --generate-cache arguments: [FIRST] LAST
generate cache files in $ONEPIECE_EPISODE_DATA
-C, --generate-cache-here arguments: [FIRST] LAST
generate cache files in current directory
-o, --chapters-only list chapters one to a line
-v, --verbose show stderror output
EOF
  exit 0
fi
# ------------------------------------------------------------------------------- | |
# ,-----------------, | |
# | Options (other) | | |
# '-----------------' | |
# Option state; an empty string means "not requested".
bypass=''   # -b: do not read existing cache files
cache=''    # -c => 't' (write cache under $ONEPIECE_EPISODE_DATA), -C => 'here'
maxep=''    # not referenced anywhere else in the visible script
only=''     # -o: print bare chapter numbers
verbose=''  # -v: let tool diagnostics through on stderr

# The loop stops when one argument is left, so the final argument (the
# episode number, or LAST for -c/-C) is never parsed as an option.
while (( $# > 1 )); do
  case "$1" in
    -h | --help)
      # help is only honoured as the very first argument; skip it here
      shift
      ;;
    -b | --bypass-cache)
      bypass='t'
      shift
      ;;
    -c | --generate-cache)
      cache='t'
      shift
      break # everything that follows is [FIRST] LAST
      ;;
    -C | --generate-cache-here)
      cache='here'
      shift
      break # everything that follows is [FIRST] LAST
      ;;
    -o | --chapters-only)
      only='t'
      shift
      ;;
    -v | --verbose)
      verbose='t'
      shift
      ;;
    *)
      # `return` is only valid inside a function or a sourced file; at the
      # top level of a script it fails without terminating, and since the
      # argument was never shifted the loop spun forever.  Exit instead, and
      # report on stderr so the message cannot be taken for chapter data.
      echo "unrecognized option: $1" >&2
      exit 1
      ;;
  esac
done
# --- end options --- | |
# ------------------------------------------------------------------------------- | |
# ,--------, | |
# | STDERR | | |
# '--------' | |
# File descriptor 3 is where the scraping pipelines send their diagnostics:
# it is the real stderr when --verbose was given and /dev/null otherwise.
case "$verbose" in
  '') exec 3>/dev/null ;;
  *)  exec 3>&2 ;;
esac
# ------------------------------------------------------------------------------- | |
# ,-----------, | |
# | Functions | | |
# '-----------' | |
# Print the manga chapters adapted by one anime episode.
# Globals:   only (read) - when non-empty, print the chapter link targets
#            instead of the formatted text
# Arguments: $1 - episode number
# Outputs:   the chapter information on stdout; tool diagnostics go to fd 3,
#            which must already be open
get_episode_result() {
  local url="https://onepiece.fandom.com/wiki/Episode_$1"
  local page section selector

  # Download and tidy the page once; both the probe and the extraction below
  # work on this copy (the page used to be downloaded for each of them).
  page=$(curl "$url" 2>&3 | hxclean 2>&3)

  # The chapter list is in the 7th infobox section when the Season | Piece
  # table is present at the bottom of "Japanese Information", and in the 6th
  # section when that table is missing.
  if printf '%s\n' "$page" |
    hxselect -c "#mw-content-text>aside>section:nth-of-type(4)>table" 2>/dev/null |
    grep -i season >/dev/null 2>&1; then
    section=7
  else
    section=6
  fi
  selector="#mw-content-text>aside>section:nth-of-type(${section})>div:first-of-type>div"

  if [[ -n $only ]]; then
    printf '%s\n' "$page" |
      hxselect -c "$selector" |
      pup "a attr{href}"
  else
    # /g is required for the alternation to trim both ends; without it only
    # the first match (leading OR trailing whitespace) was removed.
    printf '%s\n' "$page" |
      hxselect -c "$selector" |
      html2text --ignore-links |
      perl -0777 -pe 's/^\s+|\s+$//g'
  fi 2>&3
}
# Write one cache file per episode into the current directory, then exit.
# Arguments: [FIRST] LAST - passed straight to seq
# Outputs:   files named ep<N>-chapters
# Each file is truncated before writing: appending (>>) made a second run
# duplicate the contents of every existing cache file.
generate_cache() {
  local ep
  for ep in $(seq "$@"); do
    get_episode_result "$ep" > "ep$ep-chapters"
  done
  exit 0
}
# ------------------------------------------------------------------------------- | |
# ,---------------, | |
# | Retrieve data | | |
# '---------------' | |
# Cache generation needs no single result, so it is handled first and nothing
# is fetched for it here (previously episode FIRST was downloaded and the
# result thrown away before the cache loop even started).
if [[ -n $cache ]]; then
  if [[ $cache != 'here' ]]; then
    # Stop if the cache directory cannot be entered; otherwise the cache
    # files would silently be written to the current directory.
    cd "$ONEPIECE_EPISODE_DATA" || exit 1
  fi
  generate_cache "$@"
  exit 0
fi

# Get the data from cache if:
# (1) --bypass wasn't specified
# (2) the cache directory was specified (with $ONEPIECE_EPISODE_DATA)
# (3) the cache file exists in that directory
# The file is looked up inside $ONEPIECE_EPISODE_DATA, which is where
# --generate-cache writes it; it used to be looked up in the current directory.
cachefile="$ONEPIECE_EPISODE_DATA/ep$1-chapters"
if [[ -z $bypass && -n $ONEPIECE_EPISODE_DATA && -f $cachefile ]]; then
  result=$(cat "$cachefile")
else
  result=$(get_episode_result "$1")
fi

# -------------------------------------------------------------------------------
# ,------------------,
# | Main conditional |
# '------------------'
if [[ -n $only ]]; then
  # output chapters only: reduce each /wiki/Chapter_N link target to N
  # echo "$result" | perl -ne 'while ($_ =~ /(?<![-.])\b(\d+)\b(?![-,])/g) {print "$1\n"}';
  echo "$result" | perl -0777 -ple 's/\/wiki\/Chapter_([0-9]+)/$1/g'
else
  # output full result
  echo "$result"
fi
exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash | |
# ,------, | |
# | Help | | |
# '------' | |
# Usage text: printed, followed by a successful exit, only when the first
# argument is exactly -h or --help.  The quoted delimiter keeps the $-names
# in the text literal.
case "$1" in
  -h | --help)
    cat <<'EOF'
Usage: onepiece-episode-title [OPTION...] N
Print the title of a One Piece anime episode
Data scraped from the One Piece Wiki at http://onepiece.wikia.com/wiki/
Crunchyroll translated titles scraped from http://www.crunchyroll.com/one-piece
Options:
-h, --help show help
-b, --bypass-cache ignore cache files in $ONEPIECE_EPISODE_DATA
-c, --generate-cache arguments: [FIRST] LAST
generate cache files in $ONEPIECE_EPISODE_DATA
-C, --generate-cache-here arguments: [FIRST] LAST
generate cache files in current directory
-n, --newline print a newline after each output
-N, --number-format format for episode number in --title (printf format string)
-e, --english print english title
-E, --english-crunchy print Crunchyroll english title
-f, --format specify a format for --title (may use $n for ep no.,
$e, $r, $j for english/romaji/japanese title)
-j, --japanese print japanese title
-r, --romaji print romaji title
-t, --title print whole title
-u, --url print Crunchyroll URL
-v, --verbose show stderror output
EOF
    exit 0
    ;;
esac
# ------------------------------------------------------------------------------- | |
# ,-----------------, | |
# | Options (other) | | |
# '-----------------' | |
# Start from a clean slate so that nothing inherited from the environment is
# mistaken for an option.
unset bypass cache english crunchy japanese romaji title format newline numformat epurl verbose

# The loop stops when one argument is left, so the final argument (the
# episode number, or LAST for -c/-C) is never parsed as an option.
while (( $# > 1 )); do
  case "$1" in
    -h | --help)
      # help is only honoured as the very first argument; skip it here
      shift
      ;;
    -b | --bypass-cache)
      bypass='t'
      shift
      ;;
    -c | --generate-cache)
      cache='t'
      shift
      break # everything that follows is [FIRST] LAST
      ;;
    -C | --generate-cache-here)
      cache='here'
      shift
      break # everything that follows is [FIRST] LAST
      ;;
    -e | --english)
      english='t'
      shift
      ;;
    -E | --english-crunchy)
      crunchy='t'
      shift
      ;;
    -f | --format)
      format="$2"
      shift 2
      ;;
    -n | --newline)
      newline='t'
      shift
      ;;
    -N | --number-format)
      numformat="$2"
      shift 2
      ;;
    -r | --romaji)
      romaji='t'
      shift
      ;;
    -j | --japanese)
      japanese='t'
      shift
      ;;
    -t | --title)
      title='t'
      shift
      ;;
    -u | --url)
      epurl='t'
      shift
      ;;
    -v | --verbose)
      verbose='t'
      shift
      ;;
    *)
      # Report on stderr: this script's stdout is the title itself, so a
      # message printed there would be captured as if it were a title.
      echo "unrecognized option: $1" >&2
      exit 1
      ;;
  esac
done
# ------------------------------------------------------------------------------- | |
# ,--------, | |
# | STDERR | | |
# '--------' | |
# Descriptor 3 receives the diagnostics of the scraping tools: the real
# stderr with --verbose, /dev/null without it.
case "$verbose" in
  '') exec 3>/dev/null ;;
  *)  exec 3>&2 ;;
esac
# ------------------------------------------------------------------------------- | |
# ,-----------, | |
# | Functions | | |
# '-----------' | |
# Print one title of an episode.
# Globals:   cache, bypass, ONEPIECE_EPISODE_DATA (read) - cache control
#            newline (read) - when non-empty, a newline follows the title
#            html, html_ep (read/written) - the last wiki page downloaded
#            and the episode it belongs to
# Arguments: $1 - episode number
#            $2 - english | romaji | japanese | crunchy
# Outputs:   the title on stdout; tool diagnostics go to fd 3
get_title() {
  local ep="$1"
  local lang="$2"
  local url="http://onepiece.wikia.com/wiki/Episode_$ep"
  local cachefile="$ONEPIECE_EPISODE_DATA/ep$ep-title-$lang"
  local selector=''

  # Get the data from cache if:
  # (1) we need to return a result (ie. we're not generating cache files)
  # (2) --bypass wasn't specified
  # (3) the cache directory was specified (with $ONEPIECE_EPISODE_DATA)
  # (4) the cache file exists
  if [[ -z $cache && -z $bypass && -n $ONEPIECE_EPISODE_DATA && -f $cachefile ]]; then
    cat "$cachefile"
  elif [[ $lang == crunchy ]]; then
    get_crunchyroll_title "$ep" 2>&3
  else
    # Download the wiki page only when it is really needed (the download used
    # to happen even on a cache hit), and again whenever the episode changes:
    # the saved page was reused for every later episode, so a cache run wrote
    # the first episode's titles into every file.
    if [[ -z $html || $html_ep != "$ep" ]]; then
      html=$(curl "$url" 2>&3)
      html_ep="$ep"
    fi
    case "$lang" in
      english)  selector="aside > h2" ;;
      romaji)   selector="aside > section:nth-of-type(2) > div:nth-of-type(2) > div" ;;
      japanese) selector="aside > section:nth-of-type(2) > div:nth-of-type(1) > div" ;;
    esac
    {
      echo "$html" |
        pup "$selector text{}" |
        xmlstarlet unesc |
        tr '\n' ' ' |
        sed 's/ *$//'
    } 2>&3
  fi
  if [[ -n $newline ]]; then
    echo
  fi
}
# Print the Crunchyroll title of episode $1, without a trailing newline.
# The episode page is located with get_crunchyroll_url; the header line that
# mentions "Episode <number>" is kept and everything up to and including the
# separator after the number is removed.  Diagnostics of every stage go to fd 3.
get_crunchyroll_title() {
  crunchyepurl=$(get_crunchyroll_url "$1")
  curl "$crunchyepurl" 2>&3 |
    pup ".showmedia-header h1 text{}" 2>&3 |
    grep -P 'Episode [0-9]+' 2>&3 |
    sed -r -e 's/^.*Episode [0-9]+[ –-]+//' -e 's/ *$//' 2>&3 |
    tr -d '\n' 2>&3
}
# Print the episode part of every Crunchyroll episode link, one per line
# ("<number>-<rest of the link>"), sorted numerically without duplicates.
# The series page is downloaded only if $crunchyhtml is still empty.
# Diagnostics go to fd 3.
get_crunchyroll_url_list() {
  crunchyurl="http://www.crunchyroll.com/one-piece"
  if [[ -z "$crunchyhtml" ]]; then
    crunchyhtml=$(curl "$crunchyurl" 2>&3)
  fi
  echo "$crunchyhtml" 2>&3 |
    pup 'a attr{href}' 2>&3 |
    grep -oP '^/one-piece/episode-\d+-.*' 2>&3 |
    sed 's|^/one-piece/episode-||' 2>&3 |
    sort -u -n 2>&3
}
# Print the Crunchyroll URL of episode $1.
# The entry starting with "<episode>-" is taken from the cached
# crunchyroll-url-list when the cache may be used (not generating a cache,
# no --bypass, $ONEPIECE_EPISODE_DATA set, file present) and from
# get_crunchyroll_url_list otherwise.
get_crunchyroll_url() {
  ep="$1"
  local listfile="$ONEPIECE_EPISODE_DATA/crunchyroll-url-list"
  if [[ -z $cache && -z $bypass && -n $ONEPIECE_EPISODE_DATA && -f $listfile ]]; then
    relative=$(grep -P "^$ep-" "$listfile")
  else
    relative=$(get_crunchyroll_url_list | grep -P "^$ep-")
  fi
  echo "http://www.crunchyroll.com/one-piece/episode-$relative"
}
# Write the title cache files for a range of episodes into the current
# directory, then exit.
# Arguments: [FIRST] LAST - passed straight to seq
# Outputs:   ep<N>-title-{english,romaji,japanese,crunchy} and
#            crunchyroll-url-list
generate_cache() {
  local ep lang
  # never answer from an existing cache while rebuilding it
  bypass='t'
  for ep in $(seq "$@"); do
    # get_title keeps the downloaded page in $html and only downloads when it
    # is empty; without this reset every episode after the first would be
    # written from the first episode's page.
    html=''
    for lang in english romaji japanese; do
      get_title "$ep" "$lang" > "ep$ep-title-$lang"
    done
  done
  get_crunchyroll_url_list > "crunchyroll-url-list"
  for ep in $(seq "$@"); do
    get_crunchyroll_title "$ep" 2>&3 > "ep$ep-title-crunchy"
  done
  exit 0
}
# ------------------------------------------------------------------------------- | |
# ,-----------, | |
# | Debugging | | |
# '-----------' | |
# With $DEBUG set, dump the first argument and every option/state variable,
# one per line, between a header and a footer line.
if [[ -n $DEBUG ]]; then
  printf '%s\n' \
    "-----DEBUG INFO-----" \
    "\$1 $1 " \
    "bypass $bypass " \
    "cache $cache " \
    "english $english " \
    "japanese $japanese " \
    "romaji $romaji " \
    "crunchy $crunchy " \
    "title $title " \
    "format $format " \
    "newline $newline " \
    "numformat $numformat" \
    "epurl $epurl " \
    "verbose $verbose " \
    "ep $ep " \
    "lang $lang " \
    "html $html " \
    "selector $selector " \
    "----- END DEBUG INFO-----"
fi
# ------------------------------------------------------------------------------- | |
# ,------------, | |
# | Main logic | | |
# '------------' | |
# Expand a --format template for the episode being printed.
# Globals:   n, e, r, j, c (read) - episode number and the english, romaji,
#            japanese and Crunchyroll titles
# Arguments: $1 - template; may contain $n, $e, $r, $j, $c (or ${n} ...) and
#            backslash escapes such as \n
# Outputs:   the expanded text on stdout, with no newline added
# The scraped titles are inserted as plain text.  The template used to be
# run through `eval` and then used as a printf format, so a quote, `$(...)`,
# backtick or % inside a title was interpreted instead of printed.
expand_title_format() {
  local text key value braced plain
  # Interpret backslash escapes in the template only, before any scraped
  # text is inserted.
  printf -v text '%b' "$1"
  for key in n e r j c; do
    value=${!key}
    braced="\${$key}"
    plain="\$$key"
    # The replacement is quoted so that an "&" in a title stays literal.
    text=${text//"$braced"/"$value"}
    text=${text//"$plain"/"$value"}
  done
  printf '%s' "$text"
}

if [[ ! -x $(command -v pup) ]]; then
  echo "install pup (https://github.com/ericchiang/pup) and place on $PATH" >&2
  exit 1
fi

if [[ -n $cache ]]; then
  # generate cache
  if [[ $cache != 'here' ]]; then
    # Stop if the cache directory cannot be entered; otherwise the cache
    # files would silently be written to the current directory.
    cd "$ONEPIECE_EPISODE_DATA" || exit 1
  fi
  generate_cache "$@"
else
  if [[ -n $title ]]; then
    e=$(get_title "$1" "english")
    r=$(get_title "$1" "romaji")
    j=$(get_title "$1" "japanese")
    # The Crunchyroll title costs extra downloads, so it is only looked up
    # when the format actually refers to it.
    c=''
    if [[ $format == *'$c'* || $format == *'${c}'* ]]; then
      c=$(get_title "$1" "crunchy")
    fi
    numformat="${numformat:-%03d}"
    n=$(printf "$numformat" "$1")
    expand_title_format "${format:-\$n. \$e・\$r (\$j)}"
  fi
  if [[ -n $english ]]; then get_title "$1" "english"; fi
  if [[ -n $romaji ]]; then get_title "$1" "romaji"; fi
  if [[ -n $japanese ]]; then get_title "$1" "japanese"; fi
  if [[ -n $crunchy ]]; then get_title "$1" "crunchy"; fi
  if [[ -n $epurl ]]; then get_crunchyroll_url "$1"; fi
fi
exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash | |
# ,------, | |
# | Help | | |
# '------' | |
# Print the usage text and stop when the first argument is -h or --help.
# The here-doc delimiter is quoted so that $ONEPIECE_EPISODE_DATA is shown
# literally instead of being expanded (to nothing when unset).
# Only -h, -b and -v are listed: the option parser of this script rejects
# -c/-C as "unrecognized option", so advertising them was misleading.
if [[ "$1" =~ ^-h$|^--help$ ]]; then
  cat <<'EOF'
Usage: onepiece-episode-volume [OPTION...] N
Print manga volumes corresponding to a One Piece anime episode
Data scraped from the One Piece Wiki at https://onepiece.fandom.com/wiki/
Options:
-h, --help show help
-b, --bypass-cache ignore cache files in $ONEPIECE_EPISODE_DATA
-v, --verbose show stderror output
EOF
  exit 0
fi
# ------------------------------------------------------------------------------- | |
# ,-----------------, | |
# | Options (other) | | |
# '-----------------' | |
# Both variables hold the flag exactly as it is handed on to
# onepiece-episode-chapters and onepiece-find-volume, or nothing at all.
# bypass used to be set to 't', which those commands then received as a
# stray argument "t" and rejected.
bypass=''
verbose=''

# The loop stops when one argument is left: that one is the episode number.
while (( $# > 1 )); do
  case "$1" in
    -h | --help)
      # help is only honoured as the very first argument; skip it here
      shift
      ;;
    -b | --bypass-cache)
      bypass='-b'
      shift
      ;;
    -v | --verbose)
      verbose='-v'
      shift
      ;;
    *)
      # `return` is only valid inside a function or a sourced file; at the
      # top level of a script it fails without terminating, and since the
      # argument was never shifted the loop spun forever.
      echo "unrecognized option: $1" >&2
      exit 1
      ;;
  esac
done
# --- end options --- | |
# ------------------------------------------------------------------------------- | |
# ,--------, | |
# | STDERR | | |
# '--------' | |
# Open descriptor 3 as the real stderr with --verbose and as /dev/null
# without it.
case "$verbose" in
  '') exec 3>/dev/null ;;
  *)  exec 3>&2 ;;
esac
# ------------------------------------------------------------------------------- | |
# ,------------------, | |
# | Main conditional | | |
# '------------------' | |
# List the chapters of the episode one per line and look up the volume of
# each of them in turn.  The flags are word-split on purpose so that an empty
# one disappears instead of being passed on as an empty argument.
forwarded=($bypass $verbose)
while read -r chapter || [[ -n $chapter ]]; do
  onepiece-find-volume "${forwarded[@]}" "$chapter"
done < <(onepiece-episode-chapters -o "${forwarded[@]}" "$1")
exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash | |
# Find the One Piece tankōbon volume containing a given chapter | |
# Data scraped from the One Piece Wiki at: | |
# https://onepiece.fandom.com/wiki/Chapters_and_Volumes | |
# Uses onepiece-list-volume to extract the volume data. | |
# ,------, | |
# | Help | | |
# '------' | |
# Print the usage text and stop when the first argument is -h or --help.
# The here-doc delimiter is quoted so that $ONEPIECE_VOLUME_DATA is shown
# literally instead of being expanded (to nothing when unset).
if [[ "$1" =~ ^-h$|^--help$ ]]; then
  cat <<'EOF'
onepiece-find-volume CH
Find the volume containing chapter no. CH
Options:
-h, --help show help
-b, --bypass-cache ignore cache files in $ONEPIECE_VOLUME_DATA
-v, --verbose show stderror output
EOF
  exit 0
fi
# ------------------------------------------------------------------------------- | |
# ,-----------------, | |
# | Options (other) | | |
# '-----------------' | |
# Both variables hold the flag exactly as it is handed on to
# onepiece-list-volume, or nothing at all.
bypass=''
verbose=''

# The loop stops when one argument is left: that one is the chapter number.
while (( $# > 1 )); do
  case "$1" in
    -h | --help)
      # help is only honoured as the very first argument; skip it here
      shift
      ;;
    -b | --bypass-cache)
      bypass='-b'
      shift
      ;;
    -v | --verbose)
      verbose='-v'
      shift
      ;;
    *)
      # `return` is only valid inside a function or a sourced file; at the
      # top level of a script it fails without terminating, and since the
      # argument was never shifted the loop spun forever.  The message goes
      # to stderr because stdout carries the volume number.
      echo "unrecognized option: $1" >&2
      exit 1
      ;;
  esac
done
# ------------------------------------------------------------------------------- | |
# ,-----------, | |
# | Main loop | | |
# '-----------' | |
# Highest volume number that is tried.  The default of 100 is the former
# hard-coded limit; set ONEPIECE_MAX_VOLUME to search further.
max_volume=${ONEPIECE_MAX_VOLUME:-100}
if [[ $max_volume =~ ^[0-9]+$ ]]; then
  max_volume=$((10#$max_volume))
else
  max_volume=100
fi

# Volume listings start each line with the zero-padded chapter number.
# A plain number is read as base 10, so that e.g. "008" is not rejected as
# an invalid octal value; anything else is formatted as before.
if [[ $1 =~ ^[0-9]+$ ]]; then
  printf -v ch '%03d' "$((10#$1))"
else
  printf -v ch '%03d' "$1"
fi

# Print the first volume whose listing has a line starting with exactly this
# chapter number.  \b keeps "100" from matching the line of chapter 1000.
for ((vol = 1; vol <= max_volume; vol++)); do
  if onepiece-list-volume $bypass $verbose "$vol" | grep -Pq "^${ch}\\b"; then
    echo "$vol"
    exit
  fi
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash | |
# Print chapters in One Piece tankōbon vol. N | |
# Data scraped from the One Piece Wiki at: | |
# https://onepiece.fandom.com/wiki/Chapters_and_Volumes | |
# ------------------------------------------------------------------------------- | |
# ,------, | |
# | Help | | |
# '------' | |
# Print the usage text and stop when the first argument is -h or --help.
# The here-doc delimiter is quoted so that $ONEPIECE_VOLUME_DATA is shown
# literally instead of being expanded (to nothing when unset).
if [[ "$1" =~ ^-h$|^--help$ ]]; then
  cat <<'EOF'
Usage: onepiece-list-volume [OPTION...] N
onepiece-list-volume [-c|-C] [FIRST] LAST
Print chapters in One Piece tankōbon vol. N.
With -c/-C, generate cache files.
Options:
-h, --help show help
-b, --bypass-cache ignore cache files in $ONEPIECE_VOLUME_DATA
-c, --generate-cache arguments: [FIRST] LAST
generate cache files in $ONEPIECE_VOLUME_DATA
-C, --generate-cache-here arguments: [FIRST] LAST
generate cache files in current directory
-v, --verbose show stderror output
EOF
  exit 0
fi
# ------------------------------------------------------------------------------- | |
# ,-----------------, | |
# | Options (other) | | |
# '-----------------' | |
# Option state; an empty string means "not requested".  The variables are
# initialised here (as the sibling scripts do) so that a value inherited
# from the environment is not mistaken for an option.
bypass=''   # -b: do not read existing cache files
cache=''    # -c => 't' (write cache under $ONEPIECE_VOLUME_DATA), -C => 'here'
verbose=''  # -v: let tool diagnostics through on stderr

# The loop stops when one argument is left, so the final argument (the
# volume number, or LAST for -c/-C) is never parsed as an option.
while (( $# > 1 )); do
  case "$1" in
    -h | --help)
      # help is only honoured as the very first argument; skip it here
      shift
      ;;
    -b | --bypass-cache)
      bypass='t'
      shift
      ;;
    -c | --generate-cache)
      cache='t'
      shift
      break # everything that follows is [FIRST] LAST
      ;;
    -C | --generate-cache-here)
      cache='here'
      shift
      break # everything that follows is [FIRST] LAST
      ;;
    -v | --verbose)
      verbose='t'
      shift
      ;;
    *)
      # `return` is only valid inside a function or a sourced file; at the
      # top level of a script it fails without terminating, and since the
      # argument was never shifted the loop spun forever.  The message goes
      # to stderr because stdout carries the chapter list.
      echo "unrecognized option: $1" >&2
      exit 1
      ;;
  esac
done
# ------------------------------------------------------------------------------- | |
# ,----------, | |
# | Function | | |
# '----------' | |
# Print the chapter list of volume $1, one "<number>. <title>" line per
# chapter, scraped from the Chapters_and_Volumes wiki page.
# Descriptor 3 is (re)opened on every call: the real stderr when $verbose is
# set, /dev/null otherwise; every stage sends its diagnostics there.
list_volume_chapters() {
  case "$verbose" in
    '') exec 3>/dev/null ;;
    *)  exec 3>&2 ;;
  esac
  url='https://onepiece.fandom.com/wiki/Chapters_and_Volumes'
  local chapter_list="table#Volume_$1>tbody>tr>td>table.collapsible>tbody>tr:nth-of-type(5)>td:first-child>ul"
  # The first sed expression turns a "* 001\." bullet into "001.", the second
  # one drops empty lines.
  curl "$url" 2>&3 |
    hxclean 2>&3 |
    hxselect -c "$chapter_list" 2>&3 |
    hxremove '.t_nihongo_icon' 2>&3 |
    html2text --ignore-links --ignore-emphasis 2>&3 |
    sed -r -e 's/^\* ([0-9]+)\\\./\1./' -e '/^$/d' 2>&3
}
# ------------------------------------------------------------------------------- | |
# ,------------, | |
# | Main Logic | | |
# '------------' | |
if [[ -n $cache ]]; then
  # generate cache: one vol-NNN-chapters file per volume in [FIRST] LAST
  if [[ $cache != 'here' ]]; then
    # Stop if the cache directory cannot be entered; otherwise the cache
    # files would silently be written to the current directory.
    cd "$ONEPIECE_VOLUME_DATA" || exit 1
  fi
  for vol in $(seq "$@"); do
    printf -v cachefile 'vol-%03d-chapters' "$vol"
    list_volume_chapters "$vol" > "$cachefile"
  done
  exit 0
fi

# Answer from the cache when a cache directory is configured, the file for
# this volume exists and --bypass-cache was not given; scrape otherwise.
printf -v cachefile '%s/vol-%03d-chapters' "$ONEPIECE_VOLUME_DATA" "$1"
if [[ -n $ONEPIECE_VOLUME_DATA && -f $cachefile && -z $bypass ]]; then
  cat "$cachefile"
else
  list_volume_chapters "$1"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment