Skip to content

Instantly share code, notes, and snippets.

@troyp
Last active January 12, 2021 23:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save troyp/4f25a1df892f465227b3f540e5d89618 to your computer and use it in GitHub Desktop.
Save troyp/4f25a1df892f465227b3f540e5d89618 to your computer and use it in GitHub Desktop.
One Piece scripts for scraping and caching anime and manga data
onepiece-episode-title: Get an episode title in English, romaji and/or Japanese, or print the title according to a format string
prerequisites:
gnu tools: bash, sed, tr
curl
pup (https://github.com/ericchiang/pup)
xmlstarlet (unesc)
onepiece-episode-chapters: List the manga chapters adapted by an episode
prerequisites:
gnu tools: bash, sed, tr
curl
perl
pup (https://github.com/ericchiang/pup), used by -o/--chapters-only
HTML-XML-utils
html2text
onepiece-list-volume: List the manga chapters in a tankōbon volume
prerequisites:
gnu tools: bash, sed, tr
curl
HTML-XML-utils
html2text
onepiece-find-volume: Find the tankōbon volume containing a manga chapter
prerequisites:
gnu tools: bash, sed, tr
curl
HTML-XML-utils
html2text
#! /bin/bash
# ,------,
# | Help |
# '------'
# Print the usage text and exit when the first argument is -h or --help.
# The here-document delimiter is quoted so that $ONEPIECE_EPISODE_DATA appears
# literally (as the name of the variable to set) instead of being replaced by
# its current value, or by nothing at all when it is unset.
if [[ "$1" =~ ^-h$|^--help$ ]]; then
cat <<'EOF'
Usage: onepiece-episode-chapters [OPTION...] N
Print manga chapters corresponding to a One Piece anime episode
Data scraped from the One Piece Wiki at https://onepiece.fandom.com/wiki/
Options:
-h, --help show help
-b, --bypass-cache ignore cache files in $ONEPIECE_EPISODE_DATA
-c, --generate-cache arguments: [FIRST] LAST
generate cache files in $ONEPIECE_EPISODE_DATA
-C, --generate-cache-here arguments: [FIRST] LAST
generate cache files in current directory
-o, --chapters-only list chapters one to a line
-v, --verbose show stderror output
EOF
  exit 0
fi
# -------------------------------------------------------------------------------
# ,-----------------,
# | Options (other) |
# '-----------------'
# Parse leading options.
# The loop stops while one argument is still left (the episode number N), or as
# soon as -c/-C is seen, in which case the remaining arguments are the
# [FIRST] LAST range handed to generate_cache.
bypass='';
cache='';
maxep='';
only='';
verbose='';
while [[ $# -gt 1 ]]; do
  case "$1" in
    (-h | --help)
      shift 1 ;;
    (-b | --bypass-cache)
      bypass='t'
      shift 1 ;;
    (-c | --generate-cache)
      cache='t'
      shift 1
      break ;;
    (-C | --generate-cache-here)
      cache='here'
      shift 1
      break ;;
    (-o | --chapters-only)
      only='t'
      shift 1 ;;
    (-v | --verbose)
      verbose='t'
      shift 1 ;;
    (*)
      # `return` is only valid in a function or a sourced file; at script top
      # level it fails without shifting, so the loop would never terminate.
      echo "unrecognized option: $1" >&2
      exit 1 ;;
  esac
done
# --- end options ---
# -------------------------------------------------------------------------------
# ,--------,
# | STDERR |
# '--------'
# File descriptor 3 is the sink for the stderr of the scraping pipelines:
# the real stderr when --verbose was given, /dev/null otherwise.
case "$verbose" in
  '') exec 3>/dev/null ;;
  *)  exec 3>&2 ;;
esac
# -------------------------------------------------------------------------------
# ,-----------,
# | Functions |
# '-----------'
#######################################
# Print the manga chapters adapted by one episode, scraped from its wiki page.
# Globals:
#   only (read)  - when non-empty, print the href of every chapter link
#                  instead of the plain-text rendering
#   fd 3 (write) - receives the stderr of the download and extraction tools
# Arguments:
#   $1 - episode number
# Outputs:
#   the extracted chapter data on stdout
#######################################
get_episode_result() {
  local url="https://onepiece.fandom.com/wiki/Episode_$1"
  local page selector

  # Download the page once; it is used both for the layout probe and for the
  # extraction itself.
  page=$(curl "$url" 2>&3)

  # check that the Season | Piece table is present at the bottom of "Japanese Information"
  if {
    printf '%s\n' "$page" |
      hxclean |
      hxselect -c "#mw-content-text>aside>section:nth-of-type(4)>table" |
      grep -i season
  } > /dev/null 2>&1; then
    selector="#mw-content-text>aside>section:nth-of-type(7)>div:first-of-type>div"
  else
    # if the table is missing, we need the 6th section instead of the 7th
    selector="#mw-content-text>aside>section:nth-of-type(6)>div:first-of-type>div"
  fi

  if [[ -n $only ]]; then
    {
      printf '%s\n' "$page" |
        hxclean |
        hxselect -c "$selector" |
        pup "a attr{href}"
    } 2>&3
  else
    {
      # NOTE: the perl substitution has no /g flag, so only the first match
      # (leading whitespace if present, otherwise trailing) is removed.
      printf '%s\n' "$page" |
        hxclean |
        hxselect -c "$selector" |
        html2text --ignore-links |
        perl -0777 -pe 's/^\s+|\s+$//'
    } 2>&3
  fi
}
#######################################
# Write one cache file "ep<N>-chapters" per episode into the current
# directory, then terminate the script.
# Arguments:
#   [FIRST] LAST - episode range, passed straight to seq
# Outputs:
#   files ep<N>-chapters; each run overwrites an existing file so that
#   regenerating the cache does not duplicate its contents
#######################################
generate_cache() {
  local ep
  for ep in $(seq "$@"); do
    get_episode_result "$ep" > "ep$ep-chapters"
  done
  exit 0
}
# -------------------------------------------------------------------------------
# ,---------------,
# | Retrieve data |
# '---------------'
# Get the data from cache if:
# (1) we need to return a result (ie. we're not generating cache files)
# (2) --bypass wasn't specified
# (3) the cache directory was specified (with $ONEPIECE_EPISODE_DATA)
# (4) the cache file exists
# ,------------------,
# | Main conditional |
# '------------------'
if [[ -n $cache ]]; then
  # generate cache (generate_cache terminates the script when it is done);
  # nothing is fetched up front because no single-episode result is needed
  if [[ $cache != 'here' ]]; then
    # stop instead of writing the cache files into the wrong directory
    cd "$ONEPIECE_EPISODE_DATA" || exit 1
  fi
  generate_cache "$@"
else
  # The cache file lives in $ONEPIECE_EPISODE_DATA, which is where
  # --generate-cache writes it, so it is looked up there and not in the
  # current directory.
  cachefile="$ONEPIECE_EPISODE_DATA/ep$1-chapters"
  if [[ -z $bypass && -n $ONEPIECE_EPISODE_DATA && -f $cachefile ]]; then
    result=$(cat "$cachefile")
  else
    result=$(get_episode_result "$1")
  fi

  if [[ -n $only ]]; then
    # output chapters only
    # NOTE(review): this expects /wiki/Chapter_N hrefs; a cache file written
    # without -o holds plain text instead - confirm the intended behaviour.
    # echo "$result" | perl -ne 'while ($_ =~ /(?<![-.])\b(\d+)\b(?![-,])/g) {print "$1\n"}';
    echo "$result" | perl -0777 -ple 's/\/wiki\/Chapter_([0-9]+)/$1/g'
  else
    # output full result
    echo "$result"
  fi
fi
exit 0;
#! /bin/bash
# ,------,
# | Help |
# '------'
# Show the usage text and stop when the first argument is exactly -h or --help.
# The here-document is quoted, so $ONEPIECE_EPISODE_DATA, $n, $e, $r and $j are
# printed literally.
case "$1" in
  -h|--help)
cat <<'EOF'
Usage: onepiece-episode-title [OPTION...] N
Print the title of a One Piece anime episode
Data scraped from the One Piece Wiki at http://onepiece.wikia.com/wiki/
Crunchyroll translated titles scraped from http://www.crunchyroll.com/one-piece
Options:
-h, --help show help
-b, --bypass-cache ignore cache files in $ONEPIECE_EPISODE_DATA
-c, --generate-cache arguments: [FIRST] LAST
generate cache files in $ONEPIECE_EPISODE_DATA
-C, --generate-cache-here arguments: [FIRST] LAST
generate cache files in current directory
-n, --newline print a newline after each output
-N, --number-format format for episode number in --title (printf format string)
-e, --english print english title
-E, --english-crunchy print Crunchyroll english title
-f, --format specify a format for --title (may use $n for ep no.,
$e, $r, $j for english/romaji/japanese title)
-j, --japanese print japanese title
-r, --romaji print romaji title
-t, --title print whole title
-u, --url print Crunchyroll URL
-v, --verbose show stderror output
EOF
    exit 0
    ;;
esac
# -------------------------------------------------------------------------------
# ,-----------------,
# | Options (other) |
# '-----------------'
# Start from a clean slate so that nothing is inherited from the environment.
unset bypass cache english crunchy japanese romaji title format newline numformat epurl verbose

# Consume options until only the final argument (the episode number) is left.
# -c/-C end option parsing early: what follows them is the [FIRST] LAST range.
until (( $# <= 1 )); do
  case "$1" in
    -h|--help)                shift ;;
    -b|--bypass-cache)        bypass='t';       shift ;;
    -c|--generate-cache)      cache='t';        shift; break ;;
    -C|--generate-cache-here) cache='here';     shift; break ;;
    -e|--english)             english='t';      shift ;;
    -E|--english-crunchy)     crunchy='t';      shift ;;
    -f|--format)              format="$2";      shift 2 ;;
    -n|--newline)             newline='t';      shift ;;
    -N|--number-format)       numformat="$2";   shift 2 ;;
    -r|--romaji)              romaji='t';       shift ;;
    -j|--japanese)            japanese='t';     shift ;;
    -t|--title)               title='t';        shift ;;
    -u|--url)                 epurl='t';        shift ;;
    -v|--verbose)             verbose='t';      shift ;;
    *)
      echo "unrecognized option: $1"
      exit 1
      ;;
  esac
done
# -------------------------------------------------------------------------------
# ,--------,
# | STDERR |
# '--------'
# fd 3 collects the stderr of curl/pup and friends: discarded by default,
# forwarded to the real stderr when --verbose is in effect.
if [[ -z $verbose ]]; then
  exec 3>/dev/null
else
  exec 3>&2
fi
# -------------------------------------------------------------------------------
# ,-----------,
# | Functions |
# '-----------'
#######################################
# Print the title of one episode in one language.
# Globals:
#   cache, bypass, ONEPIECE_EPISODE_DATA (read) - decide whether a cache file
#       may be used
#   newline (read)  - when non-empty, a newline is printed after the title
#   html (written)  - wiki page of the most recently scraped episode
#   html_ep (written) - episode number that $html belongs to
#   fd 3 (write)    - receives stderr of the scraping tools
# Arguments:
#   $1 - episode number
#   $2 - english | romaji | japanese | crunchy
# Outputs:
#   the title on stdout
#######################################
get_title() {
  local ep="$1"
  local lang="$2"
  local url="http://onepiece.wikia.com/wiki/Episode_$ep"
  local cachefile="$ONEPIECE_EPISODE_DATA/ep$ep-title-$lang"
  local selector=''

  # Get the data from cache if:
  # (1) we need to return a result (ie. we're not generating cache files)
  # (2) --bypass wasn't specified
  # (3) the cache directory was specified (with $ONEPIECE_EPISODE_DATA)
  # (4) the cache file exists
  if [[ -z $cache && -z $bypass && -n $ONEPIECE_EPISODE_DATA && -f $cachefile ]]; then
    cat "$cachefile"
  elif [[ $lang == crunchy ]]; then
    get_crunchyroll_title "$ep" 2>&3
  else
    # Download the wiki page only when it is actually going to be scraped, and
    # only reuse a previously downloaded page if it belongs to this episode;
    # otherwise a run over several episodes would keep returning the titles of
    # the first one.
    if [[ -z $html || $html_ep != "$ep" ]]; then
      html=$(curl "$url" 2>&3)
      html_ep=$ep
    fi
    case "$lang" in
      english)  selector="aside > h2" ;;
      romaji)   selector="aside > section:nth-of-type(2) > div:nth-of-type(2) > div" ;;
      japanese) selector="aside > section:nth-of-type(2) > div:nth-of-type(1) > div" ;;
    esac
    {
      echo "$html" |
        pup "$selector text{}" |
        xmlstarlet unesc |
        tr '\n' ' ' |
        sed 's/ *$//'
    } 2>&3
  fi

  if [[ -n $newline ]]; then
    echo 2>&3
  fi
}
# Print the Crunchyroll title of episode $1 (no trailing newline).
# The episode page URL comes from get_crunchyroll_url; the header line that
# contains "Episode <number>" is kept and everything up to and including the
# number and its separator is stripped. Tool stderr goes to fd 3.
get_crunchyroll_title() {
  crunchyepurl="$(get_crunchyroll_url "$1")"
  {
    curl "$crunchyepurl" \
      | pup ".showmedia-header h1 text{}" \
      | grep -P 'Episode [0-9]+' \
      | sed -r -e 's/^.*Episode [0-9]+[ –-]+//' -e 's/ *$//' \
      | tr -d '\n'
  } 2>&3
}
# Print the relative episode URLs found on the Crunchyroll series page, one per
# line, with the "/one-piece/episode-" prefix removed, numerically sorted and
# de-duplicated. The downloaded page is kept in $crunchyhtml and reused on
# later calls made in the same shell. Tool stderr goes to fd 3.
get_crunchyroll_url_list() {
  crunchyurl="http://www.crunchyroll.com/one-piece"
  if [[ -z $crunchyhtml ]]; then
    crunchyhtml=$(curl "$crunchyurl" 2>&3)
  fi
  {
    echo "$crunchyhtml" \
      | pup 'a attr{href}' \
      | grep -oP '^/one-piece/episode-\d+-.*' \
      | sed 's|^/one-piece/episode-||' \
      | sort -u -n
  } 2>&3
}
# Print the absolute Crunchyroll URL for episode $1.
# The relative part is the line starting with "<episode>-" taken from the
# cached "crunchyroll-url-list" file when the cache may be used, otherwise
# from get_crunchyroll_url_list. When nothing matches, the URL is printed with
# an empty relative part.
get_crunchyroll_url() {
  ep="$1"
  local list_file="$ONEPIECE_EPISODE_DATA/crunchyroll-url-list"
  if [[ -z $cache && -z $bypass && -n $ONEPIECE_EPISODE_DATA && -f $list_file ]]; then
    relative=$(grep -P "^$ep-" < "$list_file")
  else
    relative=$(get_crunchyroll_url_list | grep -P "^$ep-")
  fi
  echo "http://www.crunchyroll.com/one-piece/episode-$relative"
}
#######################################
# Write the title cache for a range of episodes into the current directory,
# then terminate the script.
# Globals:
#   bypass (written) - forced to 't'
#   html (written)   - cleared before each episode
#   fd 3 (write)     - receives stderr of the Crunchyroll title lookup
# Arguments:
#   [FIRST] LAST - episode range, passed straight to seq
# Outputs:
#   files ep<N>-title-{english,romaji,japanese,crunchy} and
#   crunchyroll-url-list
#######################################
generate_cache() {
  local ep lang
  bypass='t'
  for ep in $(seq "$@"); do
    # get_title keeps the downloaded page in $html and is called here in the
    # current shell; clear it so every episode gets its own page while the
    # three languages of one episode still share a single download.
    html=''
    for lang in english romaji japanese; do
      get_title "$ep" "$lang" > "ep$ep-title-$lang"
    done
  done
  get_crunchyroll_url_list > "crunchyroll-url-list"
  for ep in $(seq "$@"); do
    get_crunchyroll_title "$ep" 2>&3 > "ep$ep-title-crunchy"
  done
  exit 0
}
# -------------------------------------------------------------------------------
# ,-----------,
# | Debugging |
# '-----------'
# When $DEBUG is non-empty, dump the first positional argument and the state
# variables of the script, one "name value" pair per line, between two marker
# lines. printf re-applies its format to every name/value pair.
if [[ -n $DEBUG ]]; then
  printf '%s\n' "-----DEBUG INFO-----"
  printf '%s %s \n' \
    '$1'     "$1" \
    bypass   "$bypass" \
    cache    "$cache" \
    english  "$english" \
    japanese "$japanese" \
    romaji   "$romaji" \
    crunchy  "$crunchy" \
    title    "$title" \
    format   "$format" \
    newline  "$newline"
  printf '%s %s\n' numformat "$numformat"
  printf '%s %s \n' \
    epurl    "$epurl" \
    verbose  "$verbose" \
    ep       "$ep" \
    lang     "$lang" \
    html     "$html" \
    selector "$selector"
  printf '%s\n' "----- END DEBUG INFO-----"
fi
# -------------------------------------------------------------------------------
# ,------------,
# | Main logic |
# '------------'
# pup is required by every scraping function below.
if [[ ! -x $(command -v pup) ]]; then
  echo "install pup (https://github.com/ericchiang/pup) and place on $PATH";
  exit 1;
fi;

if [[ -n $cache ]]; then
  # generate cache (generate_cache terminates the script when it is done)
  if [[ $cache != 'here' ]]; then
    # stop instead of writing the cache files into the wrong directory
    cd "$ONEPIECE_EPISODE_DATA" || exit 1
  fi
  generate_cache "$@"
else
  if [[ -n $title ]]; then
    e=$(get_title "$1" "english")
    r=$(get_title "$1" "romaji")
    j=$(get_title "$1" "japanese")
    numformat="${numformat:-%03d}"
    n=$(printf "$numformat" "$1")
    format=${format:-'$n. $e・$r ($j)'}

    # The Crunchyroll title costs extra network requests; look it up only
    # when the format refers to it.
    c=''
    if [[ $format == *'$c'* ]]; then
      c=$(get_title "$1" "crunchy")
    fi

    # Expand the format without eval: the titles are scraped from the web and
    # must never be re-parsed by the shell or used as a printf format.
    # Backslash escapes (\n, \t, ...) are interpreted in the format only; then
    # $n, $e, $r, $j and $c are substituted in a single left-to-right pass so
    # that text inside a title is never substituted again.
    printf -v template '%b' "$format"
    output=''
    while [[ $template == *'$'* ]]; do
      output+=${template%%'$'*}
      template=${template#*'$'}
      key=${template:0:1}
      next=${template:1:1}
      if [[ $key == [nerjc] && $next != [A-Za-z0-9_] ]]; then
        output+=${!key}
        template=${template:1}
      else
        output+='$'
      fi
    done
    output+=$template
    printf '%s' "$output"
  fi
  [[ -n $english ]] && get_title "$1" "english";
  [[ -n $romaji ]] && get_title "$1" "romaji";
  [[ -n $japanese ]] && get_title "$1" "japanese";
  [[ -n $crunchy ]] && get_title "$1" "crunchy";
  [[ -n $epurl ]] && get_crunchyroll_url "$1";
fi;
exit 0;
#! /bin/bash
# ,------,
# | Help |
# '------'
# Print the usage text and exit when the first argument is -h or --help.
# The here-document delimiter is quoted so that $ONEPIECE_EPISODE_DATA is shown
# literally instead of being expanded (to nothing when it is unset).
# Only the options that the option parser of this script accepts are listed;
# -c/-C are rejected by it as unrecognized.
if [[ "$1" =~ ^-h$|^--help$ ]]; then
cat <<'EOF'
Usage: onepiece-episode-volume [OPTION...] N
Print manga volumes corresponding to a One Piece anime episode
Data scraped from the One Piece Wiki at https://onepiece.fandom.com/wiki/
Options:
-h, --help show help
-b, --bypass-cache ignore cache files in $ONEPIECE_EPISODE_DATA
-v, --verbose show stderror output
EOF
  exit 0
fi
# -------------------------------------------------------------------------------
# ,-----------------,
# | Options (other) |
# '-----------------'
# Parse leading options; the last argument (the episode number) is left in $1.
# $bypass and $verbose hold the literal flags that are forwarded, unquoted, to
# onepiece-episode-chapters and onepiece-find-volume, so each must be either
# empty or a real option of those commands.
bypass='';
verbose='';
while [[ $# -gt 1 ]]; do
  case "$1" in
    (-h | --help)
      shift 1 ;;
    (-b | --bypass-cache)
      bypass='-b'
      shift 1 ;;
    (-v | --verbose)
      verbose='-v'
      shift 1 ;;
    (*)
      # `return` is only valid in a function or a sourced file; at script top
      # level it fails without shifting, so the loop would never terminate.
      echo "unrecognized option: $1" >&2
      exit 1 ;;
  esac
done
# --- end options ---
# -------------------------------------------------------------------------------
# ,--------,
# | STDERR |
# '--------'
# Open fd 3 as the diagnostics sink: /dev/null unless verbose output was
# requested, in which case it duplicates stderr.
if [[ ${#verbose} -eq 0 ]]; then
  exec 3>/dev/null
else
  exec 3>&2
fi
# -------------------------------------------------------------------------------
# ,------------------,
# | Main conditional |
# '------------------'
# For every chapter number printed by onepiece-episode-chapters -o, print the
# volume reported by onepiece-find-volume. $bypass and $verbose are left
# unquoted on purpose so that an empty value contributes no argument.
# The `|| [[ -n $ch ]]` clause handles a last line without a trailing newline.
while read -r ch || [[ -n $ch ]]; do
  onepiece-find-volume $bypass $verbose "$ch"
done < <(onepiece-episode-chapters -o $bypass $verbose "$1")
exit 0;
#! /bin/bash
# Find the One Piece tankōbon volume containing a given chapter
# Data scraped from the One Piece Wiki at:
# https://onepiece.fandom.com/wiki/Chapters_and_Volumes
# Uses onepiece-list-volume to extract the volume data.
# ,------,
# | Help |
# '------'
# Print the usage text and exit when the first argument is -h or --help.
# The here-document delimiter is quoted so that $ONEPIECE_VOLUME_DATA is shown
# literally instead of being expanded (to nothing when it is unset).
if [[ "$1" =~ ^-h$|^--help$ ]]; then
cat <<'EOF'
onepiece-find-volume CH
Find the volume containing chapter no. CH
Options:
-h, --help show help
-b, --bypass-cache ignore cache files in $ONEPIECE_VOLUME_DATA
-v, --verbose show stderror output
EOF
  exit 0
fi
# -------------------------------------------------------------------------------
# ,-----------------,
# | Options (other) |
# '-----------------'
# Parse leading options; the last argument (the chapter number) is left in $1.
# $bypass and $verbose hold the literal flags later forwarded to
# onepiece-list-volume.
bypass='';
verbose='';
while [[ $# -gt 1 ]]; do
  case "$1" in
    (-h | --help)
      shift 1 ;;
    (-b | --bypass-cache)
      bypass='-b'
      shift 1 ;;
    (-v | --verbose)
      verbose='-v'
      shift 1 ;;
    (*)
      # `return` is only valid in a function or a sourced file; at script top
      # level it fails without shifting, so the loop would never terminate.
      echo "unrecognized option: $1" >&2
      exit 1 ;;
  esac
done
# -------------------------------------------------------------------------------
# ,-----------,
# | Main loop |
# '-----------'
# Scan the volumes in ascending order and print the number of the first one
# whose chapter list has a line starting with the zero-padded chapter number.
# The upper bound defaults to 100 and can be raised with $ONEPIECE_MAX_VOLUME.
max_vol=${ONEPIECE_MAX_VOLUME:-100}

# Strip leading zeros first: printf would otherwise read e.g. "08" as an
# invalid octal number and format it as 000.
num=$1
while [[ $num == 0?* ]]; do
  num=${num#0}
done
ch=$(printf '%03d' "$num")

for (( vol = 1; vol <= max_vol; vol++ )); do
  # \b keeps chapter 100 from matching a line that starts with 1000.
  # $bypass/$verbose stay unquoted so an empty value adds no argument.
  if onepiece-list-volume $bypass $verbose "$vol" | grep -Pq "^$ch\b"; then
    echo "$vol"
    exit 0
  fi
done
#! /bin/bash
# Print chapters in One Piece tankōbon vol. N
# Data scraped from the One Piece Wiki at:
# https://onepiece.fandom.com/wiki/Chapters_and_Volumes
# -------------------------------------------------------------------------------
# ,------,
# | Help |
# '------'
# Print the usage text and exit when the first argument is -h or --help.
# The here-document delimiter is quoted so that $ONEPIECE_VOLUME_DATA is shown
# literally instead of being expanded (to nothing when it is unset).
if [[ "$1" =~ ^-h$|^--help$ ]]; then
cat <<'EOF'
Usage: onepiece-list-volume [OPTION...] N
onepiece-list-volume [-c|-C] [FIRST] LAST
Print chapters in One Piece tankōbon vol. N.
With -c/-C, generate cache files.
Options:
-h, --help show help
-b, --bypass-cache ignore cache files in $ONEPIECE_VOLUME_DATA
-c, --generate-cache arguments: [FIRST] LAST
generate cache files in $ONEPIECE_VOLUME_DATA
-C, --generate-cache-here arguments: [FIRST] LAST
generate cache files in current directory
-v, --verbose show stderror output
EOF
  exit 0
fi
# -------------------------------------------------------------------------------
# ,-----------------,
# | Options (other) |
# '-----------------'
# Parse leading options.
# The flags are initialised first so that same-named variables exported in the
# caller's environment cannot switch on cache generation or verbosity.
# The loop stops while one argument is still left (the volume number N), or as
# soon as -c/-C is seen, in which case the remaining arguments are the
# [FIRST] LAST range.
bypass='';
cache='';
verbose='';
while [[ $# -gt 1 ]]; do
  case "$1" in
    (-h | --help)
      shift 1 ;;
    (-b | --bypass-cache)
      bypass='t'
      shift 1 ;;
    (-c | --generate-cache)
      cache='t'
      shift 1
      break ;;
    (-C | --generate-cache-here)
      cache='here'
      shift 1
      break ;;
    (-v | --verbose)
      verbose='t'
      shift 1 ;;
    (*)
      # `return` is only valid in a function or a sourced file; at script top
      # level it fails without shifting, so the loop would never terminate.
      echo "unrecognized option: $1" >&2
      exit 1 ;;
  esac
done
# -------------------------------------------------------------------------------
# ,----------,
# | Function |
# '----------'
# Print the chapter list of volume $1, one chapter per line.
# The "Chapters and Volumes" wiki page is downloaded, the list inside the
# table with id Volume_<N> is selected, elements of class t_nihongo_icon are
# removed and the rest is rendered as text. A leading "* <number>\." is
# rewritten to "<number>." and empty lines are dropped.
# fd 3 is (re)opened here: stderr when $verbose is set, /dev/null otherwise.
list_volume_chapters() {
  case "$verbose" in
    '') exec 3>/dev/null ;;
    *)  exec 3>&2 ;;
  esac
  url='https://onepiece.fandom.com/wiki/Chapters_and_Volumes'
  {
    curl "$url" \
      | hxclean \
      | hxselect -c "table#Volume_$1>tbody>tr>td>table.collapsible>tbody>tr:nth-of-type(5)>td:first-child>ul" \
      | hxremove '.t_nihongo_icon' \
      | html2text --ignore-links --ignore-emphasis \
      | sed -r -e 's/^\* ([0-9]+)\\\./\1./' -e '/^$/d'
  } 2>&3
}
# -------------------------------------------------------------------------------
# ,------------,
# | Main Logic |
# '------------'
# With -c/-C: write one file vol-NNN-chapters per volume of the requested
# range and exit. Otherwise print the chapters of volume $1, from the cache
# file when one is available and --bypass-cache was not given.
if [[ -n $cache ]]; then
  # generate cache
  if [[ $cache != 'here' ]]; then
    # stop instead of writing the cache files into the wrong directory
    cd "$ONEPIECE_VOLUME_DATA" || exit 1
  fi
  for vol in $(seq "$@"); do
    list_volume_chapters "$vol" > "vol-$(printf '%03d' "$vol")-chapters"
  done
  exit 0
fi

cachefile="$ONEPIECE_VOLUME_DATA/vol-$(printf '%03d' "$1")-chapters"
if [[ -n $ONEPIECE_VOLUME_DATA && -f $cachefile && -z $bypass ]]; then
  cat "$cachefile"
else
  list_volume_chapters "$1"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment