@osirisgothra
Created July 7, 2022 14:43
Generate a listing webpage of gogo play anime English-dub episodes on a single page, with covers! (Note: every link points to episode 1, or to the individual FMV, whatever it may be; please read the disclaimer inside! The script does not transfer any copyrighted data, so don't ask!)
#!/bin/zsh
#
# DISCLAIMER
# the author of this script is not responsible for any content that might be on the websites it is used on
# it is up to the user of the script to make sure they are not accessing copyrighted works.
#
# You will need to type in the website's name (just the domain name and extension);
# typing "example.com", for instance, makes the script fetch from https://example.com.
# You have to type the site name in yourself because I cannot legally put gogoplay, or
# any other website that may have copyrighted material on it, into this script. This
# script is not intended to be used to break any laws. It was written as a test only,
# not to be used for such purposes.
#
# compile gogoplay-style anime dub pages into a single local html file for browsing
# be sure to run this in a directory that zsh can write to AND read from
#
# required installed programs:
# zsh, at least version 5.8 x86_64
# with commands 'read' and 'echo' enabled
# with current path permissions read/write at least for your group or you
# wget, at least version 1.20.3 linux-gnu build
# grep, at least GNU grep version 3.4
# sed, at least GNU sed version 4.7
# perl, at least perl v5.30.0 x86_64 (build gnu-thread-multi)
#
# cat, at least GNU cat version 8.30 --
# tr, at least GNU tr version 8.30 |
# sort, at least GNU sort version 8.30 |-- from coreutils package
# uniq, at least GNU uniq version 8.30 _|
#
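# quick sanity check of the above (this loop is only an illustration, not part of the
# script); it prints the first lines of each tool's version output:
#   for t in zsh wget grep sed perl cat tr sort uniq; do echo "== $t"; $t --version | head -n2; done
#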
# Tested platforms: Linux Ubuntu (linux-gnu build)
#
# TODO: image items were originally assumed to be PNGs on cdnimg named after their title;
# however this is not always the case, and it needs to be fixed by ripping the actual
# image file's name out of the page instead. I was in a hurry, so some images *might* be
# broken at this point; however ALL links DO work, even where the image appears broken,
# and at the time that was the important thing. You can probably fix this little issue
# in a few seconds of editing (see the sketch just below).
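#
# a possible fix (untested sketch, illustrative only): pull the real cover URLs straight
# out of the saved pages instead of guessing "<title>.png", e.g.
#   perl -ne 'print "$1\n" while /<img src="(https:\/\/cdnimg[^"]+)"/g' animedubs-page*.html
# and pair each URL with the /videos link that precedes it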
#
if [[ $1 == --help ]]; then
echo "syntax: $0 [website name] [output filename]"
echo ""
echo "any items not put on the command line will be asked for!"
echo ""
exit 1
fi
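# example invocation (the script filename, domain, and output name here are hypothetical):
#   zsh ./make-dub-list.zsh example.com dubs.html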
[[ $# -ge 1 ]] && websitename=$1 || read "websitename?Enter Website Name: "
[[ $# -eq 2 ]] && finalname=$2 || read "finalname?Enter Final Filename: "
[[ $# -gt 2 ]] && { echo "too many arguments, use --help if need be"; exit 2; }
echo "Website: $websitename"
echo "Final Output: $finalname"
echo "press ENTER to proceed, CTRL+C to abort"
read -sk1
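# NOTE: the upper bound below (105, i.e. pages 1 through 104) is hard-coded; adjust it to
# however many search-result pages the site currently has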
for ((x=1;x<105;x++)); do
echo "getting page $x ...";
# kept using '-O- >' instead of '-O file' since I have verified that this works in both Linux and the Windows Subsystem without any problems
wget 'https://'${websitename}'/search.html?keyword=(Dub)&page='$x -O- > animedubs-page${x}.html
done
# see TODO above, you might want to pick that out
# for each of the previously retrieved pages, the loop below:
# - merges all lines into a single line, then
# - grabs all text from the first start-of-anchor tag -> <a href="/videos <- encountered
#   in the file to the very LAST /a> (end-of-anchor tag) in the file
# - next, splits the text up so only ONE tag is on each line
# - next, grabs ONLY the lines that are the image tag or the video links
# - next, replaces the image tag with an anchor end instead (you might want to change
#   this to keep the images)
for ((x=1;x<105;x++)); do
cat animedubs-page${x}.html |
tr -d '\n' |
grep -o '<a href="/videos.*/a>' |
grep -oP '<.*?>' |
grep -P '<(img src="https://cdnimg\.|a href="/videos)' |
sed -r 's/<img src.*alt="//g;s/" \/>/<\/a>/g' > episodes-page${x}.html
done
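# at this point each episodes-page file should hold alternating lines roughly like the
# following (title and path are made up for illustration):
#   <a href="/videos/some-show-episode-12" title="Some Show (Dub)">
#   Some Show (Dub)</a>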
# combines ALL results into a single page
{
echo '<html>
<head><title>Gogo-Play Anime Dubs ALL OF THEM</title>
<style>
:root
{
grid-area:5;
}
img
{
/* for image margins that will be showing on the edges (about 10px, 6px min on the corners) */
background: linear-gradient(12deg,#453d3db3,#2525287a);
padding: 10px;
margin: 5px;
border-radius: 4px;
width: 2in;
height: 2.75in;
}
a
{
/* set this to whatever font you like in the browser */
font-family: "Source Code Pro";
font-size: 9pt;
display: inline-grid;
text-decoration: none;
text-wrap: normal;
padding: 5px;
margin: 5px;
border-radius: 5px;
}
content a:nth-child(even)
{
background-color: rgba(12,24,36,1.0);
grid-column: 1;
}
content a:nth-child(odd)
{
background-color: rgba(36,24,12,1.0);
grid-column: 0;
}
</style>
</head>
<body>
<header>
Anime Dubs (All)
</header>
<content>
<base href="https://www.'"${websitename}"'"/>'
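# the <base> tag above lets the relative /videos/... links resolve against the live site;
# the loop below glues each anchor line to the title line that follows it, so every entry
# ends up on a single line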
for ((x=1;x<105;x++)); do
cat "episodes-page${x}.html" |
while read line1; do
read line2
echo "$line1$line2";
done
done
echo '</content>
<footer>
</footer>
</body>
</html>'
} > episodes.html
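# rewriting every episode number to episode-1 collapses all episodes of a series onto the
# same line, so that sort|uniq leaves exactly one entry per show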
cat episodes.html | perl -wpe 's/episode-([0-9]+)/episode-1/g' | sort | uniq > unique.html
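# NOTE: $r is never set anywhere in this script; it was presumably meant to hold the image
# CDN base URL (see the TODO at the top). With $r empty the img src comes out as a bare
# "<title>.png", which is why some covers may appear broken.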
cat unique.html | perl -wpe 's#(?<=videos/)(.*)(-episode-1)#\1-episode-1"><img src="'$r'\1.png#g' > $finalname
rm unique.html
rm episodes-page*.html
rm episodes.html
rm animedubs*.html
echo "finished, generated $finalname"

osirisgothra commented Jul 7, 2022

I created this for testing and (text) archiving purposes only. I have not fully tested it as a whole yet; for the time being I have just saved the commands I used on the command line to create such a page. I thought I would save them into a script in case someone finds them useful for tests or experiments they might be running against such an archive. Not to be used for the actual transfer of copyrighted materials! You have been warned!

All this script really does is inspect the website's search results pages, extract destination anchor and image reference pairs from them, and save the anchors to a local "cache" file where the list can be stored. The images and link destinations themselves are NOT saved or inspected at all: no art, images, or videos are downloaded, and the content behind the links is never touched. For this reason, the resulting page is useless offline, as none of the image data is actually saved.

So, in short, it is just collecting links, using linked images and titles as the link anchor.
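For illustration only (the title and path are made up), each collected entry ends up as a single line roughly of the form: <a href="/videos/some-show-episode-1"><img src="Some Show.png">Some Show (Dub)</a>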
