Created
November 8, 2023 20:31
-
-
Save onesixromcom/0e2a16da86096c4f6c4af4a7834dd57d to your computer and use it in GitHub Desktop.
readukrainianbooks.com downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# store arguments in a special array | |
args=("$@") | |
URL=${args[0]} | |
# Check if link to page is present. | |
if [ -z "$URL" ]; then | |
echo "No url supplied. Please set collection name. (ex: https://readukrainianbooks.com/6340-faktor-cherchillja-jak-odna-ljudina-zminila-istoriju-boris-dzhonson.html)" | |
exit | |
fi | |
get_book_text() | |
{ | |
cat $1 | | |
hxnormalize -x -e -s | | |
hxselect -i div#texts | | |
sed -e 's/<div.*>//g' -e 's/<\/div>//g' | | |
sed -e ':a;N;$!ba;s/\n//g' | | |
sed -e $'s/ / /g' | |
} | |
get_book_text_online() | |
{ | |
echo $1 | | |
wget -O- -i- --no-verbose --quiet | | |
hxnormalize -x -e -s | | |
hxselect -i div#texts | | |
sed -e 's/<div.*>//g' -e 's/<\/div>//g' | | |
sed -e ':a;N;$!ba;s/\n//g' | | |
sed -e $'s/ / /g' | |
} | |
get_book_title() | |
{ | |
cat $1 | | |
hxnormalize -x -e -s | | |
hxselect -i h1.short-title | | |
sed -e 's/<[^>]*>//g' | | |
sed -e 's/Читати книгу - //g' | | |
sed -e 's/"//g' | | |
sed -e ':a;N;$!ba;s/\n//g' | | |
sed -e $'s/ / /g' | |
} | |
get_book_description() | |
{ | |
cat $1 | | |
hxnormalize -x -e -s | | |
hxselect -i div.shortstory-text | | |
sed -e 's/<[^>]*>//g' | | |
sed -e ':a;N;$!ba;s/\n//g' | | |
sed -e $'s/ / /g' | |
} | |
get_book_author() | |
{ | |
cat $1 | | |
hxnormalize -x -e -s | | |
hxselect -i span.fa.fa-pencil+a | | |
sed -e 's/<[^>]*>//g' -e 's/\r//' | |
} | |
get_book_genre() | |
{ | |
cat $1 | | |
hxnormalize -x -e -s | | |
hxselect -i span.fa.fa-book+a | | |
sed -e 's/<[^>]*>//g' | | |
sed -e ':a;N;$!ba;s/\n//g' | | |
sed -e $'s/ / /g' | |
} | |
get_book_image_url() | |
{ | |
cat $1 | | |
hxnormalize -x -e -s | | |
hxselect -i ".fimg.img-wide img" | | |
sed -e "s/.* data-src=\"\(.*\)\".*/\1/" | |
} | |
get_pages_list() | |
{ | |
cat $1 | | |
hxnormalize -x -e -s | | |
hxselect -i div.storenumber.ftext div.to-page | | |
sed 's/<option/<a/g' | #replacements to make hxwls work | |
sed 's/option>/a>/g' | #replacements to make hxwls work | |
sed 's/value=/href=/g' | #replacements to make hxwls work | |
hxwls | |
} | |
write_fb2_header() | |
{ | |
if [ -f "$FILENAME" ]; then | |
rm "$FILENAME" | |
fi | |
touch "$FILENAME"; | |
echo '<?xml version="1.0" encoding="utf-8"?><FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">' > "$FILENAME"; | |
} | |
write_fb2_footer() | |
{ | |
echo '</section></body>' >> "$FILENAME"; | |
if [ ! -z "$IMG" ]; then | |
IMG="https://readukrainianbooks.com$IMG" | |
echo '<binary id="cover.jpg" content-type="image/jpeg">' >> "$FILENAME"; | |
wget -O ./cover.jpg --no-verbose --quiet $IMG | |
base64 ./cover.jpg >> "$FILENAME"; | |
echo '</binary>' >> "$FILENAME"; | |
fi | |
echo '</FictionBook>' >> "$FILENAME"; | |
} | |
write_fb2_description() | |
{ | |
echo '<description><title-info>' >> "$FILENAME"; | |
echo "<book-title>$1</book-title>" >> "$FILENAME"; | |
echo "<genre>$3</genre>" >> "$FILENAME"; | |
echo "<author><first-name>$2</first-name><middle-name></middle-name><last-name></last-name></author>" >> "$FILENAME"; | |
echo '<from>Downloaded from readukrainianbooks.com. Converted by 16rom.com</from>' >> "$FILENAME"; | |
echo "<annotation><p>$4</p></annotation>" >> "$FILENAME"; | |
if [ ! -z "$IMG" ]; then | |
echo '<coverpage><image l:href="#cover.jpg"></image></coverpage>' >> "$FILENAME"; | |
fi | |
echo '<lang>ua</lang></title-info></description><body xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink"><section>' >> "$FILENAME"; | |
} | |
write_fb2_text() | |
{ | |
echo "$1" >> "$FILENAME"; | |
} | |
#================== START ================== | |
echo "readukrainianbooks.com downloader is starting..." | |
FILENAME=$(echo "$URL" | sed "s/https:\/\/readukrainianbooks.com\///" | sed 's/.html.*//' | sed -e 's/^[0-9]\+-*//g').fb2 | |
PAGE_HTML="./page1.html" | |
if [ -f "$PAGE_HTML" ]; then rm "$PAGE_HTML" | |
fi | |
if [ -f "./cover.jpg" ]; then rm "./cover.jpg" | |
fi | |
# Download and save file to process | |
wget -O "$PAGE_HTML" --no-verbose --quiet $URL | |
echo $FILENAME | |
IMG=$(get_book_image_url $PAGE_HTML) | |
PAGES_URLS=($(get_pages_list $PAGE_HTML)) | |
if [ -z "$PAGES_URLS" ]; then | |
echo "No pages were found! exit"; | |
exit; | |
fi | |
BOOK_TITLE=$(get_book_title $PAGE_HTML) | |
BOOK_AUTHOR=$(get_book_author $PAGE_HTML) | |
BOOK_GENRE=$(get_book_genre $PAGE_HTML) | |
BOOK_DESCRIPTION=$(get_book_description $PAGE_HTML) | |
write_fb2_header | |
write_fb2_description "$BOOK_TITLE" "$BOOK_AUTHOR" "$BOOK_GENRE" "$BOOK_DESCRIPTION" | |
echo "Processing ${#PAGES_URLS[@]} pages." | |
for page_url in "${PAGES_URLS[@]}"; | |
do | |
echo "Process: $page_url" | |
BOOK_TEXT=$(get_book_text_online $page_url) | |
write_fb2_text "$BOOK_TEXT" | |
done | |
write_fb2_footer | |
echo "readukrainianbooks.com downloader finished." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment