#!/bin/bash
#
# Copyright 2019 Yulio Aleman Jimenez (@yulioaj290)
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
# THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# WEBSITE_DOWNLOADER.sh
# This script downloads an entire website and finally compresses all of its files into one 7zip package.
#

# Print the start-up banner: one array element per output line, emitted by a
# single printf call instead of one echo per line.
BANNER_RULE="--------------------------------------------------"
BANNER_LINES=(
  "$BANNER_RULE"
  "| |"
  "| WEBSITE DOWNLOADER |"
  "| |"
  "$BANNER_RULE"
  "| Yulio Aleman Jimenez |"
  "| @yulioaj290 |"
  "$BANNER_RULE"
)
printf '%s\n' "${BANNER_LINES[@]}"

# ---------------------------------------------------------------------------
# Download a website with wget, pack it into "<domain>.7z" and remove the
# downloaded directories once the archive has been written.
#
# Usage: bash WEBSITE_DOWNLOADER.sh <website-url> [domain[,domain...]]
#
#   $1  Complete URL to start from, e.g. https://google.com/docs/get-started
#   $2  Optional comma-separated list of additional domains wget may span
#       to, e.g. bootstrapcdn.com,otherdomain.com
#
# Exit status:
#   0    archive built and downloaded directories removed
#   1    nothing was downloaded, or the archive could not be built
#   127  wget or 7za is not installed
#   128  missing URL, or no domain could be extracted from it
# ---------------------------------------------------------------------------

WEBSITE_URL=${1:-}        # https://google.com/docs/get-started
EXTRA_DOMAINS=${2:-}      # bootstrapcdn.com,otherdomain.com

# Validate the arguments before deriving anything from them.
if [[ -z "$WEBSITE_URL" ]]; then
  echo "[ ERROR ]: You must to provide the complete 'WEBSITE URL', with protocol [ http | https | ftp ]." >&2
  echo " --> SYNTAX EXAMPLE: bash WEBSITE_DOWNLOADER.sh https://google.com/docs/get-started google.com,bootstrapcdn.com,otherdomain.com" >&2
  exit 128
fi

# Fail early, before a potentially long download, if a required tool is missing.
for REQUIRED_TOOL in wget 7za; do
  if ! command -v "$REQUIRED_TOOL" > /dev/null 2>&1; then
    echo "[ ERROR ]: Required tool '$REQUIRED_TOOL' was not found in PATH." >&2
    exit 127
  fi
done

# Extract the host name from the URL with parameter expansion only:
#   https://user@google.com:8080/docs?x=1  ->  google.com
WEBSITE_DOMAIN=${WEBSITE_URL#*://}          # drop the scheme, if any
WEBSITE_DOMAIN=${WEBSITE_DOMAIN%%[/?#]*}    # drop path, query and fragment
WEBSITE_DOMAIN=${WEBSITE_DOMAIN##*@}        # drop user-info
WEBSITE_DOMAIN=${WEBSITE_DOMAIN%%:*}        # drop the port

# An empty domain would make 7za archive the whole working directory
# (no file argument), so refuse to continue.
if [[ -z "$WEBSITE_DOMAIN" ]]; then
  echo "[ ERROR ]: Could not extract a domain from '$WEBSITE_URL'." >&2
  exit 128
fi

# google.com,bootstrapcdn.com,otherdomain.com
# (no trailing comma when no extra domains were given)
WEBSITE_DOMAIN_LIST=$WEBSITE_DOMAIN
if [[ -n "$EXTRA_DOMAINS" ]]; then
  WEBSITE_DOMAIN_LIST+=",$EXTRA_DOMAINS"
fi

echo "--------------------------------------------------"
echo "[ Website Domain ]: $WEBSITE_DOMAIN"
echo "[ Website URL ]: $WEBSITE_URL"
echo "--------------------------------------------------"

echo "Downloading website ........."
echo "--------------------------------------------------"

wget --quiet --recursive --no-clobber --page-requisites --adjust-extension \
  --convert-links --restrict-file-names=windows --span-hosts --no-parent \
  --level=inf --domains="$WEBSITE_DOMAIN_LIST" -- "$WEBSITE_URL"
WGET_STATUS=$?

# wget exits non-zero when any single request got an error response (for
# example one broken link), so a non-zero status alone is not fatal; what
# matters is whether the site directory was actually created.
if [[ ! -d "$WEBSITE_DOMAIN" ]]; then
  echo "[ ERROR ]: Nothing was downloaded for \"$WEBSITE_DOMAIN\" (wget exit status $WGET_STATUS)." >&2
  exit 1
fi
if (( WGET_STATUS != 0 )); then
  echo "[ WARNING ]: wget finished with exit status $WGET_STATUS; the download may be incomplete." >&2
fi

# Collect the per-domain directories that wget created. Only plain directory
# names are accepted, so the user-supplied list can never make the archive or
# cleanup steps touch a path outside the working directory.
IFS=',' read -r -a DOMAIN_DIRS <<< "$WEBSITE_DOMAIN_LIST"
DOWNLOADED_DIRS=()
for DOMAIN_DIR in "${DOMAIN_DIRS[@]}"; do
  case "$DOMAIN_DIR" in
    '' | . | .. | */*) continue ;;
  esac
  if [[ -d "$DOMAIN_DIR" ]]; then
    DOWNLOADED_DIRS+=("$DOMAIN_DIR")
  fi
done

echo "Building compressed file for \"$WEBSITE_DOMAIN\" ..."
echo "--------------------------------------------------"

# Every downloaded domain directory goes into the archive: the converted links
# of the main site point at the spanned domains, and those directories are
# removed below.
ARCHIVE_FILE="$WEBSITE_DOMAIN.7z"
if ! 7za a -t7z -m0=lzma -mx=9 -ms=on -- "$ARCHIVE_FILE" "${DOWNLOADED_DIRS[@]}" > /dev/null; then
  echo "[ ERROR ]: Could not build \"$ARCHIVE_FILE\"; the downloaded files were left in place." >&2
  exit 1
fi

# Size of the archive in KiB (du -k), rounded up to whole MB.
FILE_WEIGHT=$(du -k -- "$ARCHIVE_FILE" | cut -f1)
WEIGHT_IN_MB=$(( (${FILE_WEIGHT:-0} + 1023) / 1024 ))

echo "[ File Weight ]: $WEBSITE_DOMAIN $WEIGHT_IN_MB"" MB"
echo "--------------------------------------------------"
echo "Removing original files from directory \"$WEBSITE_DOMAIN\" ..."
echo "--------------------------------------------------"

# Reached only after the archive was written successfully.
rm -rf -- "${DOWNLOADED_DIRS[@]}"

echo "[ INFO ]: Website downloaded successfully!!!"
echo "[ INFO ]: Compressed file $WEBSITE_DOMAIN.7z !!!"
echo "--------------------------------------------------"

# The options passed to WGET are:
# --quiet: quiet mode (no output).
# --recursive: download the entire Web site.
# --domains website.org: don't follow links outside website.org.
# --no-parent: never ascend above the directory of the start URL (e.g. stay inside /docs/ for https://google.com/docs/get-started).
# --page-requisites: get all the elements that compose the page (images, CSS and so on).
# --adjust-extension: save files with the best extension.
# --convert-links: convert links so that they work locally, off-line.
# --restrict-file-names=windows: modify filenames so that they will work in Windows as well.
# --no-clobber: don't overwrite any existing files (used in case the download is interrupted and resumed).
# --span-hosts: allow following links to other hosts; together with "--domains" only the listed domains are visited.
# --domains=<domain-list>: comma-separated domain list to span or retrieve resources from (the site's own domain plus the optional second argument).
# --user-agent=<agent-string>: user agent to simulate, like "Mozilla" (optional; not passed by this script).
# --level=inf: no limit on the recursion depth.