Website Downloader

Bash script for downloading entire websites and compressing them into a 7z file.

You must pass one required argument: the URL of the website to download. Optionally, a second argument may provide a comma-separated list of additional domains from which to fetch resources.

This script is capable of downloading all web pages belonging to the website you want. It also fetches the external resources necessary to properly view the website offline in a browser, and it respects the parent directory you specify in the URL (it never ascends above it).

Requirements

  • p7zip-full 16.02 or higher.
  • wget (preinstalled on most Ubuntu systems).

To prepare the environment for the script, follow these steps (based on Ubuntu 14.04):

  1. (Optional) Update your Linux repositories and upgrade all packages.
$ sudo apt-get update
$ sudo apt-get upgrade
  2. Install p7zip-full on your system; it is necessary to compress the downloaded files (and, optionally, to split the archive into chunks).
$ sudo apt-get install p7zip-full
  3. Execute the script, as shown below.
$ bash WEBSITE_DOWNLOADER.sh https://google.com/docs/get-started
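
You can also pass the optional second argument: a comma-separated list of extra domains to span when fetching resources (the domain of the URL itself is always included automatically). For example, assuming the site loads assets from bootstrapcdn.com:

$ bash WEBSITE_DOWNLOADER.sh https://google.com/docs/get-started bootstrapcdn.com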
#!/bin/bash
#
# Copyright 2019 Yulio Aleman Jimenez (@yulioaj290)
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
# THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# WEBSITE_DOWNLOADER.sh
# This script downloads an entire website, and finally compresses all files into one 7-Zip package.
#
echo "--------------------------------------------------"
echo "| |"
echo "| WEBSITE DOWNLOADER |"
echo "| |"
echo "--------------------------------------------------"
echo "| Yulio Aleman Jimenez |"
echo "| @yulioaj290 |"
echo "--------------------------------------------------"
WEBSITE_URL="$1" # https://google.com/docs/get-started
# Split on '/' and ':' so the domain is the 4th field, e.g. "https", "", "", "google.com", ...
WEBSITE_DOMAIN="$(echo "$WEBSITE_URL" | awk -F'[/:]' '{print $4}')" # google.com
# WEBSITE_DOMAIN="$(echo "$WEBSITE_URL" | cut -d'/' -f3)" # alternative: google.com
# Append ",$2" only if a second argument was given.
WEBSITE_DOMAIN_LIST="$WEBSITE_DOMAIN${2:+,$2}" # google.com,bootstrapcdn.com,otherdomain.com
if [[ -z "$WEBSITE_URL" ]]; then
    echo "[ ERROR ]: You must provide the complete 'WEBSITE URL', including the protocol [ http | https | ftp ]."
    echo " --> SYNTAX EXAMPLE: bash WEBSITE_DOWNLOADER.sh https://google.com/docs/get-started google.com,bootstrapcdn.com,otherdomain.com"
    exit 128
fi
echo "--------------------------------------------------"
echo "[ Website Domain ]: $WEBSITE_DOMAIN"
echo "[ Website URL ]: $WEBSITE_URL"
echo "--------------------------------------------------"
echo "Downloading website ........."
echo "--------------------------------------------------"
wget --quiet --recursive --no-clobber --page-requisites --adjust-extension --convert-links --restrict-file-names=windows --span-hosts --no-parent --level=inf --domains="$WEBSITE_DOMAIN_LIST" "$WEBSITE_URL"
echo "Building compressed file for \"$WEBSITE_DOMAIN\" ..."
echo "--------------------------------------------------"
# Compress the downloaded directory: -t7z (7z format), -m0=lzma (LZMA method), -mx=9 (maximum compression), -ms=on (solid archive).
7za a -t7z -m0=lzma -mx=9 -ms=on "$WEBSITE_DOMAIN.7z" "$WEBSITE_DOMAIN" > /dev/null
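# NOTE: to also divide the archive into chunks, 7za accepts a volume switch.
# A sketch, assuming 100 MB chunks are desired (produces .7z.001, .7z.002, ...):
# 7za a -t7z -m0=lzma -mx=9 -ms=on -v100m "$WEBSITE_DOMAIN.7z" "$WEBSITE_DOMAIN" > /dev/null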
FILE_WEIGHT="$(echo `du $FILE_NAME` | cut -d' ' -f1)"
WEIGHT_IN_MB="$(( ($FILE_WEIGHT + (1024 - 1) ) / 1024 ))"
echo "[ File Weight ]: $WEBSITE_DOMAIN $WEIGHT_IN_MB"" MB"
echo "--------------------------------------------------"
echo "Removing original files from directory \"$WEBSITE_DOMAIN\" ..."
echo "--------------------------------------------------"
# Replace commas with spaces so "rm" receives each downloaded domain directory as a separate argument.
WEBSITE_DOMAIN_LIST_REMOVE="$(echo "$WEBSITE_DOMAIN_LIST" | sed -e 's/,/ /g')"
rm -rf $WEBSITE_DOMAIN_LIST_REMOVE # intentionally unquoted to allow word splitting
echo "[ INFO ]: Website downloaded successfully!!!"
echo "[ INFO ]: Compressed file $WEBSITE_DOMAIN.7z !!!"
echo "--------------------------------------------------"
# The wget options used above are:
# --quiet: quiet mode.
# --recursive: download the entire website.
# --domains=<domain-list>: don't follow links outside the listed domains.
# --no-parent: don't follow links above the parent directory given in the URL.
# --page-requisites: get all the elements that compose the page (images, CSS, and so on).
# --adjust-extension: save files with the best-matching extension.
# --convert-links: convert links so that they work locally, offline.
# --restrict-file-names=windows: modify filenames so that they will work on Windows as well.
# --no-clobber: don't overwrite any existing files (useful in case the download is interrupted and resumed).
# --span-hosts: include resources from the domains provided in "--domains".
# --level=inf: unlimited recursion depth.
#
# Another useful option (not used here):
# --user-agent=<agent-string>: user agent to present, like "Mozilla".
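#
# To unpack the resulting archive later, use 7za's extract command,
# e.g. (assuming the google.com example above):
# 7za x google.com.7z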