@knbknb
Forked from bueckl/wget
Last active April 14, 2022 13:54
wget examples - from labnol.org
###### Spider Websites with Wget: 20 Practical Examples
# original, from 2019:
# https://www.labnol.org/software/wget-command-examples/28750/
# wget is extremely powerful, but like with most other command line programs,
# the plethora of options it supports can be intimidating to new users.
# Thus what we have here are a collection of wget commands that you can use
# to accomplish common tasks from downloading single files to mirroring entire websites.
# It will help if you can read through the wget manual but for the busy souls,
# these commands are ready to execute.
# wget works best for sites comprising only static HTML pages.
# For sites that implement all intra-site linking with JavaScript code,
# or for JavaScript-only sites, wget will perform POORLY.
################################################
# Single page download
################################################
# 01. Get a single file from the Internet
wget --no-config <URL>
# 02. Get file but save it locally under a different name
wget --no-config --output-document=filename.html example.com
# 03. Get file and save it in a specific folder
wget --no-config --directory-prefix=folder/subfolder example.com
# 04. Resume an interrupted download previously started by wget itself
wget --no-config --continue example.com/big.file.iso
# 05. Get file but only if the version on server is newer than your local copy
wget --no-config --continue --timestamping wordpress.org/latest.zip
################################################
## data-driven download of single files/URLs
################################################
# 06. Get multiple URLs listed in a text file, one URL per line.
wget --no-config --input-file=list-of-file-urls.txt
# 07. Get a list of sequentially numbered files from a server
#     (the shell's brace expansion generates the individual URLs)
wget --no-config example.com/images/photo{1..20}.jpg
# 08. Get a web page with all the assets (stylesheets, inline images, etc.) required to properly display it offline.
wget --no-config --page-requisites --span-hosts --convert-links --adjust-extension <URL>
################################################
## Mirror websites with wget
################################################
# 09. Get an entire website, including all linked pages and files
wget --no-config --execute robots=off --no-check-certificate --recursive --no-parent --continue --no-clobber <URL>
# 10. Get all the MP3 files from a subdirectory
wget --no-config --level=1 --recursive --no-parent --accept mp3,MP3 <URL>
# 11. Get all images from a website into a common folder
wget --no-config --directory-prefix=files/pictures --no-directories --recursive --no-clobber --accept jpg,gif,png,jpeg <URL>
# 12. Get PDFs through recursion but stay within specific domains.
wget --no-config --mirror --domains=<domain1,domain2...> --accept=pdf <URL>
# 13. Get all files from a website but exclude a few directories.
wget --no-config --recursive --no-clobber --no-parent --exclude-directories=/forums,/support <URL>
################################################
## wget for Downloading Restricted Content
################################################
## wget can be used for downloading content from sites that are behind a login screen
## or ones that check for the HTTP referer and the User Agent strings of the bot
## to prevent screen scraping.
# 14. Download files from websites that check the User Agent and the HTTP Referer
wget --no-config --referer=<URL> --user-agent="Mozilla/5.0 Firefox/4.0.1" <URL>
# 15. Download files from a password-protected site
wget --no-config --http-user=<user> --http-password=<password> <URL>
# 16. Fetch pages that are behind a login page.
#     Replace <user> and <password> with the actual form field names and values;
#     the first URL should point to the form's submit (action) page.
wget --no-config --cookies=on --save-cookies cookies.txt --keep-session-cookies --post-data 'user=<user>&password=<password>' <URL>
wget --no-config --cookies=on --load-cookies cookies.txt --keep-session-cookies <URL>
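#     (A rough way to discover the actual field names, assuming the login form
#      lives at <login-page-URL>, which is just a placeholder here: dump the
#      page to stdout and inspect its <form> and <input> tags)
wget --no-config --quiet --output-document=- <login-page-URL> | grep -i -E '<form|<input'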
################################################
## Retrieve File Details with wget
################################################
# 17. Find the size of a file without downloading it
#     (look for Content-Length in the response; the size is in bytes)
wget --no-config --spider --server-response <URL> 2>&1 | grep -i "Content-Length"
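#     (Optional follow-up, as a sketch: print just the byte count; this assumes
#      the server actually sends a Content-Length header for the URL)
wget --no-config --spider --server-response <URL> 2>&1 | awk 'tolower($1) ~ /content-length:/ {print $2}'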
# 18. Get a file and display its content on screen without saving it locally.
wget --no-config --output-document - --quiet <URL>
# 19. Know the last modified date of a web page (check the Last-Modified header in the HTTP response).
wget --no-config --quiet --server-response --spider <URL> 2>&1 | grep -C 3 -i -P '(?<=modified:) .*$'
# The --spider option means wget will not save the pages locally.
# 20. Check links on a website to ensure that they are working. No files are saved.
wget --no-config --output-file=logfile.txt --recursive --spider <URL>
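#     (A possible follow-up, purely as a heuristic: once the crawl has finished,
#      pull the failing URLs and their context out of the log)
grep -B 2 -i 'broken link\|404 not found' logfile.txt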

knbknb commented Mar 2, 2021

wget – How to be nice to the server?

The wget tool is essentially a spider that scrapes / leeches web pages but some web hosts may block these spiders with the robots.txt file. Also, wget will not follow links on web pages that use the rel=nofollow attribute.

You can however force wget to ignore the robots.txt and the nofollow directives by adding the switch --execute robots=off to all your wget commands.
If a web host is blocking wget requests by looking at the User-Agent string, you can always fake that with the --user-agent=Mozilla switch.
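
For example, here is a sketch that combines both workarounds (the User-Agent string below is just an illustrative value, not a recommendation):

wget --execute robots=off --user-agent="Mozilla/5.0 (X11; Linux x86_64; rv:115.0) Gecko/20100101 Firefox/115.0" --recursive --no-parent example.com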

The wget command will put additional strain on the site's server because it will continuously traverse the links and download files. A good scraper would therefore limit the retrieval rate and also include a wait period between consecutive fetch requests to reduce the server load.

wget --limit-rate=20k --wait=60 --random-wait --mirror example.com

In the above example, we have limited the download bandwidth to 20 KB/s, and the wget utility will wait anywhere between 30 and 90 seconds before retrieving the next resource.
