@oliveratgithub
Last active November 23, 2023 01:49
Unix shell script to crawl a list of website URLs using curl
#!/bin/bash
timezone="Europe/Zurich"
# List of valid timezones: wikipedia.org/wiki/List_of_tz_database_time_zones
script="${0##*/}"
rootdir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
logfile="$script.log"
log="$rootdir/$logfile"
now=$(TZ=":$timezone" date)
# Uncomment 'mailto=' (remove #) to enable emailing the log upon completion
#mailto="your@email.com"
mailsubj="$script log from $now"
logging() {
    now=$(TZ=":$timezone" date)
    if [[ -z "$1" || -z "$2" ]]; then
        printf '%s [ERROR] Nothing to log. Use:\nlogging <level> <result>\n' "$now" >&2
        exit 2
    else
        echo "$now [$1] $2" >> "$log"
    fi
}
if [[ -z "$1" ]]; then
    printf '%s [ERROR] Missing file input. Use:\n%s /path/to/urls.txt\n' "$now" "$rootdir/$script" >&2
    exit 2
else
    input="$1"
fi
logging "INFO" "Reading file: $input"
while IFS= read -r line; do
    logging "INFO" "Crawling URL: $line"
    curlstart=$(date +"%s")
    curlresult=$(curl -sSL -w '%{http_code} %{url_effective}' "$line" -o /dev/null)
    # curl parameters: -s = silent; -S = show errors; -L = follow redirects; -w = custom output format; -o = discard the page body
    logging "INFO" "$curlresult"
    curldone=$(date +"%s")
    difftime=$((curldone - curlstart))
    # Format elapsed time as hours:minutes:seconds
    logging "INFO" "Crawl-time: $((difftime / 3600)):$(((difftime % 3600) / 60)):$((difftime % 60))"
done < "$input"
logging "INFO" "Done reading file: $input"
if [[ -n "$mailto" && "$mailto" != " " ]]; then
    logging "INFO" "Sending Email to: $mailto"
    # Using the postfix mail command to email the logfile contents
    mail -s "$mailsubj" "$mailto" < "$log"
fi
exit 0
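
A minimal usage sketch, assuming the script above is saved as curl-crawler.sh next to the URL list (the log output below refers to it as curl-crawler; the exact filenames are assumptions):

# Make the script executable and run it against a plain-text list of URLs (one per line)
chmod +x curl-crawler.sh
./curl-crawler.sh ./urls.txt
# The log is written next to the script, e.g. curl-crawler.sh.log
cat curl-crawler.sh.log

The urls.txt read in this example: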
https://www.apple.com/
https://wikipedia.org
https://swissmacuser.ch/
https://twitter.com/swissmacuser
# This is an example output generated by curl-crawler
Sun Feb 19 21:56:07 CET 2017 [INFO] Reading file: ./urls.txt
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://www.apple.com/
Sun Feb 19 21:56:07 CET 2017 [INFO] 200 https://www.apple.com/
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawl-time: 0:0:0
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://wikipedia.org
Sun Feb 19 21:56:07 CET 2017 [INFO] 200 https://www.wikipedia.org/
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawl-time: 0:0:0
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://swissmacuser.ch/
Sun Feb 19 21:56:08 CET 2017 [INFO] 200 https://swissmacuser.ch/
Sun Feb 19 21:56:08 CET 2017 [INFO] Crawl-time: 0:0:1
Sun Feb 19 21:56:08 CET 2017 [INFO] Crawling URL: https://twitter.com/swissmacuser
Sun Feb 19 21:56:09 CET 2017 [INFO] 200 https://twitter.com/swissmacuser
Sun Feb 19 21:56:09 CET 2017 [INFO] Crawl-time: 0:0:1
Sun Feb 19 21:56:09 CET 2017 [INFO] Done reading file: ./urls.txt
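
To scan the resulting log for URLs that did not come back with HTTP 200, a simple grep over the log format shown above should do (a sketch; curl-crawler.sh.log is the assumed log filename):

# Keep only the curl result lines (status code + effective URL), then drop the 200s
grep -E '\[INFO\] [0-9]{3} ' curl-crawler.sh.log | grep -v ' 200 '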