Last active
October 24, 2024 06:06
-
-
Save oliveratgithub/18d2daa087d9aa3a5ac3f7b2df7da30a to your computer and use it in GitHub Desktop.
Unix Shell-Script to crawl a list of website URLs using curl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Crawl a list of website URLs with curl and append the HTTP status,
# effective URL and elapsed time of every request to a log file that
# lives next to this script.
#
# Usage: ./<this-script> /path/to/urls.txt

# Timezone used for every timestamp written to the log.
timezone="Europe/Zurich"
# List of valid timezones: wikipedia.org/wiki/List_of_tz_database_time_zones

# Script file name without any leading directories.
script="${0##*/}"

# Absolute directory containing this script. CDPATH is cleared for the cd so
# that cd cannot print the resolved directory and pollute the captured output.
rootdir=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd) || {
  printf '%s: cannot determine script directory\n' "$script" >&2
  exit 1
}

# The log is written to "<script-name>.log" in the script's own directory.
logfile="$script.log"
log="$rootdir/$logfile"

# Start timestamp; used in early error messages and in the mail subject.
now=$(TZ=":$timezone" date)

# Uncomment 'mailto=' (remove #) to enable emailing the log upon completion
#mailto="your@email.com"
mailsubj="$script log from $now"
#######################################
# Append one timestamped entry to the log file.
# Globals:   timezone (read), log (read)
# Arguments: $1 - log level, e.g. INFO or ERROR
#            $2 - message text
# Outputs:   appends "<date> [<level>] <message>" to $log;
#            on misuse, writes a usage hint to stderr
# Returns:   exits the shell with status 2 when either argument is empty
#######################################
logging() {
  # Keep the timestamp local so the script-level $now is not overwritten.
  local now
  now=$(TZ=":$timezone" date)
  if [ -z "$1" ] || [ -z "$2" ]; then
    # printf is used because bash's echo would print the "\n" literally.
    printf '%s [ERROR] Nothing to log. Use:\nlogging <level> <result>\n' "$now" >&2
    exit 2
  fi
  # $log is quoted so that a log path containing spaces still works.
  printf '%s [%s] %s\n' "$now" "$1" "$2" >> "$log"
}
# The URL list must be given as the first argument.
if [ -z "$1" ]; then
  # printf is used because bash's echo would print the "\n" literally.
  printf '%s [ERROR] Missing file input. Use:\n%s/%s /path/to/urls.txt\n' \
    "$now" "$rootdir" "$script" >&2
  exit 2
fi
input="$1"

# Fail early with a clear message instead of logging an empty crawl.
if [ ! -r "$input" ]; then
  printf '%s [ERROR] Cannot read file: %s\n' "$now" "$input" >&2
  exit 2
fi

logging "INFO" "Reading file: $input"

# Read one URL per line. The file is redirected into the loop (no pipeline
# subshell), -r keeps backslashes intact, and the "|| [ -n ... ]" clause still
# handles a final line that has no trailing newline. The default IFS is kept
# on purpose so surrounding whitespace is trimmed from each URL.
while read -r line || [ -n "$line" ]; do
  # Skip blank lines instead of invoking curl with an empty URL.
  [ -n "$line" ] || continue

  logging "INFO" "Crawling URL: $line"
  curlstart=$(date +"%s")
  # curl parameters: -sS = silent but show errors; -L = follow redirects;
  # -w = custom output format; -o = trash output; --url = the URL to fetch
  curlresult=$(curl -sSL -w '%{http_code} %{url_effective}' -o /dev/null --url "$line")
  # Take the end timestamp right after curl so logging time is not counted.
  curldone=$(date +"%s")
  logging "INFO" "$curlresult"

  difftime=$((curldone - curlstart))
  # hours : minutes within the hour : seconds within the minute
  logging "INFO" "Crawl-time: $((difftime / 3600)):$((difftime % 3600 / 60)):$((difftime % 60))"
done < "$input"

logging "INFO" "Done reading file: $input"

# Email the log only when a recipient has been configured.
if [ -n "$mailto" ] && [ "$mailto" != " " ]; then
  logging "INFO" "Sending Email to: $mailto"
  # Using postfix mail command to email the logfile contents
  mail -s "$mailsubj" "$mailto" < "$log"
fi
exit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
https://www.apple.com/
https://wikipedia.org
https://swissmacuser.ch/
https://twitter.com/swissmacuser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is an example output generated by curl-crawler
Sun Feb 19 21:56:07 CET 2017 [INFO] Reading file: ./urls.txt
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://www.apple.com/
Sun Feb 19 21:56:07 CET 2017 [INFO] 200 https://www.apple.com/
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawl-time: 0:0:0
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://wikipedia.org
Sun Feb 19 21:56:07 CET 2017 [INFO] 200 https://www.wikipedia.org/
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawl-time: 0:0:0
Sun Feb 19 21:56:07 CET 2017 [INFO] Crawling URL: https://swissmacuser.ch/
Sun Feb 19 21:56:08 CET 2017 [INFO] 200 https://swissmacuser.ch/
Sun Feb 19 21:56:08 CET 2017 [INFO] Crawl-time: 0:0:1
Sun Feb 19 21:56:08 CET 2017 [INFO] Crawling URL: https://twitter.com/swissmacuser
Sun Feb 19 21:56:09 CET 2017 [INFO] 200 https://twitter.com/swissmacuser
Sun Feb 19 21:56:09 CET 2017 [INFO] Crawl-time: 0:0:1
Sun Feb 19 21:56:09 CET 2017 [INFO] Done reading file: ./urls.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment