-
-
Save enrico-atzeni/874856ef2c2ae314082b513205265348 to your computer and use it in GitHub Desktop.
Bash Web Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# bash web crawler
# Usage: bash crawl.sh http://example.com 1
#   $1 - base site URL to crawl (required)
#   $2 - grace time in seconds to sleep between requests (default: 0)
# NOTE: this version ignores static files, see the last pipe of the visit function
# TODO:
# 1. avoid loops by calling same urls only once per entire script
# 2. do not use txt lists
# 3. use a variable to set the depth limit (currently hardcoded to 3)
# 4. more debug
# 5. autorestart

# Start from a clean slate: remove result lists left by any previous run.
rm -f urls.txt sub-urls.txt sub-2-urls.txt

site=${1:-}          # base URL (first script argument)
gracetime=${2:-0}    # politeness delay between requests; default 0 so later `sleep` never errors
verbose=1            # 1 = print each URL as it is visited
totalcalled=0        # number of pages fetched (for the average-time report)
totaltime=0          # cumulative fetch time in nanoseconds
# Fetch one page and append the on-site links it contains to a list file.
#   $1 - URL to fetch
#   $2 - file the discovered URLs are appended to
# Reads globals: site, verbose. Updates globals: totalcalled, totaltime.
# A failed wget simply contributes no URLs (best-effort by design).
function visit(){
    if [ "$verbose" -eq "1" ]; then
        echo "visiting $1"
    fi
    local start end runtime
    start=$(date +%s%N)
    # Keep hrefs that are relative ("/...") or point at $site, extract the
    # absolute URLs, then drop links to static assets (images, css, js, ...).
    wget -q -O- "$1" \
        | egrep 'href=["'"'"'](\/[^\/]|'"$site"')' \
        | egrep -o '(https?:\/\/[^"'"'"']+)' \
        | egrep -vi '\.(jpe?g|png|gif|ico|svg|xml|js|css|mp[34]|ogg|map|webp)' >> "$2"
    end=$(date +%s%N)
    runtime=$((end - start))
    totalcalled=$((totalcalled + 1))
    totaltime=$((totaltime + runtime))
}
# **********************************************************************************
# STEP 1 - get all urls from GIVEN link
visit "$site" urls.txt
# Rewrite relative urls (leading "/") into absolute ones rooted at the site.
sed -i 's~^/~'"$site"'/~g' urls.txt
# De-duplicate the list in place.
sort -uo urls.txt urls.txt
sleep "$gracetime"
# **********************************************************************************
# STEP 2 - scan all urls and add them to the sub-urls list
echo "Scanning $(grep -c '' urls.txt) urls"
# IFS= and -r preserve whitespace and backslashes in URLs read from the list.
while IFS= read -r line
do
    visit "$line" sub-urls.txt
    sleep "$gracetime"
done < urls.txt
# %N is nanoseconds, we have to divide by 10^6; guard against an empty
# url list leaving totalcalled at 0 (division by zero).
if [ "$totalcalled" -gt 0 ]; then
    echo "Average time: $(( (totaltime / totalcalled) / 1000000 )) ms"
fi
# replace relative url in file with absolute one
sed -i 's~^/~'"$site"'/~g' sub-urls.txt
# unique lines
sort -uo sub-urls.txt sub-urls.txt
# **********************************************************************************
# STEP 3 - scan all sub urls and add them to a third list, but we do not use it
# in this script to avoid loops
echo "Scanning $(grep -c '' sub-urls.txt) urls"
# IFS= and -r preserve whitespace and backslashes in URLs read from the list.
while IFS= read -r line
do
    visit "$line" sub-2-urls.txt
    sleep "$gracetime"
done < sub-urls.txt
# %N is nanoseconds, we have to divide by 10^6; guard against division by
# zero when nothing was fetched.
if [ "$totalcalled" -gt 0 ]; then
    echo "Average time: $(( (totaltime / totalcalled) / 1000000 )) ms"
fi
# replace relative url in file with absolute one
sed -i 's~^/~'"$site"'/~g' sub-2-urls.txt
# unique lines
sort -uo sub-2-urls.txt sub-2-urls.txt
# **********************************************************************************
echo "Done."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment