@enrico-atzeni
Forked from antoineMoPa/crawl.sh
Last active March 11, 2023 14:36
Bash Web Crawler
#!/bin/bash
# bash web crawler
# usage: bash crawl.sh <start-url> <grace-seconds>, e.g.
# $ bash crawl.sh http://example.com 1
# NOTE: this version ignores static files, see the last pipe of the visit function
# TODO:
# 1. avoid loops by calling the same urls only once per entire script (see the visit_once sketch after the visit function below)
# 2. do not use txt lists
# 3. use a variable to set the depth limit (currently hardcoded to 3)
# 4. more debug
# 5. autorestart
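# remove any list files left over from a previous run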
rm -f urls.txt
rm -f sub-urls.txt
rm -f sub-2-urls.txt
site=$1        # start URL, e.g. http://example.com
gracetime=$2   # seconds to sleep between requests
verbose=1
totalcalled=0  # pages fetched so far (for the average-time report)
totaltime=0    # cumulative fetch time in nanoseconds
function visit(){
    if [ "$verbose" -eq "1" ]; then
        echo "visiting $1"
    fi
    start=$(date +%s%N)
    # fetch the page, keep hrefs pointing to a local path or to $site,
    # extract the absolute http(s) urls, drop common static assets,
    # then append the result to the list file given as $2
    wget -q -O- "$1" | egrep 'href=["'"'"'](\/[^\/]|'"$site"')' | egrep -o '(https?:\/\/[^"'"'"']+)' | egrep -vi '\.(jpe?g|png|gif|ico|svg|xml|js|css|mp[34]|ogg|map|webp)' >> "$2"
    end=$(date +%s%N)
    runtime=$((end-start))
    totalcalled=$((totalcalled + 1))
    totaltime=$((totaltime + runtime))
}
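# Possible approach to TODO #1 (call each url only once per run): remember
# visited urls in an associative array and skip repeats. This is only a
# sketch (requires bash >= 4) and is not wired into the steps below; the
# loops could call visit_once instead of visit to use it.
declare -A visited
function visit_once(){
    if [ -z "${visited[$1]}" ]; then
        visited["$1"]=1
        visit "$1" "$2"
    fi
}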
# **********************************************************************************
# STEP 1 - get all urls from the given link
visit "$site" urls.txt
# replace relative urls in the file with absolute ones
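# e.g. "/about" becomes "http://example.com/about"; with the current extraction (absolute urls only) this is mostly a safety net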
sed -i 's~^/~'"$site"'/~g' urls.txt
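# sort and de-duplicate in place (-o makes it safe to write back to the input file)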
sort -uo urls.txt urls.txt
sleep "$gracetime"
# **********************************************************************************
# STEP 2 - scan all urls and add them to the sub-urls list
echo "Scanning "$(grep -c '' urls.txt)" urls"
while read -r line
do
    visit "$line" sub-urls.txt
    sleep "$gracetime"
done < urls.txt
# runtime is in nanoseconds (%s%N), so divide by 10^6 to get milliseconds
echo "Average time: "$(( (totaltime/totalcalled) / 1000000))" ms"
# replace relative urls in the file with absolute ones
sed -i 's~^/~'"$site"'/~g' sub-urls.txt
# unique lines
sort -uo sub-urls.txt sub-urls.txt
# **********************************************************************************
# STEP 3 - scan all sub-urls and add results to a third list, which this script does not crawl further (to avoid loops)
echo "Scanning "$(grep -c '' sub-urls.txt)" urls"
while read -r line
do
    visit "$line" sub-2-urls.txt
    sleep "$gracetime"
done < sub-urls.txt
# runtime is in nanoseconds (%s%N), so divide by 10^6 to get milliseconds
echo "Average time: "$(( (totaltime/totalcalled) / 1000000))" ms"
# replace relative urls in the file with absolute ones
sed -i 's~^/~'"$site"'/~g' sub-2-urls.txt
# unique lines
sort -uo sub-2-urls.txt sub-2-urls.txt
# **********************************************************************************
echo "Done."