@enrico-atzeni
Forked from antoineMoPa/crawl.sh
Last active March 11, 2023 14:36
Bash Web Crawler
#!/bin/bash
# bash web crawler
# usage: bash crawl.sh <start-url> <grace-seconds>, e.g.
# $ bash crawl.sh http://example.com 1
# NOTE: this version ignores static files, see the last pipe of the visit function
# TODO:
# 1. avoid loops by calling the same urls only once per entire script (see the visit_once sketch after the visit function below)
# 2. do not use txt lists
# 3. use a variable to set the depth limit (currently hardcoded to 3)
# 4. more debug
# 5. autorestart
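# remove any list files left over from a previous run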
rm -f urls.txt
rm -f sub-urls.txt
rm -f sub-2-urls.txt
site=$1        # start URL, e.g. http://example.com
gracetime=$2   # seconds to sleep between requests
verbose=1
totalcalled=0  # pages fetched so far (for the average-time report)
totaltime=0    # cumulative fetch time in nanoseconds
function visit(){
    if [ "$verbose" -eq "1" ]; then
        echo "visiting $1"
    fi
    start=$(date +%s%N)
    # fetch the page, keep hrefs pointing to a local path or to $site,
    # extract the absolute http(s) urls, drop common static assets,
    # then append the result to the list file given as $2
    wget -q -O- "$1" | egrep 'href=["'"'"'](\/[^\/]|'"$site"')' | egrep -o '(https?:\/\/[^"'"'"']+)' | egrep -vi '\.(jpe?g|png|gif|ico|svg|xml|js|css|mp[34]|ogg|map|webp)' >> "$2"
    end=$(date +%s%N)
    runtime=$((end-start))
    totalcalled=$((totalcalled + 1))
    totaltime=$((totaltime + runtime))
}
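# Possible approach to TODO #1 (call each url only once per run): remember
# visited urls in an associative array and skip repeats. This is only a
# sketch (requires bash >= 4) and is not wired into the steps below; the
# loops could call visit_once instead of visit to use it.
declare -A visited
function visit_once(){
    if [ -z "${visited[$1]}" ]; then
        visited["$1"]=1
        visit "$1" "$2"
    fi
}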
# **********************************************************************************
# STEP 1 - get all urls from the given link
visit "$site" urls.txt
# replace relative urls in the file with absolute ones
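# e.g. "/about" becomes "http://example.com/about"; with the current extraction (absolute urls only) this is mostly a safety net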
sed -i 's~^/~'"$site"'/~g' urls.txt
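# sort and de-duplicate in place (-o makes it safe to write back to the input file)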
sort -uo urls.txt urls.txt
sleep "$gracetime"
# **********************************************************************************
# STEP 2 - scan all urls and add them to the sub-urls list
echo "Scanning "$(grep -c '' urls.txt)" urls"
while read -r line
do
    visit "$line" sub-urls.txt
    sleep "$gracetime"
done < urls.txt
# runtime is in nanoseconds (%s%N), so divide by 10^6 to get milliseconds
echo "Average time: "$(( (totaltime/totalcalled) / 1000000))" ms"
# replace relative urls in the file with absolute ones
sed -i 's~^/~'"$site"'/~g' sub-urls.txt
# unique lines
sort -uo sub-urls.txt sub-urls.txt
# **********************************************************************************
# STEP 3 - scan all sub-urls and add results to a third list, which this script does not crawl further (to avoid loops)
echo "Scanning "$(grep -c '' sub-urls.txt)" urls"
while read -r line
do
    visit "$line" sub-2-urls.txt
    sleep "$gracetime"
done < sub-urls.txt
# runtime is in nanoseconds (%s%N), so divide by 10^6 to get milliseconds
echo "Average time: "$(( (totaltime/totalcalled) / 1000000))" ms"
# replace relative urls in the file with absolute ones
sed -i 's~^/~'"$site"'/~g' sub-2-urls.txt
# unique lines
sort -uo sub-2-urls.txt sub-2-urls.txt
# **********************************************************************************
echo "Done."