Skip to content

Instantly share code, notes, and snippets.

@noqqe
Created June 12, 2012 19:11
Show Gist options
  • Save noqqe/2919518 to your computer and use it in GitHub Desktop.
Save noqqe/2919518 to your computer and use it in GitHub Desktop.
0ocrawl.bash: Easy webcrawling with redis
#!/bin/bash
# 0ocrawl.bash: Easy webcrawling with redis
# Copyright: (C) 2012 Florian Baumann
# License: GPL-3 <http://www.gnu.org/licenses/gpl-3.0.txt>
# Date: Friday 2012-06-08
## Configuration
# Both settings can be overridden from the environment, e.g.
#   REDISOPTS="--raw -h redis.example.org" ./0ocrawl.bash --score
# REDIS     - redis-cli executable to run.
# REDISOPTS - options placed in front of every redis command. The functions
#             in this script expand it unquoted, so it is split on
#             whitespace; they also parse plain line-oriented output, so
#             keep --raw in here.
REDIS="${REDIS:-/usr/bin/redis-cli}"
REDISOPTS="${REDISOPTS:---raw -h localhost}"
## The heart
# d.crawl URL
# Fetch URL with curl, pull link targets out of the HTML with sed and record
# each one in redis:
#   ZINCRBY <host> 1 <path>      score of each path, keyed by host
#   ZINCRBY d.scores 1 <host>    score of each host
#   SADD to.crawl <host><path>   set of pages still to crawl
# sed emits "<group 1> <group 2>" (intended: host, path) and, because the
# pattern consumes the whole line, at most one link per line of HTML.
# Globals:   REDIS, REDISOPTS (read)
# Arguments: $1 - URL to fetch
# Outputs:   one "INFO: Found <host> PARAM <path>" line per link on stdout
# Returns:   always 0; redis errors are deliberately discarded (best effort)
function d.crawl () {
  local line host link_path

  # Alternative pattern left disabled by the author:
  #   's#.*<a href="http://([^/]*)([^"]*)">.*#\1 \2#p'
  # NOTE(review): how the alternation in the first group of the active
  # pattern divides host from path depends on the regex engine's matching
  # rules -- verify against real pages.

  # The link list is read on fd 3 so that commands inside the loop keep the
  # caller's stdin, and line by line so that the global IFS is never touched
  # and link text is never glob-expanded.
  while IFS= read -r line <&3; do
    # First word -> host, second word -> path; further words are ignored.
    IFS=$' \t\n' read -r host link_path _ <<<"$line"
    # Nothing usable was captured: skip instead of sending redis a
    # command with missing arguments.
    [ -n "$host" ] || continue
    link_path=${link_path:-/}

    printf '%s\n' "INFO: Found ${host} PARAM ${link_path}"
    # $REDISOPTS is intentionally unquoted: it holds several options.
    "$REDIS" $REDISOPTS ZINCRBY "$host" 1 "$link_path" &>/dev/null
    "$REDIS" $REDISOPTS ZINCRBY d.scores 1 "$host" &>/dev/null
    "$REDIS" $REDISOPTS SADD to.crawl "${host}${link_path}" &>/dev/null
  done 3< <(
    curl -L -m 30 --silent "$1" \
      | sed -rn 's#.*<a href="http://([^/]*|[^"]*|[^\ ]*)([^"]*)".*>.*#\1 \2#p'
  )

  return 0
}
## The scoring
# d.score
# Print every member of the sorted set d.scores together with its score,
# one "<score> <member>:" line per member, in the order ZRANGE returns them.
# Globals:   REDIS, REDISOPTS (read)
# Outputs:   the score list on stdout
# Returns:   non-zero if the member list cannot be fetched
function d.score () {
  local members member

  # Declared separately from the assignment so a redis failure is not
  # hidden behind the exit status of 'local'.
  members=$("$REDIS" $REDISOPTS ZRANGE d.scores 0 -1) || return

  # Members are read one per line on fd 3 (leaving stdin alone for the
  # commands in the loop) and always quoted, so a member such as "*" is
  # looked up literally instead of being expanded to file names.
  while IFS= read -r member <&3; do
    [ -n "$member" ] || continue
    # Double quotes are stripped for display only; the lookup uses the
    # member exactly as redis returned it.
    printf '%s: ' "${member//\"}"
    "$REDIS" $REDISOPTS ZSCORE d.scores "$member"
  done 3<<<"$members" | awk '{print $2" "$1}'   # "<member>: <score>" -> "<score> <member>:"
}
## The queue
# d.queue list|get
# Inspect or consume the redis set to.crawl:
#   list - print every member, then "Queue entries: <count>"
#   get  - remove one member with SPOP and print it
# Globals:   REDIS, REDISOPTS (read)
# Arguments: $1 - subcommand, "list" or "get"
# Returns:   2 (with a usage line on stderr) for any other subcommand
function d.queue () {
  # Pin IFS for this call so the unquoted $REDISOPTS below is always split
  # into separate options, even if a caller left IFS modified.
  local IFS=$' \t\n'

  case "$1" in
    list)
      "$REDIS" $REDISOPTS SMEMBERS to.crawl
      printf 'Queue entries: '
      "$REDIS" $REDISOPTS SMEMBERS to.crawl | wc -l
      ;;
    get)
      # Uses the configured options like every other redis call, so a
      # changed host or port applies here as well.
      "$REDIS" $REDISOPTS SPOP to.crawl
      ;;
    *)
      printf 'usage: d.queue list|get\n' >&2
      return 2
      ;;
  esac
}
## The warmup
# d.warmup.cache
# Run d.crawl once for each entry of a fixed list of start pages, in order,
# to fill the crawl queue.
function d.warmup.cache () {
  local seed
  local -a seeds=(
    http://spiegel.de
    http://fefe.de
    http://heise.de
    http://zeit.de
    http://planet.ubuntuusers.de
    http://shutdown-system.de
    http://noqqe.de
  )

  for seed in "${seeds[@]}"; do
    d.crawl "$seed"
  done
}
## The runtime menu
# main [--warmup | --queue | --queue-list | --score | --auto | URL]
#   --warmup      crawl the built-in start pages (d.warmup.cache)
#   --queue       call d.queue without a subcommand (see note below)
#   --queue-list  list the crawl queue (d.queue list)
#   --score       print the scores (d.score)
#   --auto        loop forever: take an entry from the queue and crawl it
#   -h, --help    print the usage line
#   URL           crawl this single URL (d.crawl)
# Returns: 2 (with the usage line on stderr) when called without an
#          argument, otherwise the status of the selected action.
function main () {
  local next
  local usage="usage: ${0##*/} --warmup|--queue|--queue-list|--score|--auto|URL"

  case "${1-}" in
    --warmup)
      d.warmup.cache
      ;;
    --queue)
      # NOTE(review): d.queue only implements the "list" and "get"
      # subcommands and this call passes neither -- confirm which one was
      # intended.
      d.queue
      ;;
    --queue-list)
      d.queue list
      ;;
    --score)
      d.score
      ;;
    --auto)
      while true; do
        next=$(d.queue get)
        if [ -n "$next" ]; then
          printf '%s\n' "INFO: Crawling $next"
          # Queue entries are stored without a scheme, so add it back.
          d.crawl "http://$next"
        else
          # Queue is empty: back off instead of polling redis in a
          # tight loop.
          sleep 1
        fi
      done
      ;;
    -h|--help)
      printf '%s\n' "$usage"
      ;;
    '')
      # No argument: there is nothing to crawl.
      printf '%s\n' "$usage" >&2
      return 2
      ;;
    *)
      d.crawl "$1"
      ;;
  esac
}

main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment