Skip to content

Instantly share code, notes, and snippets.

@Pikrass
Created November 3, 2019 21:53
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Pikrass/f8462ff8a9af18f97f08d2a90533af31 to your computer and use it in GitHub Desktop.
Save Pikrass/f8462ff8a9af18f97f08d2a90533af31 to your computer and use it in GitHub Desktop.
Improved generated script for google-group-crawler
#!/usr/bin/env bash
#
# Configuration and shared state for the parallel download script.
#
#   UA / _USER_AGENT  User-Agent string handed to curl (-A) by __batch__.
#   COOKIES           Cookie file handed to curl (-b) by __batch__.
#   _URLS_PER_PROC    Number of URLs queued before a batch is dispatched.
#   _MAX_PROCS        Number of worker slots / status rows.
#   _METER_WIDTH      Width, in characters, of each progress meter.
#   queue, req        Pending URL count and the pending curl arguments.
UA="Mozilla/5.0 (X11; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0"
# __batch__ reads $_USER_AGENT; default it to UA so curl never gets an empty
# agent string, while still honouring a value supplied by the environment.
: "${_USER_AGENT:=$UA}"
COOKIES="cookies-google-com.txt"
_URLS_PER_PROC=20
_MAX_PROCS=6
_METER_WIDTH=50
queue=0
req=
# Private temp directory holding the FIFO that workers use to report progress.
tmpdir="$(mktemp -d --tmpdir fetch-googlegroups-XXX)" || exit 1
fifo="$tmpdir/fifo"
# The exit handler is not installed yet, so clean up by hand on failure.
mkfifo "$fifo" || { rmdir -- "$tmpdir"; exit 1; }
# __atexit__ — exit handler: remove the progress FIFO and its temp directory.
# Globals: fifo (read), tmpdir (read)
__atexit__()
{
  # Quoted and terminated with "--" so paths containing whitespace or a
  # leading dash are removed correctly.
  rm -f -- "$fifo"
  # rmdir (not rm -r) on purpose: only an empty directory is ever removed.
  # The existence check keeps a second invocation silent.
  if [[ -d "$tmpdir" ]]; then
    rmdir -- "$tmpdir"
  fi
}
trap __atexit__ EXIT
# Progress display. Runs as a background subshell whose stdin is the FIFO.
# Each input line has the form "<slot> <job> <progress>". The subshell keeps
# one status row per worker slot plus a "Completed: N" line underneath, and
# redraws the affected row in place using ANSI cursor-movement sequences.
(
  # Initial screen: one "idle" row per slot, then the running total.
  for slot in $(seq -w 1 "$_MAX_PROCS"); do
    printf '%s: idle\n' "$slot"
  done
  printf 'Completed: 0'

  # 1-based column where the status text starts: the slot number's width,
  # plus the ": " separator, plus one.
  col=$(( ${#_MAX_PROCS} + 3 ))
  total=0

  while read -r proc job progress; do
    # Every report with progress > 0 counts as one more finished URL.
    if [ "$progress" -gt 0 ]; then
      total=$(( total + 1 ))
    fi

    if [ "$progress" -eq "$_URLS_PER_PROC" ]; then
      # A report equal to the batch size marks the slot as free again.
      msg="idle"
    else
      printf -v msg '#%05d %02d/%02d [' "$job" "$progress" "$_URLS_PER_PROC"
      meter=$(( _METER_WIDTH * progress / _URLS_PER_PROC ))
      if [ "$meter" -gt 0 ]; then
        # (meter - 1) '=' characters followed by the '>' head.
        for (( i = 1; i < meter; i++ )); do
          msg+='='
        done
        msg+='>'
      fi
      # Pad the rest of the meter with blanks.
      for (( i = meter; i < _METER_WIDTH; i++ )); do
        msg+=' '
      done
      msg+=']'
    fi

    # The cursor rests on the "Completed" line: go up to the slot's row,
    # jump to the status column, rewrite it, clear to end of line, come back
    # down and refresh the total.
    rows_up=$(( _MAX_PROCS - proc + 1 ))
    printf '\r\e[%sA\e[%sG%s\e[K\e[%sB\rCompleted: %s' \
      "$rows_up" "$col" "$msg" "$rows_up" "$total"
  done
) < "$fifo" &
# Dispatch state: the next never-used worker slot, and the id given to the
# next batch.
starting_proc=1
job_id=1

# __flush__ — hand the queued curl arguments to a background __batch__ job.
# Globals:  queue, req, starting_proc, job_id (read and written);
#           _MAX_PROCS, fifo (read)
# Returns:  0
#
# While unused slots remain, the batch gets the next slot number. After that
# it waits for any background job to end and reuses that job's exit status
# as the slot number (__batch__ returns its slot number as its status).
__flush__ ()
{
  local slot

  # Nothing queued: nothing to dispatch.
  if [ "$queue" -eq 0 ]; then
    return 0
  fi

  if [ "$starting_proc" -le "$_MAX_PROCS" ]; then
    __batch__ "$req" "$starting_proc" "$job_id" >> "$fifo" &
    starting_proc=$(( starting_proc + 1 ))
  else
    wait -n
    slot=$?
    __batch__ "$req" "$slot" "$job_id" >> "$fifo" &
  fi

  job_id=$(( job_id + 1 ))
  queue=0
  req=
}
# __batch__ — download one batch of URLs with a single curl invocation and
# report its progress on stdout.
# Arguments: $1 - whitespace-separated curl arguments ("-o FILE URL" groups)
#            $2 - worker slot number
#            $3 - job id
# Globals:   _USER_AGENT (read; falls back to UA), COOKIES, _URLS_PER_PROC
# Outputs:   one "<slot> <job> <n>" line per curl output line containing
#            "Xferd" (presumably curl's progress-meter header, printed once
#            per transfer), with n counting from 0, followed by a final
#            "<slot> <job> <_URLS_PER_PROC>" line.
# Returns:   the slot number, so the caller can recover it via `wait -n`.
__batch__ ()
{
  local slot=$2 job=$3
  local -a transfer_args

  # Split $1 on whitespace without glob expansion, so URLs containing
  # '?', '*' or '[' reach curl untouched. read returns non-zero at EOF here;
  # the array is still filled.
  IFS=$' \t\n' read -r -d '' -a transfer_args <<< "$1"

  curl -A "${_USER_AGENT:-${UA:-}}" -b "$COOKIES" "${transfer_args[@]}" 2>&1 \
    | grep --line-buffered Xferd \
    | {
        awk -v slot="$slot" -v job="$job" \
          '{ print slot, job, FNR - 1; fflush() }'
        echo "$slot $job $_URLS_PER_PROC"
      }

  return "$slot"
}
# __wget__ — queue one download unless its output file already exists.
# Arguments: $1 - output file path
#            $2 - URL to fetch
# Globals:   queue, req (read and written); _URLS_PER_PROC (read)
# Calls __flush__ once the queue holds _URLS_PER_PROC entries.
__wget__ ()
{
  # Output file already present: nothing to queue.
  if [[ -f "$1" ]]; then
    return 0
  fi

  queue=$(( queue + 1 ))
  req+=" -o $1 $2"

  # A full batch is handed off immediately.
  if (( queue >= _URLS_PER_PROC )); then
    __flush__
  fi
}
# Main sequence: queue every download listed in ./list (expected to consist
# of __wget__ calls), dispatch the final partial batch, then wait for all
# background jobs.
#
# A write end of the FIFO is held open while batches are being dispatched:
#  - the progress reader blocks opening the FIFO until some writer opens it;
#    with nothing to download no batch would ever do so, and the final
#    `wait` would hang forever;
#  - if every running batch finished before the next one started, the reader
#    would see EOF and exit, leaving later batches blocked on the FIFO.
# The descriptor is closed before the final wait, so the reader sees EOF
# once the last batch has finished.
exec {keepalive_fd}>"$fifo"
source ./list
__flush__
exec {keepalive_fd}>&-
wait
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment