Skip to content

Instantly share code, notes, and snippets.

@nihilismus
Created May 29, 2020 02:32
Show Gist options
  • Save nihilismus/e9c2d0b4c1dffd77d432530cd0ba86c9 to your computer and use it in GitHub Desktop.
Save nihilismus/e9c2d0b4c1dffd77d432530cd0ba86c9 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# Mini Scraper 2
# https://pywombat.com/exercises/e770767b
# Author: Antonio Hernández Blas <hba.nihilismus<at>gmail.com>
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# Version 2, December 2004
#
# Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
#
# Everyone is permitted to copy and distribute verbatim or modified
# copies of this license document, and changing it is allowed as long
# as the name is changed.
#
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
#
# 0. You just DO WHAT THE FUCK YOU WANT TO.
# Tuning knobs. The "threads" are background shell jobs, one per page fetch.
# Both values may be overridden from the environment, e.g.
#   NUMBER_OF_THREADS_IN_A_SET_OF_THREADS=5 ./script
# and are expected to be non-negative integers.

# How many fetch jobs are launched before the script pauses.
readonly NUMBER_OF_THREADS_IN_A_SET_OF_THREADS="${NUMBER_OF_THREADS_IN_A_SET_OF_THREADS:-20}"
# How long (in seconds, passed to sleep) to pause between two sets of jobs.
readonly SECONDS_TO_WAIT_BEFORE_EXECUTING_A_NEW_SET_OF_THREADS="${SECONDS_TO_WAIT_BEFORE_EXECUTING_A_NEW_SET_OF_THREADS:-5}"
#######################################
# Extract the heading text from an HTML fragment.
# Arguments: the HTML fragment (all arguments are echoed, space-joined).
# Outputs:   every line that begins with <h1>, with the first <h1> and the
#            first </h1> on that line removed.
#######################################
get_name() {
  # One sed pass: select the heading lines, strip the tags, print the rest.
  echo "$@" | sed -n '/^<h1>/ {
    s_<h1>__
    s_</h1>__
    p
  }'
}
#######################################
# Extract the species text from an HTML fragment.
# Arguments: the HTML fragment (multiple arguments are joined into one string).
# Outputs:   one line holding the distinct contents of every line of the form
#            <td>...</td>, sorted, with "&mdash;" placeholders removed and all
#            words joined by single spaces. An empty line if nothing matches.
#######################################
get_species() {
  # printf instead of echo: the fragment is arbitrary data and must never be
  # interpreted as an echo option.
  printf '%s\n' "$*" \
    | grep -E '^<td>.*</td>' \
    | sed -e 's_<td>__' -e 's_</td>__' \
    | sort -u \
    | sed 's/&mdash;//g' \
    | awk '
        # Join all words onto one line. xargs used to do this, but xargs
        # interprets quotes and backslashes, so a cell containing an
        # apostrophe made it abort and the species came out empty.
        { for (i = 1; i <= NF; i++) { joined = joined sep $i; sep = " " } }
        END { print joined }
      '
}
#######################################
# Extract the type names from an HTML fragment.
# Arguments: the HTML fragment (all arguments are echoed, space-joined).
# Outputs:   one line with the distinct link texts of the "type-icon"
#            anchors, sorted and separated by single spaces.
#######################################
get_types() {
  # Stage 1: on lines mentioning type-icon, put a line break in place of
  # every closing </a> so that each anchor ends up on its own line.
  local one_anchor_per_line='/type-icon/ {
    s_</a>_\n_g
    p
  }'
  # Stage 2: keep the pieces that still mention type-icon and drop everything
  # up to and including the last "> on them, leaving only the link text.
  local keep_link_text='/type-icon/ {
    s/^.*">//
    p
  }'

  echo "$@" \
    | sed -n "$one_anchor_per_line" \
    | sed -n "$keep_link_text" \
    | sort | uniq | xargs
}
#######################################
# Fetch one page and print a one-line summary of it.
# Arguments: $1 - URL of the page to fetch.
# Outputs:   "name: <name> - species: <species> - types: <types>" on stdout;
#            a warning on stderr when nothing usable could be fetched.
# Returns:   0 when a summary was printed, 1 when the fetch produced no data.
#######################################
get_and_print_info() {
  local url="$1"
  # Declared separately and kept local so the value never leaks into, or is
  # clobbered by, the caller's scope.
  local info

  # Keep only the heading, Type and Species lines plus the two lines that
  # follow each of them, then drop any line mentioning "Generation".
  info="$(
    lynx -source "$url" \
      | grep -E -A 2 '^<h1>|^<th>Type|^<th>Species' \
      | grep -v Generation
  )"

  # A failed download or an unexpected page layout leaves nothing to parse;
  # report it instead of printing a record with every field blank.
  if [[ -z "$info" ]]; then
    printf 'warning: no data retrieved from %s\n' "$url" >&2
    return 1
  fi

  printf 'name: %s - species: %s - types: %s\n' \
    "$(get_name "$info")" \
    "$(get_species "$info")" \
    "$(get_types "$info")"
}
#######################################
# List the individual pages linked from the full Pokédex listing.
# Outputs: every URL under https://pokemondb.net/pokedex/ found in the lynx
#          dump of the listing, except the "pokedex/all" and "pokedex/game"
#          ones, one per line, sorted and de-duplicated.
#######################################
list_pokemon_urls() {
  lynx -dump https://pokemondb.net/pokedex/all \
    | grep 'https://pokemondb.net/pokedex/' \
    | grep -vE 'pokedex/all|pokedex/game' \
    | sed 's/^.*https/https/' \
    | sort -u
}

#######################################
# Run get_and_print_info on every URL, as background jobs launched in sets.
# Globals:   NUMBER_OF_THREADS_IN_A_SET_OF_THREADS (read, default 20)
#            SECONDS_TO_WAIT_BEFORE_EXECUTING_A_NEW_SET_OF_THREADS (read,
#            default 5)
# Arguments: the URLs to process.
# Outputs:   whatever the jobs print.
# Returns:   after every launched job has finished.
#######################################
scrape_in_batches() {
  local set_size="${NUMBER_OF_THREADS_IN_A_SET_OF_THREADS:-20}"
  local pause="${SECONDS_TO_WAIT_BEFORE_EXECUTING_A_NEW_SET_OF_THREADS:-5}"
  local launched=0
  local url

  for url in "$@"; do
    # Pause only when a set is full AND there is still a URL to process,
    # then carry on with that same URL. The pause used to consume the
    # current URL, silently dropping one page after every full set.
    if (( launched >= set_size )); then
      sleep "$pause"
      launched=0
    fi
    get_and_print_info "$url" &
    launched=$(( launched + 1 ))
  done

  # Do not return (and let the script exit) while jobs are still printing.
  wait
}

#######################################
# Entry point: collect the page URLs, then scrape them in sets.
# Returns: 1 when no URL could be collected, 0 otherwise.
#######################################
main() {
  local -a urls=()
  local url

  while IFS= read -r url; do
    urls+=("$url")
  done < <(list_pokemon_urls)

  if (( ${#urls[@]} == 0 )); then
    printf 'error: no page URLs found at https://pokemondb.net/pokedex/all\n' >&2
    return 1
  fi

  scrape_in_batches "${urls[@]}"
}

main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment