Created
April 6, 2022 18:00
-
-
Save tinabme/fb9e295fca901ab3b27f26eda150e792 to your computer and use it in GitHub Desktop.
Pulling Alexa top sites
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
PATH=/usr/local/bin:/usr/local/sbin:~/bin:/usr/bin:/bin:/usr/sbin:/sbin

# This is a quick bash script to grab the top 1 mil Alexa URLs and drop them in Redis.
# Alexa limits requests to 50 per second and 100 results per request. This can get
# expensive quickly, so we only pull what we need.
#
# Each run pops one "count,code" entry off the Redis set "countrypop" and pages
# through that country's sites 100 at a time, adding "rank,url" members to the
# Redis set "urlSet" and incrementing the Redis key "startpoint" by 100 for every
# page that was stored. Multiple sidekiq jobs can be set up to pull many chunks
# at once.
#
# Setup:
#   Set/create the initial startpoint - before the first collection begins:
#     redis-cli SET startpoint 1
#   Get the list of countries and the number of sites for each country:
#     curl -H "x-api-key: TOKEN" "https://ats.api.alexa.com/api?Action=TopSites&ResponseGroup=ListCountries&Output=json" | jq -r '.. | objects | select(has("Code")) | [.TotalSites, .Code] | @csv' | sed 's/"//g' | xargs redis-cli -x sadd alexaCountries
#   NOTE(review): "-x" makes redis-cli append its stdin as one extra member; under
#   xargs that is most likely the source of the blank country entries this script
#   has to skip - confirm, and drop "-x" from the command above if so.
#   Copy the countries so you can pop them off (using spop countrypop):
#     redis-cli sunionstore countrypop alexaCountries
#
# Environment (optional):
#   ALEXA_API_KEY      value sent in the x-api-key header (default: the TOKEN placeholder)
#   ALEXA_MAX_RETRIES  consecutive failed page fetches tolerated before giving up (default: 5)
#
# Exit status:
#   0  every page of the popped country was stored
#   1  startpoint could not be read, or a page kept failing
#   3  nothing was collected: startpoint reached the limit, or the popped
#      country entry was unusable

# pipefail: a failing curl or jq must fail the whole fetch pipeline.
set -uo pipefail

# Because of the random number of sites for each country, the startpoint check
# is limited to a little over 1 mil.
readonly STARTPOINT_LIMIT=1200000
# Alexa returns at most 100 sites per request.
readonly PAGE_SIZE=100
readonly API_KEY="${ALEXA_API_KEY:-TOKEN}"

# Succeeds when $1 is a non-empty string of decimal digits.
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Prints $1 in bold red on stdout and resets the terminal attributes afterwards.
# tput failures (e.g. no usable TERM) simply leave the message uncoloured.
notice() {
  local red bold reset
  red=$(tput setaf 1 2>/dev/null) || red=''
  bold=$(tput bold 2>/dev/null) || bold=''
  reset=$(tput sgr0 2>/dev/null) || reset=''
  printf '%s%s%s%s\n' "$red" "$bold" "$1" "$reset"
}

# Fetches one page of sites for a country and adds every "rank,url" row to urlSet.
# Arguments: $1 - country code, $2 - 1-based position of the first site to request
# Returns:   0 when the page was fetched and stored (an empty page counts as stored),
#            non-zero when curl, jq or redis-cli failed
fetch_page() {
  local country=$1 start=$2
  local url="https://ats.api.alexa.com/api?Action=Topsites&Count=${PAGE_SIZE}&ResponseGroup=Country&Output=json&CountryCode=${country}&Start=${start}"
  local rows line
  local -a members=()
  local member_count=0

  # --fail turns HTTP error responses into a non-zero exit status.
  rows=$(
    curl --fail --silent --show-error -H "x-api-key: ${API_KEY}" "$url" \
      | jq -r '.. | objects | select(has("DataUrl")) | [.Global.Rank, .DataUrl] | @csv'
  ) || return 1

  while IFS= read -r line; do
    [[ -n "$line" ]] || continue
    # Drop the CSV double quotes so members are stored as plain rank,url.
    members+=("${line//\"/}")
    member_count=$(( member_count + 1 ))
  done <<< "$rows"

  if (( member_count == 0 )); then
    return 0
  fi

  # Members are passed as arguments only (no "-x"), so no blank member is added.
  redis-cli sadd urlSet "${members[@]}"
}

main() {
  local startpoint entry country country_count max_retries
  local -a fields=()
  local offset=1 failures=0
  local code_pattern='^[A-Za-z0-9_-]+$'

  max_retries=${ALEXA_MAX_RETRIES:-5}
  is_uint "$max_retries" || max_retries=5

  startpoint=$(redis-cli get startpoint | jq -r .) || startpoint=''
  if ! is_uint "$startpoint"; then
    # An unreadable counter must not be reported as "we got them all".
    printf 'error: could not read a numeric startpoint from redis (got "%s")\n' "$startpoint" >&2
    exit 1
  fi

  if (( 10#$startpoint >= STARTPOINT_LIMIT )); then
    notice "we got them all"
    exit 3
  fi

  # So pop a country off and let's get this party started.
  entry=$(redis-cli spop countrypop) || entry=''
  # IFS is overridden for this read only, so later expansions are unaffected.
  IFS=',' read -r -a fields <<< "$entry"
  country_count=${fields[0]:-}
  country=${fields[1]:-}

  # Sadly the country data is sometimes blank, so require a numeric count and a
  # URL-safe country code before spending API requests on it.
  if ! is_uint "$country_count" || [[ ! "$country" =~ $code_pattern ]]; then
    notice "Bad country skipping"
    exit 3
  fi
  # Force base 10 so a count with leading zeros is not read as octal.
  country_count=$(( 10#$country_count ))

  while (( offset <= country_count )); do
    # Advance only when this page was really fetched and stored.
    if fetch_page "$country" "$offset"; then
      failures=0
      offset=$(( offset + PAGE_SIZE ))
      redis-cli INCRBY startpoint "$PAGE_SIZE"
    else
      failures=$(( failures + 1 ))
      if (( failures >= max_retries )); then
        printf 'error: giving up on country %s at start %s after %s failed attempts\n' \
          "$country" "$offset" "$failures" >&2
        exit 1
      fi
    fi
    sleep 5
  done
}

main

# Helpful Notes
# To see all in the list
# echo $(redis-cli smembers urlSet)
# Then to clean it all up you can drop it in a csv
# echo $(redis-cli smembers urlSet) >> ~/Desktop/alexatop1mil_20210325a.csv
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment