Skip to content

Instantly share code, notes, and snippets.

@allen-munsch
Created October 13, 2018 16:14
Show Gist options
  • Save allen-munsch/d808c23e34cffcfed450e0ff8e325f07 to your computer and use it in GitHub Desktop.
Save allen-munsch/d808c23e34cffcfed450e0ff8e325f07 to your computer and use it in GitHub Desktop.
A simple Common Crawl curl/Bash example for WAT and WARC files
#!/bin/bash
#######################################
# Query each listed Common Crawl URL index for captures matching
# *.amazon.com (first page, up to 10 results, HTTP status 200 or 301) and
# print every returned JSON line as its key/value items.
# Globals:   none
# Arguments: none
# Outputs:   one printed items() view per non-blank JSON line, to stdout;
#            curl errors go to stderr
# Returns:   status of the last curl | python pipeline
#######################################
function search_indexes() {
  local -a indexes=(
    /CC-MAIN-2018-39-index /CC-MAIN-2018-34-index /CC-MAIN-2018-30-index
    /CC-MAIN-2018-26-index /CC-MAIN-2018-22-index /CC-MAIN-2018-17-index
    /CC-MAIN-2018-13-index /CC-MAIN-2018-09-index /CC-MAIN-2018-05-index
    /CC-MAIN-2017-51-index /CC-MAIN-2017-47-index /CC-MAIN-2017-43-index
    /CC-MAIN-2017-39-index /CC-MAIN-2017-34-index /CC-MAIN-2017-30-index
    /CC-MAIN-2017-26-index /CC-MAIN-2017-22-index /CC-MAIN-2017-17-index
    /CC-MAIN-2017-13-index /CC-MAIN-2017-09-index /CC-MAIN-2017-04-index
    /CC-MAIN-2016-50-index /CC-MAIN-2016-44-index /CC-MAIN-2016-40-index
    /CC-MAIN-2016-36-index /CC-MAIN-2016-30-index /CC-MAIN-2016-26-index
    /CC-MAIN-2016-22-index /CC-MAIN-2016-18-index /CC-MAIN-2016-07-index
    /CC-MAIN-2015-48-index /CC-MAIN-2015-40-index /CC-MAIN-2015-35-index
    /CC-MAIN-2015-32-index /CC-MAIN-2015-27-index /CC-MAIN-2015-22-index
    /CC-MAIN-2015-18-index /CC-MAIN-2015-14-index /CC-MAIN-2015-11-index
    /CC-MAIN-2015-06-index
    /CC-MAIN-2014-52-index /CC-MAIN-2014-49-index /CC-MAIN-2014-42-index
    /CC-MAIN-2014-41-index /CC-MAIN-2014-35-index /CC-MAIN-2014-23-index
    /CC-MAIN-2014-15-index /CC-MAIN-2014-10-index
    /CC-MAIN-2013-48-index /CC-MAIN-2013-20-index
  )
  local -r query='url=*.amazon.com&output=json&page=1&limit=10&status=200,301'

  # The decoder is handed to "python -c", so its top-level statements must
  # start at column 0. It streams stdin line by line and skips blank lines,
  # which json.loads would otherwise reject.
  local -r decoder='import sys, json
for line in sys.stdin:
    if line.strip():
        print(json.loads(line).items())
'

  # Keep using "python" when it exists; fall back to "python3" on systems
  # that no longer ship an unversioned interpreter.
  local py=python
  command -v python >/dev/null 2>&1 || py=python3

  local index url
  for index in "${indexes[@]}"; do
    url="http://index.commoncrawl.org${index}?${query}"
    # The URL contains '*' and '?', so it must be quoted to reach curl
    # verbatim instead of being glob-expanded or word-split by the shell.
    curl -sS "$url" | "$py" -c "$decoder"
  done
}
#######################################
# Print the URL of the first WAT file listed for a Common Crawl crawl, then
# fetch a leading byte range of that file and print each decompressed line
# prefixed with the running total of decompressed bytes (newline included).
# Globals:   none
# Arguments: $1 - crawl ID (optional, default CC-MAIN-2018-39)
#            $2 - byte range handed to "curl -r" (optional, default 0-2048)
# Outputs:   the WAT URL, then one "<cumulative bytes> <line>" per line, to
#            stdout; diagnostics go to stderr
# Returns:   1 if no WAT path could be read from the listing, 0 otherwise
#######################################
function byterange_warc() {
  local -r crawl=${1:-CC-MAIN-2018-39}
  local -r range=${2:-0-2048}
  local -r prefix=https://commoncrawl.s3.amazonaws.com/
  local first_path url line
  local -i bytes=0
  # In the C locale ${#line} is a byte count rather than a character count.
  local LC_ALL=C

  # Only the first listed path is needed, so read it straight from the
  # pipeline rather than via a predictable file in /tmp. sed consumes the
  # whole stream, so curl never sees a closed pipe.
  first_path=$(curl -sS "${prefix}crawl-data/${crawl}/wat.paths.gz" | zcat | sed -n '1p')
  if [[ -z "$first_path" ]]; then
    printf 'byterange_warc: no WAT paths found for %s\n' "$crawl" >&2
    return 1
  fi

  url=${prefix}${first_path}
  printf '%s\n' "$url"

  # IFS= and -r keep each line byte-exact. The +1 accounts for the newline
  # that read strips. A trailing fragment without a newline (the range
  # usually cuts the gzip stream mid-record) is not reported.
  while IFS= read -r line; do
    bytes=$((bytes + ${#line} + 1))
    printf '%s %s\n' "$bytes" "$line"
  done < <(curl -sS -r "$range" "$url" | zcat)
}
# Script entry point: only the index search runs here; byterange_warc is
# defined in this script but is not invoked at this point.
search_indexes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment