Last active
April 18, 2018 16:30
-
-
Save simontraill/956c1750883f0b3ad1c7b3cc802c9a8a to your computer and use it in GitHub Desktop.
Scrapes the top 100 books from Project Gutenberg (https://www.gutenberg.org) and uploads them to Swift, with metadata.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Quick and dirty Project Gutenberg scraper. Pulls the top 100 text ebooks,
# massages the metadata to make it more Elasticsearch friendly, and uploads
# each book to Swift with that metadata attached.
#
# Usage: edit the params below (or export them in the environment) and run me.
#
# It's slow. This is deliberate, because otherwise Gutenberg will block your
# connection (for up to 24 hours). Which is slower.
#
# straill 2018/04/18

# Swift connection settings. Each one may be overridden from the environment;
# the values after ':-' are the defaults.
SWIFT_HOST=${SWIFT_HOST:-localhost:3880}
SWIFT_PROTO=${SWIFT_PROTO:-http} # ( http | https )
SWIFT_USER=${SWIFT_USER:-gutenberg}
SWIFT_PASSWORD=${SWIFT_PASSWORD:-password}
SWIFT_AUTH_URL=${SWIFT_PROTO}://${SWIFT_HOST}/auth/v1.0
SWIFT_STORAGE_URL=${SWIFT_PROTO}://${SWIFT_HOST}/v1/AUTH_${SWIFT_USER}

# Work from the directory that holds this script, so the relative paths used
# for cached downloads always resolve to the same place.
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Use a brace group, not a subshell: 'exit' inside ( ... ) would only leave the
# subshell and the script would carry on in the wrong directory.
cd "$DIR" || { echo "Failed to navigate to ${DIR}" >&2; exit 1; }

# Number of opening lines of each book to keep as metadata.
# LINES is also a variable bash maintains itself (terminal height) while the
# 'checkwinsize' option is on; switch that off so the value set here is not
# overwritten after an external command runs.
shopt -u checkwinsize
LINES=100
# Grab the top 100, but be polite: the listing page is cached in ./top100.html
# and only fetched when no non-empty copy exists. The download goes to a
# temporary name first so a failed or partial fetch is never mistaken for a
# valid cached page on the next run.
if [ ! -s ./top100.html ]; then
  if ! curl -fsSL https://www.gutenberg.org/browse/scores/top -o ./top100.html.tmp; then
    rm -f ./top100.html.tmp
    echo "Failed to download the Gutenberg top 100 page" >&2
    exit 1
  fi
  mv ./top100.html.tmp ./top100.html
fi

# Extract the ebook ids: take everything from the "Top 100 EBooks yesterday"
# heading onwards, keep the first 100 /ebooks/ links, and cut out the text
# between '/ebooks/' and the closing quote of the href.
sed -n -e '/Top 100 EBooks yesterday/,$p' ./top100.html \
  | grep 'href="/ebooks/' \
  | head -n 100 \
  | awk -F '/ebooks/' '{print $2}' \
  | awk -F '"' '{print $1}' > ./top100.data

# Authenticate against Swift and pull the token out of the response headers.
# The match is case-insensitive and anchored on the colon so that a header
# which merely starts with the same name (e.g. X-Auth-Token-Expires) is not
# picked up as well. Header lines end in CRLF, hence the carriage-return strip.
TOKEN=$(
  curl -s -i "${SWIFT_AUTH_URL}" \
    -H "x-auth-user: ${SWIFT_USER}" \
    -H "x-auth-key: ${SWIFT_PASSWORD}" 2>/dev/null \
    | grep -i '^X-Auth-Token:' \
    | head -n 1 \
    | awk '{print $NF}' \
    | tr -d '\r'
)
if [ -z "${TOKEN}" ]; then
  echo "Failed to obtain a Swift auth token from ${SWIFT_AUTH_URL}" >&2
  exit 1
fi
# Main scrape loop: for every ebook id in ./top100.data, download the book's
# landing page (./meta/<id>) and plain-text file (./books/<id>.txt) unless a
# non-empty cached copy exists, extract metadata from the landing page, and
# PUT the text to Swift with the metadata as x-object-meta-* headers.
# Reads SWIFT_STORAGE_URL, TOKEN and LINES, which are set earlier in the script.

# Print, for each line of file $2 containing the fixed string $1, the text
# between the first '>' and the next '<' on that line.
tag_text() {
  grep -F -- "$1" "$2" | awk -F'>' '{print $2}' | awk -F'<' '{print $1}'
}

# Print $1 as a single line: carriage returns are dropped and newlines are
# replaced by $2 (default: one space). Scraped values can span several lines,
# and a header value must not contain a line break.
one_line() {
  local value=$1
  local sep=${2:- }
  value=${value//$'\r'/}
  printf '%s' "${value//$'\n'/${sep}}"
}

GUTENBERG_URL=https://www.gutenberg.org
CONTAINER=books
# Number of opening lines to keep, taken from LINES; falls back to 100 when
# LINES is unset or not a positive integer.
OPENING_LINE_COUNT=${LINES:-100}
case "${OPENING_LINE_COUNT}" in
  ''|*[!0-9]*|0) OPENING_LINE_COUNT=100 ;;
esac
# Prefer python3; the date snippet below runs on either major version.
PYTHON=$(command -v python3 || command -v python)

# The download targets below live in these directories.
mkdir -p ./meta ./books || { echo "Failed to create ./meta and ./books" >&2; exit 1; }

# Create the container. The request does not depend on the book, so it is
# issued once rather than once per iteration. Output is discarded as before.
curl -H "x-auth-token: ${TOKEN}" -X PUT "${SWIFT_STORAGE_URL}/${CONTAINER}" >/dev/null 2>&1

# The id list is read on fd 3 so nothing inside the loop can consume it via stdin.
while IFS= read -r -u 3 BOOK || [ -n "${BOOK}" ]; do
  # Ids are interpolated into URLs and file paths; accept digits only.
  case "${BOOK}" in
    ''|*[!0-9]*)
      echo " >> skipping unexpected entry '${BOOK}'" >&2
      continue
      ;;
  esac

  echo "Scraping book ${BOOK}..."
  META_FILE="./meta/${BOOK}"
  BOOK_FILE="./books/${BOOK}.txt"
  ALT_URL="${GUTENBERG_URL}/files/${BOOK}/${BOOK}-0.txt"

  # --fail makes curl return an error on HTTP failures instead of saving the
  # error page; a failed download is removed so it is retried on the next run.
  if [ ! -s "${META_FILE}" ]; then
    curl -fsSL "${GUTENBERG_URL}/ebooks/${BOOK}" -o "${META_FILE}" || rm -f "${META_FILE}"
  fi
  if [ ! -s "${BOOK_FILE}" ]; then
    curl -fsSL "${GUTENBERG_URL}/files/${BOOK}/${BOOK}.txt" -o "${BOOK_FILE}" || rm -f "${BOOK_FILE}"
  fi
  if [ ! -s "${BOOK_FILE}" ]; then
    echo " >> could not find book, requesting alternate at ${ALT_URL}"
    curl -fsSL "${ALT_URL}" -o "${BOOK_FILE}" || rm -f "${BOOK_FILE}"
  fi

  if [ ! -s "${BOOK_FILE}" ]; then
    echo " >> FAILED to download book ${BOOK}" >&2
    exit 1
  fi
  if [ ! -s "${META_FILE}" ]; then
    echo " >> FAILED to get meta for book ${BOOK}" >&2
    exit 1
  fi

  # --- Metadata scraped from the landing page -------------------------------
  # Several matching lines (e.g. more than one agent) are joined with "; ".
  AUTHOR=$( one_line "$( tag_text 'typeof="pgterms:agent"' "${META_FILE}" )" '; ' )
  # Language: first tag text on the line that follows the <th>Language</th> cell.
  LANGUAGE=$( one_line "$( perl -0777 -ne 'if (m/<th>Language<\/th>\s*([^\n]+)/) { my $row = $1; print $1 if $row =~ m/>([^<]+)/ }' "${META_FILE}" )" )
  CATEGORY=$( one_line "$( tag_text 'property="dcterms:type"' "${META_FILE}" )" )
  EBOOK_NO=${BOOK}
  RDATE=$( tag_text 'itemprop="datePublished"' "${META_FILE}" | head -n 1 )
  # Reformat the release date as YYYY/MM/DD HH:MM:SS (ES friendly). The raw
  # value is passed as an argument rather than pasted into the Python source.
  RELEASE_DATE=
  if [ -n "${RDATE}" ]; then
    RELEASE_DATE=$( "${PYTHON:-python}" -c 'import sys
from dateutil.parser import parse
print(parse(sys.argv[1]).strftime("%Y/%m/%d %H:%M:%S"))' "${RDATE}" )
  fi
  COPYRIGHT_STATUS=$( one_line "$( tag_text 'property="dcterms:rights"' "${META_FILE}" )" )
  if [[ "${COPYRIGHT_STATUS}" =~ ^[[:space:]]*Copyrighted\. ]]; then
    IS_COPYRIGHTED=true
  else
    IS_COPYRIGHTED=false
  fi
  # Download count: first whitespace-separated word of the tag text.
  DOWNLOADS=$( tag_text 'itemprop="interactionCount"' "${META_FILE}" | awk '{print $1}' | head -n 1 )
  # Price with the first '$' removed; only lines that actually match print.
  PRICE_USD=$( grep -F 'itemprop="priceCurrency"' "${META_FILE}" \
    | perl -ne 'if (m/"price">([^<]+)/) { (my $p = $1) =~ s/\$//; print $p }' )
  PRICE_USD=$( one_line "${PRICE_USD}" )
  TITLE=$( one_line "$( perl -0777 -ne 'print $1 if m/<td itemprop="headline">\s*([^<]+)/' "${META_FILE}" )" )

  # --- Opening lines of the book text ---------------------------------------
  # Drop blank and indented lines, start at the "*** START OF" marker, skip the
  # marker line itself, keep the next OPENING_LINE_COUNT lines, squeeze each
  # whitespace run to one space, strip every non-word character, and cap the
  # result at 900 characters. LC_ALL=C / -a stop grep from treating book text
  # in a non-UTF-8 encoding as binary.
  OPENING_LINES=$(
    LC_ALL=C grep -a -Ev '^($|[[:space:]])' "${BOOK_FILE}" \
      | sed -n -e '/\*\*\* START OF/,$p' \
      | tail -n +2 \
      | head -n "${OPENING_LINE_COUNT}" \
      | perl -0777 -pe 's/\s+/ /g; s/[^\w ]+//g' \
      | cut -c 1-900
  )

  # --- Upload ---------------------------------------------------------------
  if curl -fsS -X PUT "${SWIFT_STORAGE_URL}/${CONTAINER}/${BOOK}.txt" \
      --data-binary "@${BOOK_FILE}" \
      -H "x-auth-token: ${TOKEN}" \
      -H "x-object-meta-author: ${AUTHOR}" \
      -H "x-object-meta-language: ${LANGUAGE}" \
      -H "x-object-meta-category: ${CATEGORY}" \
      -H "x-object-meta-ebook-number: ${EBOOK_NO}" \
      -H "x-object-meta-release-date: ${RELEASE_DATE}" \
      -H "x-object-meta-copyright-status: ${COPYRIGHT_STATUS}" \
      -H "x-object-meta-downloads: ${DOWNLOADS}" \
      -H "x-object-meta-price-usd: ${PRICE_USD}" \
      -H "x-object-meta-title: ${TITLE}" \
      -H "x-object-meta-is-copyrighted: ${IS_COPYRIGHTED}" \
      -H "x-object-meta-opening-lines: ${OPENING_LINES}"
  then
    echo " >> Scraped book ${BOOK} OK."
  else
    echo " >> FAILED to upload book ${BOOK}" >&2
  fi

  # Deliberate pause between books to avoid being blocked by Gutenberg.
  sleep 10
done 3< ./top100.data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment