Skip to content

Instantly share code, notes, and snippets.

@simontraill
Last active April 18, 2018 16:30
Show Gist options
  • Save simontraill/956c1750883f0b3ad1c7b3cc802c9a8a to your computer and use it in GitHub Desktop.
Save simontraill/956c1750883f0b3ad1c7b3cc802c9a8a to your computer and use it in GitHub Desktop.
Scrapes the top 100 books from the Gutenberg project (https://www.gutenberg.org) and uploads them to Swift - with metadata.
# Quick and dirty Project Gutenberg scraper. Pulls the top 100 plain-text ebooks, massages the metadata to make it more Elasticsearch-friendly, and uploads them to Swift.
# Usage: Edit the params below and run me.
# It's slow. This is deliberate, because otherwise Gutenberg will block your connection (for up to 24 hours). Which is slower.
# straill 2018/04/18
# --- Swift connection settings ------------------------------------------------
# Every setting may be overridden from the environment; the values after ":-"
# are the defaults used when the variable is unset or empty.
SWIFT_HOST=${SWIFT_HOST:-localhost:3880}
SWIFT_PROTO=${SWIFT_PROTO:-http} # ( http | https )
SWIFT_USER=${SWIFT_USER:-gutenberg}
SWIFT_PASSWORD=${SWIFT_PASSWORD:-password}
# Endpoints derived from the settings above.
SWIFT_AUTH_URL=${SWIFT_PROTO}://${SWIFT_HOST}/auth/v1.0
SWIFT_STORAGE_URL=${SWIFT_PROTO}://${SWIFT_HOST}/v1/AUTH_${SWIFT_USER}

# Work from the directory that contains this script, so the relative paths
# used later (./top100.html, ./meta, ./books) always resolve to the same place.
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# A brace group (not a subshell) is required here: "exit" inside ( ... ) would
# only leave the subshell and the script would carry on in the wrong directory.
cd "${DIR}" || { echo "Failed to navigate to ${DIR}" >&2; exit 1; }

# Number of opening lines of each book to publish as metadata.
# LINES is also a variable that bash itself maintains: with "checkwinsize"
# enabled bash may overwrite it with the terminal height after running an
# external command. Switch that off so the value set here is the one used.
shopt -u checkwinsize
LINES=100
# Grab the top 100, but be polite: the listing page is downloaded only when no
# cached copy exists in ./top100.html.
if [ ! -f ./top100.html ]; then
  # -f turns HTTP errors into a failing exit status and -L follows redirects.
  # The page is written to a temporary name first so that a failed transfer
  # can never leave an empty or partial ./top100.html behind to be mistaken
  # for a valid cache on the next run.
  if curl -fsSL https://www.gutenberg.org/browse/scores/top -o ./top100.html.part; then
    mv -f ./top100.html.part ./top100.html
  else
    rm -f ./top100.html.part
    echo "Failed to download https://www.gutenberg.org/browse/scores/top" >&2
    exit 1
  fi
fi

# Build ./top100.data, one book id per line:
#   1. keep everything from the "Top 100 EBooks yesterday" heading onwards,
#   2. keep the lines that link to /ebooks/...,
#   3. take the first 100 of them,
#   4. cut the text between "/ebooks/" and the closing double quote.
sed -n -e '/Top 100 EBooks yesterday/,$p' ./top100.html \
  | grep 'href="/ebooks/' \
  | head -n 100 \
  | awk -F '/ebooks/' '{print $2}' \
  | awk -F '"' '{print $1}' > ./top100.data
# Authenticate against Swift (v1.0 auth) and keep the token for later requests.
# The response headers are scanned for "X-Auth-Token":
#   - carriage returns are stripped first (HTTP header lines end in CRLF),
#   - the header name is compared case-insensitively and in full, so a
#     lower-cased header still matches and a longer header name that merely
#     starts with "X-Auth-Token" is not picked up,
#   - only the first match is used.
TOKEN=$(
  curl -s -i "${SWIFT_AUTH_URL}" \
    -H "x-auth-user: ${SWIFT_USER}" \
    -H "x-auth-key: ${SWIFT_PASSWORD}" 2>/dev/null \
    | tr -d '\r' \
    | awk -F ':[ \t]*' 'tolower($1) == "x-auth-token" { print $2; exit }'
)
# Without a token every later request would be unauthenticated; stop here
# with a clear message instead.
if [ -z "${TOKEN}" ]; then
  echo "Failed to obtain an auth token from ${SWIFT_AUTH_URL}" >&2
  exit 1
fi
# Container that receives every uploaded book.
CONTAINER=books

# Local cache directories; curl cannot write into a directory that is missing.
mkdir -p ./meta ./books || { echo "Failed to create ./meta and ./books" >&2; exit 1; }

#######################################
# Download a URL into a local file.
# The data is written to "<dest>.part" and renamed only when the transfer
# succeeded and produced a non-empty file, so a failed request never leaves
# an error page or an empty file under the final name.
# Arguments: $1 - URL to fetch, $2 - destination path
# Returns:   0 on success, 1 when the download failed or was empty
#######################################
gutenberg_fetch() {
  local url=$1 dest=$2
  if curl -fsSL "${url}" -o "${dest}.part" && [[ -s "${dest}.part" ]]; then
    mv -f -- "${dest}.part" "${dest}"
  else
    rm -f -- "${dest}.part"
    return 1
  fi
}

#######################################
# Print the text that follows the first ">" (up to the next "<") on every
# line of a file that matches a pattern. Several matches are joined with
# "; " so the result is always a single line.
# Arguments: $1 - grep pattern, $2 - file to search
# Outputs:   the extracted text on stdout (nothing when no line matches)
#######################################
meta_tag_text() {
  local pattern=$1 file=$2
  grep -- "${pattern}" "${file}" \
    | awk -F '>' '{ split($2, part, "<"); printf "%s%s", (n++ ? "; " : ""), part[1] }
                  END { if (n) print "" }'
}

# Create the container once up front; the request is the same for every book.
# The outcome is deliberately ignored, as a failure here will show up in the
# per-book upload check below.
curl -s -H "x-auth-token: ${TOKEN}" -X PUT "${SWIFT_STORAGE_URL}/${CONTAINER}" >/dev/null 2>&1

# Matches a rights statement that starts with "Copyrighted.".
copyright_re='^[[:space:]]*Copyrighted\.'

# The id list is read on file descriptor 3 so that commands inside the loop
# cannot consume it through stdin.
while IFS= read -r BOOK <&3 || [[ -n "${BOOK}" ]]; do
  [[ -n "${BOOK}" ]] || continue
  # The ids come from scraped HTML and end up in file paths and URLs, so
  # anything that is not a plain number is skipped.
  if [[ ! "${BOOK}" =~ ^[0-9]+$ ]]; then
    echo " >> skipping unexpected book id '${BOOK}'" >&2
    continue
  fi
  echo "Scraping book ${BOOK}..."

  meta_file=./meta/${BOOK}
  book_file=./books/${BOOK}.txt

  # Metadata page (cached between runs).
  if [[ ! -s "${meta_file}" ]]; then
    if ! gutenberg_fetch "https://www.gutenberg.org/ebooks/${BOOK}" "${meta_file}"; then
      echo " >> FAILED to get meta for book ${BOOK}" >&2
      exit 1
    fi
  fi

  # Book text (cached between runs). Try <id>.txt first, then <id>-0.txt.
  if [[ ! -s "${book_file}" ]]; then
    if ! gutenberg_fetch "https://www.gutenberg.org/files/${BOOK}/${BOOK}.txt" "${book_file}"; then
      alternate_url="https://www.gutenberg.org/files/${BOOK}/${BOOK}-0.txt"
      echo " >> could not find book, requesting alternate at ${alternate_url}"
      if ! gutenberg_fetch "${alternate_url}" "${book_file}"; then
        echo " >> FAILED to download book ${BOOK}" >&2
        exit 1
      fi
    fi
  fi

  # --- Metadata scraped from the book's page ---------------------------------
  AUTHOR=$( meta_tag_text 'typeof="pgterms:agent"' "${meta_file}" )
  # Language: first tag text on the line that follows the <th>Language</th> cell.
  LANGUAGE=$( perl -0777 -ne 'if (m/<th>Language<\/th>\s*([^\n]+)/) { my $row = $1; print $1 if $row =~ m/>([^<]+)/ }' "${meta_file}" )
  CATEGORY=$( meta_tag_text 'property="dcterms:type"' "${meta_file}" )
  EBOOK_NO=${BOOK}
  RDATE=$( meta_tag_text 'itemprop="datePublished"' "${meta_file}" )
  # Reformat the date for Elasticsearch. The scraped text is handed over as an
  # argument instead of being pasted into the Python source, and the snippet
  # avoids the print statement so it runs on Python 2 and Python 3 alike.
  RELEASE_DATE=$( python -c 'import sys; from dateutil.parser import parse; sys.stdout.write(parse(sys.argv[1]).strftime("%Y/%m/%d %H:%M:%S"))' "${RDATE}" ) # ES friendly
  COPYRIGHT_STATUS=$( meta_tag_text 'property="dcterms:rights"' "${meta_file}" )
  if [[ "${COPYRIGHT_STATUS}" =~ ${copyright_re} ]]; then
    IS_COPYRIGHTED=true
  else
    IS_COPYRIGHTED=false
  fi
  # Only the first word of the interaction count text is kept.
  DOWNLOADS=$( meta_tag_text 'itemprop="interactionCount"' "${meta_file}" | awk '{print $1}' )
  # Price with the first "$" removed.
  PRICE_USD=$( perl -ne 'if (/itemprop="priceCurrency"/ && m/"price">([^<]+)/) { (my $p = $1) =~ s/\$//; print $p }' "${meta_file}" )
  # Title, with any run of whitespace (including line breaks) collapsed so the
  # value stays on one line.
  TITLE=$( perl -0777 -ne 'if (m/<td itemprop="headline">\s*([^<]+)/) { (my $t = $1) =~ s/\s+/ /g; $t =~ s/ $//; print $t }' "${meta_file}" )

  # Opening lines of the book:
  #   - drop empty lines and lines that start with whitespace,
  #   - keep what follows the "*** START OF" marker line (tail -n +2 skips the
  #     marker itself regardless of how long the book is),
  #   - keep at most $LINES lines, flatten them to one line of words separated
  #     by spaces with every other non-word character removed,
  #   - cap the result at 900 characters.
  OPENING_LINES=$(
    grep -v -e '^[[:space:]]' -e '^$' "${book_file}" \
      | sed -n -e '/\*\*\* START OF/,$p' \
      | tail -n +2 \
      | head -n "${LINES}" \
      | perl -0777 -pe 's/\s+/ /g; s/[^\w ]+//g' \
      | cut -c 1-900
  )

  # Upload the book with its metadata headers. -f makes curl report HTTP
  # errors through its exit status so the result printed below is truthful.
  if curl -fsS -X PUT "${SWIFT_STORAGE_URL}/${CONTAINER}/${BOOK}.txt" \
      --data-binary "@${book_file}" \
      -H "x-auth-token: ${TOKEN}" \
      -H "x-object-meta-author: ${AUTHOR}" \
      -H "x-object-meta-language: ${LANGUAGE}" \
      -H "x-object-meta-category: ${CATEGORY}" \
      -H "x-object-meta-ebook-number: ${EBOOK_NO}" \
      -H "x-object-meta-release-date: ${RELEASE_DATE}" \
      -H "x-object-meta-copyright-status: ${COPYRIGHT_STATUS}" \
      -H "x-object-meta-downloads: ${DOWNLOADS}" \
      -H "x-object-meta-price-usd: ${PRICE_USD}" \
      -H "x-object-meta-title: ${TITLE}" \
      -H "x-object-meta-is-copyrighted: ${IS_COPYRIGHTED}" \
      -H "x-object-meta-opening-lines: ${OPENING_LINES}"; then
    echo " >> Scraped book ${BOOK} OK."
  else
    echo " >> FAILED to upload book ${BOOK}" >&2
  fi

  # Deliberate pause between books so Gutenberg does not block the connection.
  sleep 10
done 3< ./top100.data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment