Skip to content

Instantly share code, notes, and snippets.

@nop
Last active August 7, 2020 09:14
Show Gist options
  • Save nop/525d71594b5631c0338f0f2c84b5c26a to your computer and use it in GitHub Desktop.
Save nop/525d71594b5631c0338f0f2c84b5c26a to your computer and use it in GitHub Desktop.
Pull the complete works of William Shakespeare from shakespeare.mit.edu and convert to plain text
#!/usr/bin/env bash
#
# Pull the complete works of William Shakespeare from shakespeare.mit.edu,
# then convert them from HTML format to plain text with Pandoc.
#
# Requires bash: the script relies on arrays, "${!name}" indirect expansion
# and `declare -i`, none of which exist in POSIX sh (e.g. dash). curl and
# pandoc must be available on PATH.

# Plays, grouped by category. Each entry is the site's directory name for the
# work; the full text is fetched from
#   http://shakespeare.mit.edu/<entry>/full.html
comedy=(
  "allswell" "asyoulikeit" "comedy_errors" "cymbeline" "lll" "measure"
  "merry_wives" "merchant" "midsummer" "much_ado" "pericles" "taming_shrew"
  "tempest" "troilus_cressida" "twelfth_night" "two_gentlemen" "winters_tale"
)
history=(
  "1henryiv" "2henryiv" "henryv" "1henryvi" "2henryvi" "3henryvi"
  "henryviii" "john" "richardii" "richardiii"
)
tragedy=(
  "cleopatra" "coriolanus" "hamlet" "julius_caesar" "lear" "macbeth"
  "othello" "romeo_juliet" "timon" "titus"
)

# Longer poems, fetched from http://shakespeare.mit.edu/Poetry/<entry>.html
poetry=("LoversComplaint" "RapeOfLucrece" "VenusAndAdonis" "elegy")

# Sonnets are addressed by Roman numeral:
#   http://shakespeare.mit.edu/Poetry/sonnet.<numeral>.html
# Ten numerals per row, in ascending order (I .. CLIV, 154 in total).
sonnets=(
  "I" "II" "III" "IV" "V" "VI" "VII" "VIII" "IX" "X"
  "XI" "XII" "XIII" "XIV" "XV" "XVI" "XVII" "XVIII" "XIX" "XX"
  "XXI" "XXII" "XXIII" "XXIV" "XXV" "XXVI" "XXVII" "XXVIII" "XXIX" "XXX"
  "XXXI" "XXXII" "XXXIII" "XXXIV" "XXXV" "XXXVI" "XXXVII" "XXXVIII" "XXXIX" "XL"
  "XLI" "XLII" "XLIII" "XLIV" "XLV" "XLVI" "XLVII" "XLVIII" "XLIX" "L"
  "LI" "LII" "LIII" "LIV" "LV" "LVI" "LVII" "LVIII" "LIX" "LX"
  "LXI" "LXII" "LXIII" "LXIV" "LXV" "LXVI" "LXVII" "LXVIII" "LXIX" "LXX"
  "LXXI" "LXXII" "LXXIII" "LXXIV" "LXXV" "LXXVI" "LXXVII" "LXXVIII" "LXXIX" "LXXX"
  "LXXXI" "LXXXII" "LXXXIII" "LXXXIV" "LXXXV" "LXXXVI" "LXXXVII" "LXXXVIII" "LXXXIX" "XC"
  "XCI" "XCII" "XCIII" "XCIV" "XCV" "XCVI" "XCVII" "XCVIII" "XCIX" "C"
  "CI" "CII" "CIII" "CIV" "CV" "CVI" "CVII" "CVIII" "CIX" "CX"
  "CXI" "CXII" "CXIII" "CXIV" "CXV" "CXVI" "CXVII" "CXVIII" "CXIX" "CXX"
  "CXXI" "CXXII" "CXXIII" "CXXIV" "CXXV" "CXXVI" "CXXVII" "CXXVIII" "CXXIX" "CXXX"
  "CXXXI" "CXXXII" "CXXXIII" "CXXXIV" "CXXXV" "CXXXVI" "CXXXVII" "CXXXVIII" "CXXXIX" "CXL"
  "CXLI" "CXLII" "CXLIII" "CXLIV" "CXLV" "CXLVI" "CXLVII" "CXLVIII" "CXLIX" "CL"
  "CLI" "CLII" "CLIII" "CLIV"
)

# Names of the play arrays above. Each name doubles as the output directory
# for that category and is resolved to its array via indirect expansion.
# "poetry" is deliberately absent: poems and sonnets use a different URL
# layout and are handled separately.
categories=("comedy" "history" "tragedy")
# Download every play named in the per-category arrays and store it as plain
# text in "<category>/<work>.txt" (relative to the current directory).
for category in "${categories[@]}"; do
  # -p: create the directory if needed, succeed quietly if it already exists.
  mkdir -p -- "${category}"

  # Indirect expansion: "${!works}" yields the elements of the array whose
  # name is held in ${category} (comedy, history or tragedy).
  works="${category}[@]"
  for work in "${!works}"; do
    printf '%s: %s\n' "${category}" "${work}"
    dest="${category}/${work}.txt"

    # --fail makes curl exit non-zero on an HTTP error instead of passing the
    # server's error page on to pandoc as if it were the play.
    curl --fail -L --no-progress-meter "http://shakespeare.mit.edu/${work}/full.html" |
      pandoc -f html -t plain -- > "${dest}"
    # Snapshot immediately: PIPESTATUS is overwritten by the next command.
    stages=("${PIPESTATUS[@]}")

    if (( stages[0] != 0 || stages[1] != 0 )); then
      printf 'warning: failed to fetch or convert %s (curl=%d, pandoc=%d)\n' \
        "${work}" "${stages[0]}" "${stages[1]}" >&2
      # The redirection has already truncated the file; do not leave an
      # empty or partial text behind that looks like a finished download.
      rm -f -- "${dest}"
    fi
  done
done
# Download the longer poems and store each as "poetry/<poem>.txt".
# "poetry/sonnets" is created here as well; -p builds both levels and is a
# no-op when they already exist, so no prior existence test is needed.
mkdir -p -- "poetry/sonnets"
for poem in "${poetry[@]}"; do
  printf 'poetry: %s\n' "${poem}"
  dest="poetry/${poem}.txt"

  # --fail makes curl exit non-zero on an HTTP error instead of passing the
  # server's error page on to pandoc as if it were the poem.
  curl --fail -L --no-progress-meter "http://shakespeare.mit.edu/Poetry/${poem}.html" |
    pandoc -f html -t plain -- > "${dest}"
  # Snapshot immediately: PIPESTATUS is overwritten by the next command.
  stages=("${PIPESTATUS[@]}")

  if (( stages[0] != 0 || stages[1] != 0 )); then
    printf 'warning: failed to fetch or convert %s (curl=%d, pandoc=%d)\n' \
      "${poem}" "${stages[0]}" "${stages[1]}" >&2
    # The redirection has already truncated the file; do not leave an empty
    # or partial text behind that looks like a finished download.
    rm -f -- "${dest}"
  fi
done
# Download every sonnet and store it as "poetry/sonnets/<NNN> - <numeral>.txt",
# where <NNN> is the sonnet's 1-based position, zero-padded to three digits so
# the files sort in reading order and match the number shown in the log line.
mkdir -p -- "poetry/sonnets"
declare -i sonnet_no=0
for sonnet in "${sonnets[@]}"; do
  # Increment first so that sonnet I is number 1, not 0.
  sonnet_no+=1
  # Keep the padded form in a separate, non-integer variable: assigning
  # something like "008" to a `declare -i` variable would be parsed as octal.
  printf -v padded_no '%03d' "${sonnet_no}"

  # Data goes in arguments, never in the format string.
  printf 'sonnets: %s - %s\n' "${padded_no}" "${sonnet}"
  dest="poetry/sonnets/${padded_no} - ${sonnet}.txt"

  # --fail makes curl exit non-zero on an HTTP error instead of passing the
  # server's error page on to pandoc as if it were the sonnet.
  curl --fail -L --no-progress-meter "http://shakespeare.mit.edu/Poetry/sonnet.${sonnet}.html" |
    pandoc -f html -t plain -- > "${dest}"
  # Snapshot immediately: PIPESTATUS is overwritten by the next command.
  stages=("${PIPESTATUS[@]}")

  if (( stages[0] != 0 || stages[1] != 0 )); then
    printf 'warning: failed to fetch or convert sonnet %s (curl=%d, pandoc=%d)\n' \
      "${sonnet}" "${stages[0]}" "${stages[1]}" >&2
    # The redirection has already truncated the file; do not leave an empty
    # or partial text behind that looks like a finished download.
    rm -f -- "${dest}"
  fi
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment