Skip to content

Instantly share code, notes, and snippets.

@isovector
Created July 5, 2013 18:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save isovector/5936263 to your computer and use it in GitHub Desktop.
Save isovector/5936263 to your computer and use it in GitHub Desktop.
Python script to scrape Dictionary.com's Word of the Day archive into an Anki deck
import calendar
import re
import urllib2
# Precompile the regexes once; each of them is applied to every scraped page.

# Headword: the text that follows `"hw">` up to the next tag.
wordRE = re.compile(r'"hw">([^<]+)<')
# Pronunciation: the text after a backslash, up to the next ';', ',' or
# backslash.
pronounceRE = re.compile(r'\\([^;,\\]+)')
# Contents of the first <i>...</i> element (printed in parentheses after the
# pronunciation; presumably the part of speech -- verify against a live page).
typeRE = re.compile(r'<i>([^<]+)</i>')
# First numbered definition. The dot is escaped so that only the literal
# marker "<b>1.</b>" matches, not "<b>10</b>", "<b>11</b>", etc.
defRE = re.compile(r'<b>1\.</b> ([^<]+)')
# Fallback definition: the text between a closing "-->" and "</p>".
otherDefRE = re.compile(r'-->([^<]+)</p>', re.DOTALL)
# helper function to extract a regex from a string
def get(html, RE):
    """Return the stripped first capture group of RE's first match in html.

    html -- the text to search.
    RE   -- a compiled pattern with at least one capture group.

    Raises AttributeError when the pattern does not match. That is the same
    exception type the unguarded ``RE.search(html).group(1)`` produced, so
    existing callers keep working, but the message now names the pattern.
    """
    match = RE.search(html)
    if match is None:
        raise AttributeError("pattern %r not found" % RE.pattern)
    return match.group(1).strip()
# helper function to extract one of two regexs from a string
def getTry(html, re1, re2):
    """Return the stripped first capture group of re1, falling back to re2.

    html -- the text to search.
    re1  -- the preferred compiled pattern (one capture group).
    re2  -- the compiled pattern to use when re1 does not match.

    Raises AttributeError when neither pattern matches, which is the
    exception type the fallback lookup has always produced on a miss.
    """
    for pattern in (re1, re2):
        match = pattern.search(html)
        if match:
            return match.group(1).strip()
    raise AttributeError(
        "neither %r nor %r found" % (re1.pattern, re2.pattern))
def scrapePage(year, month, day):
    """Scrape one word-of-the-day archive page and print it as two cards.

    year, month, day -- integers identifying the archive page to fetch.

    On success two tab-separated lines are printed to stdout: first
    "word /pronunciation/ (type)<TAB>definition", then the same two fields
    swapped. A page that cannot be fetched, or that lacks the expected
    markup, is skipped without output; nothing is returned either way.
    """
    url = "http://dictionary.reference.com/wordoftheday/archive/%d/%02d/%02d.html" % (year, month, day)
    try:
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
    except Exception:
        # Best-effort scrape: a day whose page cannot be fetched is skipped.
        # Unlike a bare ``except:`` this lets KeyboardInterrupt and
        # SystemExit through, so the run can still be interrupted.
        return

    # Keep only the entry itself. The end marker is searched for from the
    # start marker onwards, so the page is scanned only once.
    start = html.find("<span class=\"hw\">")
    if start == -1:
        return
    end = html.find("<!-- SECBR -->", start)
    html = html[start:end] if end != -1 else html[start:]

    try:
        word = "%s /%s/ (%s)" % (get(html, wordRE), get(html, pronounceRE), get(html, typeRE))
        defin = getTry(html, defRE, otherDefRE)
    except AttributeError:
        # The helpers raise AttributeError when a pattern is missing; treat
        # that as "no usable entry on this page" and skip it.
        return

    # A parenthesised single argument prints identically under the Python 2
    # print statement this script otherwise uses.
    print("%s\t%s" % (word, defin))
    print("%s\t%s" % (defin, word))
# Visit every calendar day from 2000-01-01 through 2012-12-31.
# calendar.monthrange supplies the real length of each month, so February 28th
# (and the 29th in leap years) is no longer skipped.
for year in range(2000, 2013):
    for month in range(1, 13):
        days_in_month = calendar.monthrange(year, month)[1]
        for day in range(1, days_in_month + 1):
            scrapePage(year, month, day)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment