Created
July 5, 2013 18:14
-
-
Save isovector/5936263 to your computer and use it in GitHub Desktop.
Python script that scrapes Dictionary.com's Word of the Day archive into an Anki deck
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import calendar
import re
import urllib2
# Precompile the regexes once; every scraped page is matched against them.

# Headword: the text that follows the `"hw">` attribute/tag close, up to the next '<'.
wordRE = re.compile(r'"hw">([^<]+)<')
# Pronunciation: the text after a literal backslash, up to the next ';', ','
# or backslash.
pronounceRE = re.compile(r'\\([^;,\\]+)')
# Word type (presumably the part of speech): contents of the first <i>...</i>.
typeRE = re.compile(r'<i>([^<]+)</i>')
# First numbered definition. The dot is escaped so that only a literal "1."
# matches; unescaped it would also accept markup such as "<b>10</b> ".
defRE = re.compile(r'<b>1\.</b> ([^<]+)')
# Fallback for entries without a numbered definition: the text between "-->"
# and "</p>". The negated class already spans newlines, so re.DOTALL has no
# effect here; it is kept unchanged.
otherDefRE = re.compile(r'-->([^<]+)</p>', re.DOTALL)
# Helper: pull the first capture group of a regex out of a string.
def get(html, RE):
    """Return capture group 1 of the first ``RE`` match in ``html``, stripped.

    ``RE`` is a compiled pattern with at least one group. When the pattern
    does not match, ``search`` returns None and the ``group`` call raises
    AttributeError.
    """
    found = RE.search(html)
    captured = found.group(1)
    return captured.strip()
# Helper: extract with a preferred regex, falling back to a second one.
def getTry(html, re1, re2):
    """Return the stripped group 1 of ``re1`` in ``html``, else that of ``re2``.

    ``re1`` is tried first; only when it does not match is ``re2`` handed
    to ``get``, which raises AttributeError if that one does not match either.
    """
    primary = re1.search(html)
    if primary is None:
        # preferred pattern missed; defer to the fallback pattern
        return get(html, re2)
    return primary.group(1).strip()
def scrapePage(year, month, day):
    """Fetch one day's word-of-the-day page and print it as two deck rows.

    On success two tab-separated lines go to stdout:
    "word /pronunciation/ (type)<TAB>definition" and the same pair reversed.
    A day whose page cannot be fetched or parsed is skipped without output.

    year, month, day -- integers formatted into the archive URL.
    """
    url = "http://dictionary.reference.com/wordoftheday/archive/%d/%02d/%02d.html" % (year, month, day)
    try:
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            # release the connection even when the read fails
            response.close()

        # Narrow the page to the entry itself. The end marker is searched
        # from the start marker onwards, so the text before it is not
        # scanned twice.
        start = html.find("<span class=\"hw\">")
        if start == -1:
            # no headword on this page, nothing to extract
            return
        end = html.find("<!-- SECBR -->", start)
        if end == -1:
            # end marker missing: keep everything after the headword
            html = html[start:]
        else:
            html = html[start:end]

        word = "%s /%s/ (%s)" % (get(html, wordRE), get(html, pronounceRE), get(html, typeRE))
        defin = getTry(html, defRE, otherDefRE)
    except Exception:
        # Best-effort scrape: missing pages and unexpected markup are skipped.
        # Catching Exception rather than everything lets KeyboardInterrupt
        # and SystemExit propagate, so the run can still be stopped.
        return
    else:
        # One parenthesised argument, so the print statement emits the plain
        # string: first the word -> definition row, then the reverse.
        print("%s\t%s" % (word, defin))
        print("%s\t%s" % (defin, word))
# Visit every calendar day from 2000-01-01 through 2012-12-31.
# calendar.monthrange gives the true length of each month, so February 28 and
# the February 29 of leap years are both included.
for year in range(2000, 2013):
    for month in range(1, 13):
        lastDay = calendar.monthrange(year, month)[1]
        for day in range(1, lastDay + 1):
            scrapePage(year, month, day)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment