Skip to content

Instantly share code, notes, and snippets.

@infinity0
Last active December 15, 2015 06:39
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save infinity0/5217482 to your computer and use it in GitHub Desktop.
Save infinity0/5217482 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
#
# Usage:
#
# $ ./conception.py 1969-12-25
# searching for most recent event before 1969-04-03
# <li><a href="/wiki/April_1" title="April 1">April 1</a>
# &#226;&#8364;&#8220; The <a href="/wiki/Hawker_Siddeley_Harrier" title="Hawker Siddeley Harrier">Hawker Siddeley Harrier</a> enters
# service with the <a href="/wiki/Royal_Air_Force" title="Royal Air Force">Royal Air Force</a>.</li>
#
# If python complains about ImportError, install tidy/lxml python libs
#
# $ aptitude install python-utidylib python-lxml
from bisect import bisect
from lxml import etree
import os.path
from functools import partial
import sys
import tidy
import time
import urllib2
# TODO: better estimate from your birth weight and location
# Average pregnancy length in seconds (266 days); subtracted from the birth
# date to estimate the conception date.
AV_PREG_LENGTH = 266 * 24 * 3600

# Options handed to tidy.parseString() when cleaning up the fetched page.
# char_encoding is set explicitly: without it tidy does not read the page
# bytes as UTF-8, and non-ASCII characters (such as the en dash after each
# date) come out as mojibake entities like "&#226;&#8364;&#8220;".
TIDY_OPTS = dict(
    output_xhtml=1,
    add_xml_decl=1,
    tidy_mark=0,
    char_encoding="utf8",
)
def get_content(url):
    """Fetch *url* and return the page as a parsed lxml element tree.

    The raw response is first cleaned up by tidy (using TIDY_OPTS) and the
    result is then parsed with etree.HTML().

    url -- address of the page to download

    Errors raised by urllib2 while opening or reading the URL propagate to
    the caller.
    """
    # http://stackoverflow.com/questions/120061/fetch-a-wikipedia-article-with-python
    # A browser-like User-agent is sent instead of urllib2's default one
    # (see the link above for the reason).
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    infile = opener.open(url)
    try:
        text = infile.read()
    finally:
        # always release the connection, even if read() fails
        infile.close()
    return etree.HTML(str(tidy.parseString(text, **TIDY_OPTS)))
def md_to_key(d):
    """Return a zero-padded "MM.DD" string for the month and day of *d*.

    d -- a time.struct_time (only tm_mon and tm_mday are read)

    Because both fields are zero-padded, the strings sort in calendar order.
    """
    month, day = d.tm_mon, d.tm_mday
    return "%02d.%02d" % (month, day)
def parse_li(parent, b_i, e_i, l):
    """Turn one <li> element into a ``(key, element)`` event entry.

    parent -- element whose direct children include the section delimiters
    b_i    -- index in *parent* of the section's opening heading (exclusive)
    e_i    -- index in *parent* of the section's end (exclusive)
    l      -- the <li> element to examine

    Returns ``(md_to_key(date), l)`` when *l* lies strictly between the two
    delimiters and its first link's href ends in a "Month_Day" name (for
    example "April_1"); returns None for every other <li>.
    """
    try:
        # ensure node is inside "events" section: climb to the ancestor that
        # is a direct child of `parent` and look at its position there
        p = l
        while p.getparent() != parent:
            p = p.getparent()
        i = parent.index(p)
        if not (b_i < i < e_i):
            return None
        n = l.find(".//a[@href]")
        s = os.path.basename(n.get("href"))
        d = time.strptime(s, "%B_%d")
    except (AttributeError, TypeError, ValueError):
        # Expected "not an event" cases: the node is not under `parent`
        # (getparent() runs out and yields None), it has no link, or the
        # link target is not a "Month_Day" page.  Anything else is a real
        # bug and is allowed to propagate.
        return None
    return (md_to_key(d), l)
def get_events(y):
    """Return the dated entries of the "Events" section for year *y*.

    Downloads http://en.wikipedia.org/wiki/<y>, locates the heading that
    contains the element with id "Events", and collects every <li> between
    that heading and the next <h2> sibling (or the end of the parent when
    no further <h2> follows).

    y -- the year, used verbatim as the article name

    Returns a list of ``(key, element)`` tuples as produced by parse_li(),
    sorted by key; entries sharing a key keep their document order.

    Raises ValueError if the page has no "Events" heading.
    """
    dom = get_content("http://en.wikipedia.org/wiki/%s" % y)
    # find delimiters of "events" section
    eventsh2 = dom.find(".//span[@id='Events']/..")
    if eventsh2 is None:
        raise ValueError("no 'Events' section found on page for %s" % y)
    parent = eventsh2.getparent()
    b_i = parent.index(eventsh2)
    nexth2 = eventsh2.getnext()
    while nexth2 is not None and nexth2.tag != 'h2':
        nexth2 = nexth2.getnext()
    # "Events" may be the last section: then everything after it belongs to it
    e_i = len(parent) if nexth2 is None else parent.index(nexth2)
    events = []
    for li in dom.findall(".//li"):
        entry = parse_li(parent, b_i, e_i, li)
        if entry is not None:
            events.append(entry)
    # Sort on the date key only; comparing whole tuples would fall back to
    # comparing the elements themselves whenever two events share a date.
    events.sort(key=lambda entry: entry[0])
    return events
if __name__ == '__main__':
    # Script entry point: take a birth date (YYYY-MM-DD) on the command line,
    # step back AV_PREG_LENGTH from it and print the most recent listed
    # event strictly before that estimated conception date.

    # Imported here because only the script entry point needs them.
    from bisect import bisect_left
    from datetime import datetime, timedelta

    if len(sys.argv) < 2:
        sys.exit("usage: %s YYYY-MM-DD" % sys.argv[0])

    # Plain calendar arithmetic: going through mktime()/localtime() can land
    # on the neighbouring day when the two dates differ in DST, and mktime()
    # rejects dates before the epoch on some platforms.
    birth = datetime.strptime(sys.argv[1], "%Y-%m-%d")
    d = (birth - timedelta(seconds=AV_PREG_LENGTH)).timetuple()
    print(time.strftime("searching for most recent event before %Y-%m-%d", d))

    dd = get_events(d.tm_year)
    # Search on the date keys alone so elements are never compared;
    # bisect_left puts same-day events after the cut, i.e. "strictly before".
    i = bisect_left([entry[0] for entry in dd], md_to_key(d))
    if i > 0:
        ev = dd[i-1]
    else:
        # need to check previous year
        dd = get_events(d.tm_year - 1)
        if not dd:
            sys.exit("no dated events found before the estimated date")
        ev = dd[-1]
    print(etree.tostring(ev[1]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment