elebow/gist:002ec17cd7647fc1b251

## gistfile1.py
#!/usr/bin/python3

import urllib.request
import re
import json

#The source is HTML 3.2. Every modern parser I tried choked on it. So, we'll use regexes to extract what we want. The
#source HTML appears to be generated by a simple algorithm and has a very regular structure. I am aware of the
#implications of using regexes on HTML, but I think it's safe given the known constraints on our input.

# Three-letter month abbreviation -> zero-based month number written into the output.
months = {
    "Jan": 0, "Feb": 1, "Mar": 2, "Apr": 3, "May": 4, "Jun": 5,
    "Jul": 6, "Aug": 7, "Sep": 8, "Oct": 9, "Nov": 10, "Dec": 11,
}
# One "<h2>Month YYYY</h2> <ul>...</ul>" section. No DOTALL flag is used, so the
# page must have its newlines removed before this pattern can span a whole list.
ul_regex = r"<h2>\w+ \d{4}<\/h2>\s*<ul>.*?<\/ul>"

CALENDAR_URL = "http://www2.jpl.nasa.gov/calendar/index.html"

# The date could be "mmm dd", "mmm dd-dd", or "mmm dd-mmm dd".
# Groups: 1 = start month, 2 = start day, 3 = end month (optional), 4 = end day (optional).
_DATE_RE = re.compile(r"([A-Za-z]{3}) (\d\d)(?:-(?:([A-Za-z]{3}) )?(\d\d))?$")
_HEADING_YEAR_RE = re.compile(r"<h2>\w+ (\d{4})<\/h2>")
_LEADING_IMAGE_RE = re.compile("<img .*?]")
_WHITESPACE_RUN_RE = re.compile(r"\s\s+")


def _parse_item(raw_item, year):
    """Turn the text of one <li> into a result dict, or None if it must be skipped.

    raw_item is the text that followed "<li> " in the page; year is the four-digit
    year string taken from the section heading. The returned dict has the keys
    "date1", "date2" and "text". Dates are "<year> <zero-based month> <dd>" strings;
    "date2" is None for single-day items.
    """
    # Collapse extra whitespace left over from the indentation in the HTML source.
    item = _WHITESPACE_RUN_RE.sub(" ", raw_item).strip()

    # Split the date from the item text. An item with no " -" separator has no
    # date, so it is left out instead of aborting the whole run.
    date, separator, text = item.partition(" -")
    if not separator:
        return None

    # Some of the items have a little image at the front. Remove it and the
    # following [...] text.
    if text.startswith("<img src"):
        text = _LEADING_IMAGE_RE.sub("", text)

    match = _DATE_RE.match(date)
    if match is None:
        return None  # Some of the items have no date; leave them out of the result.

    start_month, start_day, end_month, end_day = match.groups()
    start_index = months.get(start_month)
    if start_index is None:
        return None  # Three letters that are not a known month abbreviation.
    date1 = f"{year} {start_index} {start_day}"

    date2 = None
    if end_day is not None:
        # "mmm dd-dd" has no second month: the range ends in the starting month.
        end_index = start_index if end_month is None else months.get(end_month)
        if end_index is None:
            return None
        date2 = f"{year} {end_index} {end_day}"

    return {"date1": date1, "date2": date2, "text": text}


def parse_calendar(html):
    """Extract the dated items from the calendar page's HTML.

    html is the decoded page text. Returns a list of dicts with the keys "date1",
    "date2" and "text", in page order. Items without a recognisable date are omitted.
    """
    flattened = html.replace("\n", "")
    items = []
    # The first <ul> is a table of contents, so start from index 1.
    for section in re.findall(ul_regex, flattened)[1:]:
        # Every section starts with the heading that ul_regex matched; only the
        # year is needed from it.
        year = _HEADING_YEAR_RE.match(section).group(1)

        # The first piece holds the heading and the opening <ul> tag, not an item.
        _, *raw_items = section.split("<li> ")
        if raw_items:
            # Remove the closing tag from the last item.
            raw_items[-1] = raw_items[-1].replace("</ul>", "")

        for raw_item in raw_items:
            entry = _parse_item(raw_item, year)
            if entry is not None:
                items.append(entry)
    return items


def fetch_calendar(url=CALENDAR_URL, timeout=30):
    """Download the calendar page and return it decoded as UTF-8 text.

    timeout is the number of seconds passed to urlopen so that a stalled
    connection raises instead of hanging indefinitely.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode("utf-8")


def main():
    """Fetch the calendar page and print its dated items as a JSON array."""
    print(json.dumps(parse_calendar(fetch_calendar())))


if __name__ == "__main__":
    main()
	#!/usr/bin/python3

	import urllib.request
	import re
	import json

	#The source is HTML 3.2. Every modern parser I tried choked on it. So, we'll use regexes to extract what we want. The
	#source HTML appears to be generated by a simple algorithm and has a very regular structure. I am aware of the
	#implications of using regexes on HTML, but I think it's safe given the known constraints on our input.

	#Three-letter month abbreviation -> zero-based month number written into the output
	months = {"Jan":0,"Feb":1,"Mar":2,"Apr":3,"May":4,"Jun":5,"Jul":6,"Aug":7,"Sep":8,"Oct":9,"Nov":10,"Dec":11}
	#One "<h2>Month YYYY</h2> <ul>...</ul>" section. The "*" quantifiers are required: without them the pattern
	#cannot span the whitespace after the heading or the contents of the list.
	ul_regex = r"<h2>\w+ \d{4}<\/h2>\s*<ul>.*?<\/ul>"

	items = []

	#Newlines are removed so that "." in ul_regex can run across what were separate lines of the page
	with urllib.request.urlopen("http://www2.jpl.nasa.gov/calendar/index.html") as f:
		lines = f.read().decode("utf-8").replace("\n", "")

	for ul in re.findall(ul_regex, lines)[1:]: #the first <ul> is a table of contents, so start from index 1
		lis = ul.split("<li> ")

		date = lis[0] #"<h2>August 2014</h2> <ul>"
		date = re.sub(r"<.*?>", "", date).strip() #strip tags and whitespace
		year = date.split(" ")[1] #we only need the year

		lis[-1] = lis[-1].replace("</ul>", "") #remove the closing tag from the last one

		for li in lis[1:]: #the first <li> has the date and opening tag, so start from index 1
			li = re.sub(r"\s\s+", " ", li).strip() #collapse extra whitespace left over from the indentation in the HTML source

			(date, text) = li.split(" -", 1) #split the date from the item text

			#Some of the items have a little image at the front. Remove it and the following [...] text
			if text[0:8] == "<img src":
				text = re.sub("<img .*?]", "", text)

			#the date could be "mmm dd", "mmm dd-dd", or "mmm dd-mmm dd"
			#The three formats are alternatives, so they are separated by an unescaped "|"
			m = re.match(r"(?:([A-Za-z]{3}) (\d\d)|([A-Za-z]{3}) (\d\d)-(\d\d)|([A-Za-z]{3}) (\d\d)-([A-Za-z]{3}) (\d\d))$", date)
			if m is None:
				continue #Some of the items have no date. We'll just leave them out of the result.

			#m.span()[1] will always be 6, 9, or 13, corresponding to the three possible date formats
			x = m.span()[1]
			if x == 6:
				date1 = "%s %s %s" % (year, months[m.group(1)], m.group(2))
				date2 = None
			elif x == 9:
				#"mmm dd-dd": both ends of the range use the same month
				date1 = "%s %s %s" % (year, months[m.group(3)], m.group(4))
				date2 = "%s %s %s" % (year, months[m.group(3)], m.group(5))
			elif x == 13:
				date1 = "%s %s %s" % (year, months[m.group(6)], m.group(7))
				date2 = "%s %s %s" % (year, months[m.group(8)], m.group(9))
			else:
				continue #can't happen, but skip this item

			items.append({"date1":date1, "date2":date2, "text":text})

	print(json.dumps(items))