Skip to content

Instantly share code, notes, and snippets.

@ptgolden
Created June 6, 2012 22:44
Show Gist options
  • Save ptgolden/2885300 to your computer and use it in GitHub Desktop.
Save ptgolden/2885300 to your computer and use it in GitHub Desktop.
Natural language date parser (for historical dates)
import re
# Four-digit years from 1800 through 2099.  Raw string so that ``\d`` is a
# regex escape rather than an (invalid) string escape.
year_part = re.compile(r'(?:1[89]|20)\d{2}')

# Month and season names, matched case-insensitively against whole tokens.
# The 1-based position of a pattern in this list is used as the month number,
# so eight never-matching placeholders follow December to make the seasons
# come out as 21 (spring) through 24 (winter) -- presumably the EDTF season
# codes; confirm against whatever consumes the output.
months = (
    [
        r'jan(?:uary)?$', r'feb(?:ruary)?$', r'mar(?:ch)?$',
        r'apr(?:il)?$', r'may$', r'jun(?:e)?$',
        r'jul(?:y)?$', r'aug(?:ust)?$', r'sep(?:t|tember)?$',
        r'oct(?:ober)?$', r'nov(?:ember)?$', r'dec(?:ember)?$',
    ]
    # ``(?!)`` is an empty negative lookahead: it can never match, so these
    # slots only occupy positions 13-20 and never claim a token.
    + [r'(?!)'] * 8
    + [
        r'spr(?:ing)?$', r'sum(?:mer)?$', r'(?:aut|autumn|fall)$',
        r'win(?:ter)?$',
    ]
)

# Day numbers (with optional ordinal suffix) and four-digit years.
# NOTE: numbers, joiners and unsure are not referenced by the functions in
# this part of the file.
numbers = [r'[0-9]{1,2}(?:st|nd|rd|th)?$', r'[0-9]{4}$']
joiners = ['-', 'from$', 'to$', ',', 'and', '[/]']
unsure = ['[?]', 'c(?:irca|a)?$', 'ab(?:ou)?t']

# Non-month tokens that may appear inside a date phrase: day numbers, years,
# separators, the '?' and '~' qualifiers, and the abbreviations "c"/"ca".
other = [r'[0-9]{1,2}(?:st|nd|rd|th)?$', r'[0-9]{4}$', ' ', '-', '[?]',
         '[.]', '[/]', '~', ',', '^ca?[.]?$']
def tokenize(string):
    """Break *string* into word tokens and single-character separators.

    Returns ``None`` when nothing in the string matches ``year_part``.
    Otherwise the string is split on every non-word character, the
    separators themselves are kept as tokens, and empty pieces are dropped.
    """
    if year_part.search(string) is None:
        return None
    pieces = re.split(r'(\W)', string)
    return list(filter(None, pieces))
def get_date_tokens_from_string(string):
    """Extract the tokens of the date phrase surrounding a year in *string*.

    The string is tokenized, the last token holding a year strictly between
    1800 and 2013 is located, and the run of neighbouring tokens that look
    like date parts (anything matching ``months`` or ``other``) is collected
    around it.

    Returns the re-tokenized date phrase as a list of strings, with ``?``
    and/or ``~`` appended when either occurred anywhere in the phrase.
    Returns ``None`` when the string holds no year, no year in range, or the
    year is embedded in a token that is not itself a valid date part.
    """
    tokens = tokenize(string)
    if not tokens:
        return None

    # Keep the *last* token that contains an in-range year.
    year_index = None
    for index, token in enumerate(tokens):
        found = re.findall(year_part, token)
        if found and 1800 < int(found[0]) < 2013:
            year_index = index
    if year_index is None:
        # Previously this fell back to token 0 and scanned from there.
        return None

    valid_date_parts = re.compile(r'|'.join(months + other), re.I)
    if not valid_date_parts.match(tokens[year_index]):
        # The year is glued to other characters (e.g. "c1905").
        return None

    # Grow the window outwards while the adjacent token is a date part.
    # date_begin is inclusive and date_end exclusive, so a phrase that touches
    # the start or end of the string keeps its first/last token.
    date_begin = year_index
    while date_begin > 0 and valid_date_parts.match(tokens[date_begin - 1]):
        date_begin -= 1
    date_end = year_index + 1
    while date_end < len(tokens) and valid_date_parts.match(tokens[date_end]):
        date_end += 1

    date_str = ''.join(tokens[date_begin:date_end])
    add_q = '?' in date_str
    add_tilde = '~' in date_str
    # Stripping surrounding punctuation also removes '?' and '~', so they are
    # re-appended afterwards in a fixed order.
    date_str = re.sub(r'^\W+|\W+$', '', date_str)
    if add_q:
        date_str += '?'
    if add_tilde:
        date_str += '~'
    return tokenize(date_str)
# A one- or two-digit day number with an optional ordinal suffix.
_DAY_TOKEN = re.compile(r'([0-9]{1,2})(?:st|nd|rd|th)?$')


def _year_from_tokens(tokens):
    """Return ``(token, index)`` for the last token starting with four digits, or ``None``."""
    found = None
    for index, token in enumerate(tokens):
        if re.match(r'\d{4}', token):
            found = (token, index)
    return found


def _month_from_tokens(tokens):
    """Return ``(number, index)`` for the first ``months`` pattern any token matches.

    ``number`` is the 1-based position of the pattern in ``months`` and
    ``index`` is the position of the first token matching it.  Returns
    ``(0, 0)`` when no token matches any pattern.
    """
    for number, pattern in enumerate(months, 1):
        for index, token in enumerate(tokens):
            if re.match(pattern, token, re.I):
                return (number, index)
    return (0, 0)


def _day_near_month(tokens, month_index, reach=4):
    """Return the day number (1-31) nearest the month token, or 0 if none.

    Tokens up to ``reach`` positions away are examined, nearest first, with
    the token after the month preferred over the one before it at equal
    distance.  Positions outside the token list are skipped rather than
    indexed, so a month at either end of the list is safe.
    """
    for distance in range(1, reach + 1):
        for index in (month_index + distance, month_index - distance):
            if not 0 <= index < len(tokens):
                continue
            hit = _DAY_TOKEN.match(tokens[index])
            if hit and 1 <= int(hit.group(1)) <= 31:
                return int(hit.group(1))
    return 0


def parse_date_tokens(tokens):
    """Turn date tokens into a compact date string.

    *tokens* is a list of strings such as the one returned by
    ``get_date_tokens_from_string``.  The result is one of:

    * ``''`` when *tokens* is empty/``None`` or contains no year token;
    * the token itself when there is exactly one token;
    * ``'A/B'`` for the three-token range ``[A, '-', B]``;
    * ``'YYYY'``, ``'YYYY-MM'`` or ``'YYYY-MM-DD'`` otherwise, where ``MM``
      is the 1-based position of the matched pattern in ``months``.

    A ``'?'`` is appended when the tokens include ``'?'``, ``'ca'`` or
    ``'c'``, and a ``'~'`` when they include ``'~'``.
    """
    if not tokens:
        return ''

    if len(tokens) == 1:
        date_str = tokens[0]
    elif len(tokens) == 3 and tokens[1] == '-':
        date_str = '%s/%s' % (tokens[0], tokens[2])
    else:
        year_hit = _year_from_tokens(tokens)
        if year_hit is None:
            # Nothing to anchor a date on.
            return ''
        date_str = '%s' % year_hit[0]
        month, month_index = _month_from_tokens(tokens)
        if month:
            date_str += '-%02d' % month
            day = _day_near_month(tokens, month_index)
            if day:
                date_str += '-%02d' % day

    # Uncertainty / approximation markers apply whatever shape the date had.
    if '?' in tokens or 'ca' in tokens or 'c' in tokens:
        date_str += '?'
    if '~' in tokens:
        date_str += '~'
    return date_str
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment