Skip to content

Instantly share code, notes, and snippets.

@bertspaan
Created July 6, 2012 08:46
Show Gist options
  • Save bertspaan/3059017 to your computer and use it in GitHub Desktop.
Save bertspaan/3059017 to your computer and use it in GitHub Desktop.
Parser for Dutch natural language time strings (Python 2.5, pyPEG 1.5)
import re, fileinput
import pyPEG
from pyPEG import parse, parseLine
from pyPEG import keyword, _and, _not, ignore
import datetime
#pyPEG.print_trace = True
# Dutch number words (ASCII spelling, no diaeresis). A word's position in the
# list is its numeric value, so the order must not change.
numbers = [
    # 0-9
    "nul", "een", "twee", "drie", "vier",
    "vijf", "zes", "zeven", "acht", "negen",
    # 10-19
    "tien", "elf", "twaalf", "dertien", "veertien",
    "vijftien", "zestien", "zeventien", "achttien", "negentien",
    # 20-29
    "twintig", "eenentwintig", "tweeentwintig", "drieentwintig", "vierentwintig",
    "vijfentwintig", "zesentwintig", "zevenentwintig", "achtentwintig", "negenentwintig",
    # 30-39
    "dertig", "eenendertig", "tweeendertig", "drieendertig", "vierendertig",
    "vijfendertig", "zesendertig", "zevenendertig", "achtendertig", "negenendertig",
    # 40-49
    "veertig", "eenenveertig", "tweeenveertig", "drieenveertig", "vierenveertig",
    "vijfenveertig", "zesenveertig", "zevenenveertig", "achtenveertig", "negenenveertig",
    # 50-59
    "vijftig", "eenenvijftig", "tweeenvijftig", "drieenvijftig", "vierenvijftig",
    "vijfenvijftig", "zesenvijftig", "zevenenvijftig", "achtenvijftig", "negenenvijftig",
    # 60
    "zestig",
]
# Sample phrases fed through the parser by the demo loop at the bottom of the
# file. The order is the order in which they are parsed and printed.
times = [
    # digits, with and without a colon
    "12 56", "12:56",
    # hour word followed by minute word
    "twaalf dertig", "negen vijftien",
    # minutes before ("voor") / after ("over") an hour or half hour
    "kwart voor twee", "tien voor twee", "drie over half tien",
    "13 uur 52",
    "half drie",
    "zevenenveertig over tien", "zeven voor half vier",
    "achttien vijftien",
    # bare hours, with and without "uur"
    "zeven", "negen uur", "8 uur",
    "achttien uur vijftien",
]
def number():
    """Grammar rule: a single run of word characters.

    The pattern accepts digits ("12") as well as words ("twaalf", "kwart");
    turning the matched text into an integer is left to the caller.
    """
    word_pattern = re.compile(r"\w+")
    return word_pattern
def half():
    """Grammar rule: the literal word "half" (as in "half drie")."""
    half_pattern = re.compile(r"half")
    return half_pattern
def hours():
    """Grammar rule for the hour part: [half] number ["uur"].

    Returns a pyPEG sequence (tuple). Each -1 is a pyPEG repetition marker
    applying to the element right after it, so "half" and the keyword "uur"
    are not required.
    NOTE(review): confirm whether -1 means "optional" or "zero or more" in
    pyPEG 1.5; to_time() assumes at most one "half" per hours node.
    """
    uur_keyword = keyword("uur")
    return (-1, half, number, -1, uur_keyword)
def sign():
    """Grammar rule: direction word, "voor" (before) or "over" (after).

    Deliberately a list of two separate patterns rather than one combined
    regex: to_time() reads the matched word as the first element of this
    rule's value.
    """
    before = re.compile(r"voor")
    after = re.compile(r"over")
    return [before, after]
def minutes():
    """Grammar rule for the minute part: exactly one `number`.

    Kept as its own rule so the minute value appears in the AST under the
    name "minutes", which is what to_time() looks for.
    """
    minute_rule = number
    return minute_rule
def time():
    """Top-level grammar rule: the accepted shapes of a spoken time.

    Returns a pyPEG choice (list) of alternatives, listed in the order in
    which they should be tried:

      1. minutes sign hours    e.g. "kwart voor twee", "drie over half tien"
      2. hours [":"] minutes   e.g. "12:56", "12 56", "13 uur 52"
      3. hours                 e.g. "half drie", "8 uur"

    The original list contained alternative 1 twice; the second copy could
    only be reached after the identical first one had already failed, so it
    has been dropped.
    """
    return [
        (minutes, sign, hours),
        # -1 marks the ":" that follows as not required ("12 56" and "12:56").
        (hours, -1, ":", minutes),
        hours,
    ]
def string_to_int(str):
    """Convert a Dutch number word or a digit string to an integer.

    Lookup order:
      * "kwart" (a quarter of an hour) -> 15
      * any word in the module-level `numbers` list -> its index (0-60)
      * anything else is handed to int(), so "13" -> 13

    The parameter is named `str` (shadowing the builtin) only to keep the
    existing signature unchanged.

    Raises:
        ValueError: if the text is neither a known word nor an integer literal.
    """
    if str == "kwart":
        return 15
    try:
        # Search the whole table. The previous range(0, 60) loop stopped at
        # index 59, so "zestig" (index 60) was never found and fell through
        # to int("zestig"), which raised ValueError.
        return numbers.index(str)
    except ValueError:
        return int(str)
def to_time(ast):
    """Turn a parse result for the `time` grammar into a datetime.

    `ast` is the pair returned by parseLine; only its first element (the
    sequence of symbols) is used. Each symbol is indexed as
    (name, value), with name being "hours", "minutes" or "sign".

    The time of day is computed in minutes since midnight:
    hour * 60, minus 30 when "half" precedes the hour, plus or minus the
    minute count depending on the sign word ("voor" subtracts).

    Returns a datetime.datetime dated one day after today.
    A result outside 00:00-23:59 makes datetime.time raise ValueError.
    """
    # parseLine returns (symbols, rest); the second part is not needed.
    symbols = ast[0]

    hour_word = ""
    minute_word = ""
    has_half = False
    direction = 1

    for symbol in symbols:
        kind, children = symbol[0], symbol[1]
        if kind == "hours":
            if len(children) != 2:
                hour_word = children[0][1]
            else:
                # Two children means "half" came before the number.
                has_half = True
                hour_word = children[1][1]
        elif kind == "minutes":
            minute_word = children[0][1]
        elif kind == "sign":
            if children[0] == "voor":
                direction = -1

    total = 0
    if hour_word:
        total = string_to_int(hour_word) * 60
    if has_half:
        # "half drie" is half an hour before three.
        total -= 30
    if minute_word:
        total += direction * string_to_int(minute_word)

    hour, minute = divmod(total, 60)

    tomorrow = datetime.date.today() + datetime.timedelta(days=1)
    return datetime.datetime.combine(tomorrow, datetime.time(hour, minute))
# Demo: run every sample phrase through the grammar and print the phrase next
# to the datetime it was converted to.
for time_str in times:
    ast = parseLine(textline=time_str, pattern=time(), resultSoFar=[])
    # Joining with single spaces reproduces the spacing of a comma-separated
    # print statement; the parenthesised form prints one string.
    print(" ".join([time_str, " => ", str(to_time(ast))]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment