Skip to content

Instantly share code, notes, and snippets.

@ryukinix
Last active February 4, 2022 15:35
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ryukinix/c2541bdaa831c8bde49aa809883e1a30 to your computer and use it in GitHub Desktop.
Save ryukinix/c2541bdaa831c8bde49aa809883e1a30 to your computer and use it in GitHub Desktop.
An EBNF grammar based on the Lark parser, designed to parse a variety of date formats. (PT_BR)
#!/usr/bin/env python3
# coding: utf-8
#
# Copyright © Neoway Business Solutions
#
# @project: Diário Oficial
# @author: Manoel Vilela
#
"""
An EBNF grammar based on the Lark parser, designed to
parse a variety of date formats. The grammar was written with Portuguese in mind.
"""
from lark import exceptions as LarkExceptions
from lark import Lark, InlineTransformer # pip install lark-parser
from datetime import datetime
# strftime format used by parse() when the caller does not supply one.
default_date_format = "%d/%m/%Y"

# Lark grammar for a single date: day, month and year separated by "de",
# "/", "-", "." or nothing at all. A month is either a Portuguese month
# name/abbreviation or a month number; every alternative is aliased to a
# three-letter abbreviation ("jan" ... "dez").
#
# NOTE: the first alternative of month_name / month_number sits on the same
# line as the rule header on purpose. A rule written as "name:" followed by a
# leading "|" declares an *empty* first alternative in Lark, which would let
# a month match nothing at all.
grammar = '''\
?date: day "de" month "de" year
| day "/" month "/" year
| day "-" month "-" year
| day "." month "." year
| day month year
day: INT
year: INT
month: month_name | month_number
month_name: ("janeiro" | "jan") -> jan
| ("fevereiro" | "fev") -> fev
| ("março" | "mar") -> mar
| ("abril" | "abr") -> abr
| ("maio" | "mai") -> mai
| ("junho" | "jun") -> jun
| ("julho" | "jul") -> jul
| ("agosto" | "ago") -> ago
| ("setembro" | "set") -> set
| ("outubro" | "out") -> out
| ("novembro" | "nov") -> nov
| ("dezembro" | "dez") -> dez
month_number: ["0"] "1" -> jan
| ["0"] "2" -> fev
| ["0"] "3" -> mar
| ["0"] "4" -> abr
| ["0"] "5" -> mai
| ["0"] "6" -> jun
| ["0"] "7" -> jul
| ["0"] "8" -> ago
| ["0"] "9" -> set
| "10" -> out
| "11" -> nov
| "12" -> dez
%import common.WORD
%import common.INT
%import common.DIGIT
%import common.WS
%ignore WS
'''
# Maps each three-letter Portuguese month abbreviation to its calendar month
# number (1-12); the abbreviations are listed in calendar order.
month_dict = {
    label: number
    for number, label in enumerate(
        (
            "jan", "fev", "mar", "abr", "mai", "jun",
            "jul", "ago", "set", "out", "nov", "dez",
        ),
        start=1,
    )
}
# Module-level parser, built once from `grammar` with `date` as the start rule.
parser = Lark(grammar, start='date')
class NaturalDateTree(InlineTransformer):
    """Transformer that reduces a parsed date tree to a ``datetime``.

    Each attribute/method is named after the grammar rule it handles and
    receives that rule's children as positional arguments.
    """

    # `day` and `year` each wrap a single INT token: convert it to an int.
    day = int
    year = int

    def month(self, tree):
        """Return the month number (1-12) for the matched month subtree.

        The grammar aliases every month alternative to a three-letter
        abbreviation ("jan", "fev", ...), which becomes the subtree's
        ``data`` label and is looked up in ``month_dict``.

        Raises:
            KeyError: if the subtree's label is not a known abbreviation.
        """
        return month_dict[tree.data]

    def date(self, day, month, year):
        """Build a ``datetime`` from the already-converted day, month, year.

        Two-digit years are expanded around a pivot of 30: 0-29 become
        2000-2029 and 30-99 become 1930-1999. Years of 100 and above are
        used as given.

        Returns:
            The ``datetime`` at midnight of that date, or ``None`` when the
            combination is not a valid calendar date (e.g. 31 February).
        """
        if year < 30:
            year += 2000
        elif year < 100:
            # Includes year == 30, which would otherwise be left unexpanded
            # and interpreted literally as the year 30 AD.
            year += 1900
        try:
            return datetime(year, month, day)
        except ValueError:
            return None
def parse_date(expr):
    """Parse ``expr`` with the module parser and transform the resulting tree.

    Returns whatever ``NaturalDateTree`` produces for the tree. Exceptions
    raised by the parser or the transformer are not caught here.
    """
    transformer = NaturalDateTree()
    syntax_tree = parser.parse(expr)
    return transformer.transform(syntax_tree)
def parse(date_string, date_format=default_date_format):
    """Parse a free-form date string and re-format it.

    The input is lower-cased before parsing, so month names match
    case-insensitively.

    Args:
        date_string: text containing a single date, e.g. "1 de fevereiro
            de 2018" or "02-08-2018".
        date_format: ``strftime`` format for the result.

    Returns:
        The date formatted with ``date_format``, or ``None`` when the text
        cannot be parsed as a date or does not name a valid calendar date.
    """
    try:
        parsed_date = parse_date(date_string.lower())
    except (LarkExceptions.UnexpectedInput, LarkExceptions.ParseError):
        # UnexpectedInput covers unexpected characters/tokens; ParseError
        # covers input that ends before a complete date was read. Both just
        # mean "not a date" here.
        return None
    if parsed_date is None:
        return None
    return parsed_date.strftime(date_format)
def run_tests():
    """Check parse() against a table of (input, expected output) pairs.

    Raises:
        AssertionError: on the first input whose parsed value differs from
            the expected one.
    """
    cases = (
        ('02-08-2018', '02/08/2018'),
        ('1.03.2018', '01/03/2018'),
        ('1 de fevereiro de 2018', "01/02/2018"),
        ('02 fevereiro 2018', "02/02/2018"),
        ('28FEV2017', '28/02/2017'),
        ('31fev2019', None),
        ('shitty string', None),
        ('10 Mai 17', '10/05/2017'),
        ('08 Jun 98', '08/06/1998'),
    )
    for raw, expected in cases:
        parsed = parse(raw)
        assert parsed == expected, f"Expected: {expected!r}, but: {parsed!r}"
# Run the built-in self-checks when this file is executed as a script.
if __name__ == '__main__':
    run_tests()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment