Skip to content

Instantly share code, notes, and snippets.

@fmalina
Last active September 28, 2021 15:42
Show Gist options
  • Save fmalina/9dcfb1e0d7b23e0d67ff41617507b88b to your computer and use it in GitHub Desktop.
Save fmalina/9dcfb1e0d7b23e0d67ff41617507b88b to your computer and use it in GitHub Desktop.
Date extractor with doctests: extracts dates from HTML and plain text (regex-based parser)
""" Extract dates from HTML
Run the doctests with: python date_extract.py -v
"""
import calendar
import re
from datetime import date
from django.template.defaultfilters import striptags
def get_date(html):
    """Return the first recognisable date found near the start of *html*.

    Only the first 1200 characters of *html* are examined.  Tags are
    stripped, runs of whitespace are collapsed to single spaces and the
    text is lowercased before it is searched for an optional day, a month
    (a number or a full month name taken from ``calendar.month_name``) and
    a year.  Day and month must each be followed by a dot, slash or space.

    Recognised years are 1995-2015, written with two or four digits.  A
    missing day defaults to the first of the month.  In a range such as
    ``8-9 May 2014`` the day written next to the month is the one returned.

    Known limitation (pinned by the last doctest below): a date whose year
    is out of range can be mis-read from its leading parts, e.g.
    ``02/11/1960`` is read as month ``02`` of year ``11``.

    Args:
        html: HTML or plain text to search.  Falsy input (``None`` or
            ``''``) yields ``''``.

    Returns:
        A ``datetime.date`` for the first match, or ``''`` when nothing
        matches or the matched numbers do not form a valid calendar date.

    >>> pass_test = '''
    ... Dates 8-9 May 2014 lorem ipsum...
    ... Date 30 January 2015 lorem...
    ... Dates 11-12 October 2014
    ... Published date 4 January 2012...
    ... 12 March 2009... date
    ... 18 th - 20 th October 1999 in ...
    ... Dates of ...: 5 th - 6 th April 2000
    ... Dates of ...: 19 th -20 th June, 2001
    ... Date published: March 2015
    ... Dates of ...: 30.10.00- 2.11.00
    ... Dates of ...: 02/11/1999 - 05/11/1999
    ... Dates of ...: 27/01/03 - 29/01/03'''.strip().split('\\n')
    >>> for test in pass_test:
    ...     get_date(test)
    ...
    datetime.date(2014, 5, 9)
    datetime.date(2015, 1, 30)
    datetime.date(2014, 10, 12)
    datetime.date(2012, 1, 4)
    datetime.date(2009, 3, 12)
    datetime.date(1999, 10, 20)
    datetime.date(2000, 4, 6)
    datetime.date(2001, 6, 20)
    datetime.date(2015, 3, 1)
    datetime.date(2000, 10, 30)
    datetime.date(1999, 11, 2)
    datetime.date(2003, 1, 27)
    >>> fail_test = ['02/11/1960', '55/60/1999', '0.0.2099', 'January 1960']
    >>> fail_test = ['bad date: %s ...' % d for d in fail_test]
    >>> for test in fail_test:
    ...     get_date(test)
    ...
    datetime.date(2011, 2, 1)
    ''
    ''
    ''
    """
    # Nothing to search: return the "no date" sentinel instead of failing
    # on the slice below when html is None.
    if not html:
        return ''

    # months = ['january', ...
    months = [calendar.month_name[i].lower() for i in range(1, 12 + 1)]

    # Raw strings keep regex syntax away from Python's own string escapes.
    # The separator class lists exactly dot, slash and space; a '|' inside
    # a character class would be a literal pipe, not alternation.
    sep = r"[./ ]"
    day = r"(?P<day>0?[1-9]|[12][0-9]|3[01]) ?(?:st|nd|rd|th)?" + sep
    mth = r"(?P<mth>0?[1-9]|1[012]|" + "|".join(months) + r") ?,?" + sep
    year = r"(?P<year>(?:19)?9[5-9]|(?:20)?(?:0[0-9]|1[0-5]))"
    pattern = re.compile(f"(?:{day})?{mth}{year}")

    # strip tags, normalize spaces, lowercase body
    txt = re.sub(r'\s+', ' ', striptags(html[:1200])).lower()
    match = pattern.search(txt)
    if match is None:
        return ''

    # The day group is None when only month and year were present.
    d = int(match.group('day') or 1)

    m = match.group('mth')
    m = int(m) if m.isdigit() else months.index(m) + 1

    y = int(match.group('year'))
    if y < 100:
        # Two-digit year: 00-49 belong to the 2000s, the rest to the 1900s.
        y += 2000 if y < 50 else 1900

    try:
        return date(y, m, d)
    except ValueError:
        # Syntactically plausible but not a real calendar date (e.g. 31 April).
        return ''
if __name__ == "__main__":
    # Self-test entry point: running this file as a script executes every
    # doctest embedded in the module's docstrings.
    import doctest

    doctest.testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment