Skip to content

Instantly share code, notes, and snippets.

@ctivanovich
Last active February 20, 2018 08:07
Show Gist options
  • Save ctivanovich/a2357e6c77e0583ed109ba93b7874e57 to your computer and use it in GitHub Desktop.
Save ctivanovich/a2357e6c77e0583ed109ba93b7874e57 to your computer and use it in GitHub Desktop.
def preprocessing(doc, *, min_year=1900, max_year=2018):
    '''Collect likely date-element candidates from each line of a document.

    Args:
        doc: Any iterable of text lines, e.g. an open file object or a list
            of strings. It is consumed once and never needs a length.
        min_year: Smallest integer accepted as a four-digit year (inclusive).
        max_year: Largest integer accepted as a four-digit year (inclusive).

    Returns:
        A dict mapping each zero-based line number to a list of candidate
        tokens found on that line, in order of appearance. Lines without
        candidates map to an empty list. A token is a candidate when it:
          * still contains '/' or '-' after clean-up (e.g. "12/25/2015");
          * is an integer in [min_year, max_year] (a year);
          * is an integer in [1, 31] (a day or month number);
          * is a two-digit integer starting with '9' (see NOTE below);
          * is non-numeric and contains a key of ``months``, in which case
            the matching key, not the token, is stored.

    NOTE(review): non-numeric tokens are matched against a module-level
    ``months`` mapping that is not defined in this function; it must exist
    in the enclosing module before a non-numeric token is encountered.

    NOTE(review): the original year test contained ``text[0] == 9`` on an
    undefined name. It is interpreted here as "two-digit year from the
    1990s" (e.g. "95") -- confirm this was the intent.
    '''
    import re

    # Punctuation that never belongs to a date token is turned into spaces.
    punctuation = re.compile(r'[~\(\),\*\.=:;"]')
    # A '-' or '/' that follows a non-digit is not a date separator. The
    # lookbehind removes only the separator, not the character before it
    # (so "Jan-5" becomes "Jan 5", not "Ja 5").
    stray_separator = re.compile(r'(?<=\D)[-/]')

    maybedatestuff = {}
    for i, line in enumerate(doc):
        # Register the line up front so every line gets a key, even when it
        # yields no candidates.
        candidates = maybedatestuff[i] = []
        line = punctuation.sub(" ", line)
        line = stray_separator.sub(" ", line)
        for word in line.split():
            if '/' in word or '-' in word:  # possibly a full date
                candidates.append(word)
                continue
            try:
                number = int(word)
            except ValueError:
                # Not a number: keep any month key embedded in the token.
                candidates.extend(key for key in months if key in word)
                continue
            is_year = min_year <= number <= max_year
            is_day_or_month = 1 <= number <= 31
            is_short_nineties_year = len(word) == 2 and word[0] == '9'
            if is_year or is_day_or_month or is_short_nineties_year:
                candidates.append(word)
    return maybedatestuff
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment