# ctivanovich/date_from_text_preprocessing.py

## date_from_text_preprocessing.py
def preprocessing(doc, min_year=1900, max_year=2018):
    """Collect likely date fragments from each line of a text.

    Args:
        doc: Iterable of text lines, e.g. an open text file or a list of
            strings. It is consumed in a single pass, so it does not need
            to support ``len()``.
        min_year: Smallest integer accepted as a year (inclusive).
        max_year: Largest integer accepted as a year (inclusive).

    Returns:
        A dict mapping each zero-based line index to the list of candidate
        strings found on that line, in order of appearance. A line without
        candidates maps to an empty list.

    Raises:
        NameError: If a non-numeric word is reached and no module-level
            ``months`` object is defined.
    """
    import re

    maybedatestuff = {}
    for i, line in enumerate(doc):
        # Register the line up front so lines without candidates still get
        # an (empty) entry; this replaces the old len(doc) pre-sizing, which
        # fails on file objects.
        candidates = maybedatestuff[i] = []

        # Punctuation never belongs to a date token; turn it into separators.
        line = re.sub(r'[~\(\),\*\.=:;"]', " ", line)
        # A dash or slash that follows a non-digit is not a numeric date
        # separator. The lookbehind leaves the preceding character in place
        # (the previous pattern consumed it, e.g. "Jan-5" -> "Ja 5").
        line = re.sub(r'(?<=\D)[-/]', " ", line)

        for word in line.split():
            if '/' in word or '-' in word:  # possibly a numeric date
                candidates.append(word)
                continue

            try:
                value = int(word)
            except ValueError:
                # `months` is not defined in this function; presumably a
                # module-level mapping keyed by month names -- TODO confirm.
                for key in months.keys():
                    if key in word:
                        candidates.append(key)
                continue

            # Keep plausible years and day-of-month numbers only. The earlier
            # `or`-chained comparisons were always true and accepted every
            # integer.
            # NOTE(review): the original also tested an undefined
            # `text[0] == 9`; it was never evaluated and its intent is
            # unclear, so it is omitted here.
            if min_year <= value <= max_year or 1 <= value <= 31:
                candidates.append(word)
    return maybedatestuff
	def preprocessing(doc, min_year=1900, max_year=2018):
		"""Collect likely date fragments from each line of a text.

		Args:
			doc: Iterable of text lines, e.g. an open text file or a list of
				strings. It is consumed in a single pass, so it does not need
				to support ``len()``.
			min_year: Smallest integer accepted as a year (inclusive).
			max_year: Largest integer accepted as a year (inclusive).

		Returns:
			A dict mapping each zero-based line index to the list of candidate
			strings found on that line, in order of appearance. A line without
			candidates maps to an empty list.

		Raises:
			NameError: If a non-numeric word is reached and no module-level
				``months`` object is defined.
		"""
		import re

		maybedatestuff = {}
		for i, line in enumerate(doc):
			# Register the line up front so lines without candidates still
			# get an (empty) entry; this replaces the old len(doc)
			# pre-sizing, which fails on file objects.
			candidates = maybedatestuff[i] = []

			# Punctuation never belongs to a date token; make it a separator.
			line = re.sub(r'[~\(\),\*\.=:;"]', " ", line)
			# A dash or slash that follows a non-digit is not a numeric date
			# separator. The lookbehind leaves the preceding character in
			# place (the previous pattern consumed it, "Jan-5" -> "Ja 5").
			line = re.sub(r'(?<=\D)[-/]', " ", line)

			for word in line.split():
				if '/' in word or '-' in word:  # possibly a numeric date
					candidates.append(word)
					continue

				try:
					value = int(word)
				except ValueError:
					# `months` is not defined in this function; presumably a
					# module-level mapping keyed by month names -- TODO confirm.
					for key in months.keys():
						if key in word:
							candidates.append(key)
					continue

				# Keep plausible years and day-of-month numbers only. The
				# earlier `or`-chained comparisons were always true and
				# accepted every integer.
				# NOTE(review): the original also tested an undefined
				# `text[0] == 9`; it was never evaluated and its intent is
				# unclear, so it is omitted here.
				if min_year <= value <= max_year or 1 <= value <= 31:
					candidates.append(word)
		return maybedatestuff