Created
July 16, 2016 12:33
-
-
Save solalatus/0c353156ac7678d32118df290ae40ede to your computer and use it in GitHub Desktop.
Dateparser-based detector for all dates in a string - untested
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import dateparser | |
def detect_date(line="", parser=None):
    """Detect multiple, multi-language date and datetime entries in a string.

    The line is split into words and scanned left to right. At every start
    position the longest run of consecutive words is tried first and then
    shortened one word at a time until the parser accepts a candidate
    (greedy longest match). Words covered by a match are skipped, so the
    parts of one datetime are not reported again as separate dates.

    Note that the parser is called once per candidate span, i.e. a number of
    times that is quadratic in the word count when nothing matches, so this
    is best used on single lines rather than whole documents.

    @param line: String to detect dates in, ideally a single line.
    @type line: String
    @param parser: Optional callable taking a string and returning the parsed
        date, or None when the string is not a date. Defaults to
        dateparser.parse; pass e.g. a functools.partial of it to fix the
        languages or settings used.
    @type parser: Callable or None
    @return: Returns the list of found dates in order of appearance, or None
    @rtype: List or None
    """
    if not line:
        return None
    if parser is None:
        # Resolved lazily so a caller-supplied parser never needs dateparser.
        parser = dateparser.parse

    # Remove "," and ";" to get rid of edge cases (e.g. "Sat, 16 Jul") and
    # split on whitespace so the data can be handled word by word.
    words = line.replace(",", "").replace(";", "").split()
    found_dates = []

    start = 0
    while start < len(words):
        match_end = None
        # Try the longest candidate first; `end` is exclusive and never drops
        # to `start`, so an empty candidate is never sent to the parser.
        for end in range(len(words), start, -1):
            candidate = " ".join(words[start:end])
            try:
                parsed = parser(candidate)
            except Exception:
                # Best effort: a parser that chokes on a candidate is treated
                # the same as "no date here". KeyboardInterrupt/SystemExit
                # are deliberately not caught.
                parsed = None
            if parsed is not None:
                found_dates.append(parsed)
                match_end = end
                break

        if match_end is None:
            # Nothing starts at this word; move on to the next one.
            start += 1
        else:
            # Continue right after the matched words (the slice end is
            # exclusive, so no extra word is skipped).
            start = match_end

    # This is a "detector": give None if nothing is found so the result can
    # be used directly in logical operations.
    return found_dates or None
if __name__ == '__main__':
    # English dummy example: two absolute timestamps and one relative date,
    # surrounded by filler words.
    line = "aaa Sat, 16 Jul 2016 06:09:12 +0000 and Sat, 17 Aug 2015 09:11:15 +0100 aaa and 2 days ago aaaaa"
    results = detect_date(line)
    # detect_date returns None (not an empty list) when nothing is found,
    # so fall back to an empty sequence instead of iterating over None.
    for r in results or []:
        print(r)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment