Created
March 21, 2018 19:12
-
-
Save CodeZombie/554526f551243fad3a23efac285429fd to your computer and use it in GitHub Desktop.
python date extractor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Substrings that may stand for the month part of a date. Matching is a plain,
# case-sensitive substring search, so every spelling has to be listed in lower
# case. November and December (full names and abbreviations) were missing from
# the original table and are appended at the end so the relative order of the
# pre-existing entries is unchanged.
MONTHS = [
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
    "january", "jan", "feb", "february", "march", "mar", "april", "apr",
    "may", "june", "jun", "july", "jul", "august", "aug",
    "september", "sept", "october", "oct",
    "november", "nov", "december", "dec",
]

# Substrings that may stand for the day-of-month part of a date: plain
# numbers, numeric ordinals and spelled-out ordinals. No month has a 32nd
# day, so the table stops at 31, and the spelled-out ordinals now run all the
# way to "thirty first" instead of ending at "twenty third".
DAYS = [
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
    "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
    "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31",
    "1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th",
    "11th", "12th", "13th", "14th", "15th", "16th", "17th", "18th", "19th",
    "20th", "21st", "22nd", "23rd", "24th", "25th", "26th", "27th", "28th",
    "29th", "30th", "31st",
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh",
    "eighth", "ninth", "tenth", "eleventh", "twelfth", "thirteenth",
    "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth",
    "nineteenth", "twentieth", "twenty first", "twenty second",
    "twenty third", "twenty fourth", "twenty fifth", "twenty sixth",
    "twenty seventh", "twenty eighth", "twenty ninth", "thirtieth",
    "thirty first",
]

# Substrings that may stand for the year part of a date. Only these two
# years (in four- and two-digit form) are recognised.
YEARS = ["2018", "18", "1994", "94"]
#find every instance of every element in the array, and create pairs for each and every one.
#then score all of these pairs based on how many chars they remove and how close they all are to one another. | |
## TODO: | |
## HANDLE CAPS | |
def allOrderedPermutations(lists_):
    """Return every left-to-right, non-overlapping combination of matches.

    ``lists_`` is a sequence of match lists, one list per pattern category
    (for example months, days, years). Each match is an ``[index, text]``
    pair, where ``index`` is the position at which ``text`` was found.

    A combination takes exactly one match from each list, in list order, and
    is kept only when every match starts at or after the end
    (``index + len(text)``) of the match before it.

    Returns a list of combinations; each combination is a list of the
    original ``[index, text]`` match objects (they are not copied). An empty
    ``lists_``, or any empty inner list, yields ``[]``.
    """
    if not lists_:
        # Nothing to combine; previously this raised IndexError.
        return []

    if len(lists_) == 1:
        # Last category: every match is a combination on its own.
        return [[item] for item in lists_[0]]

    # The combinations of the remaining categories do not depend on which
    # item of the first category is being looked at, so compute them once
    # instead of once per item.
    tails = allOrderedPermutations(lists_[1:])

    finds = []
    for item in lists_[0]:
        item_end = item[0] + len(item[1])
        for tail in tails:
            # Only the first element of the tail needs checking: the rest of
            # the tail was already validated by the recursive call.
            if tail[0][0] >= item_end:
                finds.append([item] + tail)
    return finds
def findDates(string_, patterns_):
    """Greedily extract groups of pattern matches (dates) from ``string_``.

    ``patterns_`` is a list of pattern categories (for example
    ``[MONTHS, DAYS, YEARS]``); each category is a list of substrings to
    search for. Matching is a plain, case-sensitive substring search.

    On every pass the function:
      1. records every occurrence of every pattern item as ``[index, text]``;
      2. asks ``allOrderedPermutations`` for every ordered, non-overlapping
         combination holding one match per category;
      3. scores each combination as the number of matched characters minus
         half the number of unmatched characters inside its span;
      4. keeps the best-scoring combination (the last one wins a tie) and
         deletes its matched substrings from the working string.
    It stops when no complete combination can be built any more.

    Returns a list of ``[score, [[index, text], ...]]`` entries in the order
    they were extracted. Because matched text is deleted between passes, the
    indices of every entry after the first refer to the shortened working
    string, not to the original ``string_``.

    Empty pattern items are ignored: an empty string "matches" everywhere
    with length zero, so removing it would never shrink the string and the
    loop would never terminate. An empty ``patterns_`` returns ``[]``.
    """
    dates = []
    if not patterns_:
        # No categories means no combination can exist.
        return dates

    while True:
        # One list per category, each holding [index, text] for every
        # occurrence, e.g. [[[15, "january"], [20, "may"]], [[5, "10"]]].
        elementlists = []
        for pattern in patterns_:
            occurrences = []
            for pattern_item in pattern:
                if not pattern_item:
                    continue  # zero-length match; see docstring
                index = string_.find(pattern_item)
                while index != -1:
                    occurrences.append([index, pattern_item])
                    # Resume one character further so overlapping
                    # occurrences are found as well.
                    index = string_.find(pattern_item, index + 1)
            elementlists.append(occurrences)

        if not all(elementlists):
            # A category without any match makes a full combination
            # impossible, so there is nothing left to extract.
            return dates

        permutations = allOrderedPermutations(elementlists)
        if not permutations:  # no ordered, non-overlapping combination left
            return dates

        best = None  # [score, permutation] of the best candidate so far
        for permutation in permutations:
            first, last = permutation[0], permutation[-1]
            # Distance from the start of the first match to the end of the
            # last one.
            total_size = last[0] + len(last[1]) - first[0]
            matched = sum(len(item[1]) for item in permutation)
            # Unmatched characters inside the span count only half, so the
            # amount of matched text matters more than compactness.
            score = matched - (total_size - matched) * .5
            # ">=" keeps the last of several equally good candidates.
            if best is None or score >= best[0]:
                best = [score, permutation]

        dates.append(best)

        # Delete the matched substrings back to front so the indices of the
        # earlier elements stay valid while cutting.
        for element in reversed(best[1]):
            start = element[0]
            string_ = string_[:start] + string_[start + len(element[1]):]
# Alternative sample input, kept for reference:
#print(findDates("We may set the date to january 5th, 1994. But also find february 2 2018", [MONTHS, DAYS, YEARS]))

# Demo run: feed a noisy sample text containing several month, day and year
# substrings through the extractor and print every result on its own line.
sample_text = "Sitting at the table with these losers talking about bullshit I may not care about. Chocolate milk and french fri. Friday \
sometimes I like to take 10 apples and throw out 27. This happened on Wednesday, september 24, 2018. After the great disaster 2018."
x = findDates(sample_text, [MONTHS, DAYS, YEARS])
for y in x:
    print(y)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment