Skip to content

Instantly share code, notes, and snippets.

@CodeZombie
Created March 21, 2018 19:12
Show Gist options
  • Save CodeZombie/554526f551243fad3a23efac285429fd to your computer and use it in GitHub Desktop.
Save CodeZombie/554526f551243fad3a23efac285429fd to your computer and use it in GitHub Desktop.
python date extractor
# Lower-case tokens that may stand for the month part of a date.
# Numerals first, then full names and abbreviations. Entries that were already
# present keep their original relative order; the ones that were missing
# (november/nov, december/dec and the common "sep" abbreviation) are appended.
MONTHS = [
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
    "january", "jan", "feb", "february", "march", "mar", "april", "apr",
    "may", "june", "jun", "july", "jul", "august", "aug",
    "september", "sept", "october", "oct",
    "november", "nov", "december", "dec", "sep",
]

# Lower-case tokens that may stand for the day-of-month part of a date:
# plain numerals, numeric ordinals and spelled-out ordinals, each covering
# 1 through 31 (there is no 32nd day in any month).
DAYS = [
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
    "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
    "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31",
    "1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th",
    "11th", "12th", "13th", "14th", "15th", "16th", "17th", "18th", "19th",
    "20th", "21st", "22nd", "23rd", "24th", "25th", "26th", "27th", "28th",
    "29th", "30th", "31st",
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh",
    "eighth", "ninth", "tenth", "eleventh", "twelfth", "thirteenth",
    "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth",
    "nineteenth", "twentieth", "twenty first", "twenty second",
    "twenty third", "twenty fourth", "twenty fifth", "twenty sixth",
    "twenty seventh", "twenty eighth", "twenty ninth", "thirtieth",
    "thirty first",
]

# Year tokens (four-digit and two-digit forms). Only the years listed here
# are recognised; extend the list to recognise others.
YEARS = ["2018", "18", "1994", "94"]
# Find every occurrence of every element of each list in the text, and build candidate combinations from each and every one.
# Then score all of these combinations based on how many characters they cover and how close their elements are to one another.
## TODO:
## Handle capitalisation (matching is currently case-sensitive).
def allOrderedPermutations(lists_):
    """Return every in-order, non-overlapping combination of matches.

    Args:
        lists_: A sliceable sequence of match lists, one per pattern group
            (e.g. months, days, years). Each match is an ``[index, text]``
            pair: the position where ``text`` was found and the text itself.

    Returns:
        A list of combinations. Each combination is a new list holding one
        match from every group, in group order, where every match starts at
        or after the end (``index + len(text)``) of the match before it.
        The ``[index, text]`` pairs themselves are shared with ``lists_``,
        not copied. Returns ``[]`` when ``lists_`` is empty or when no
        complete combination exists.
    """
    if not lists_:
        # Nothing to combine (previously an IndexError on lists_[0]).
        return []

    head, rest = lists_[0], lists_[1:]
    if not rest:
        # Last group: every match is a one-element combination.
        return [[item] for item in head]
    if not head:
        # No candidates in this group, so nothing below it can be used.
        return []

    # The combinations of the remaining groups do not depend on which item
    # of `head` we are looking at, so compute them once instead of once per
    # item (which multiplied the work at every level of recursion).
    tails = allOrderedPermutations(rest)

    finds = []
    for item in head:
        item_end = item[0] + len(item[1])
        for tail in tails:
            # Keep the tail only if its first match starts after this one ends.
            if tail[0][0] >= item_end:
                finds.append([item] + tail)
    return finds
def _findOccurrences(string_, pattern):
    """Return ``[index, text]`` for every occurrence in ``string_`` of every item in ``pattern``."""
    occurrences = []
    for pattern_item in pattern:
        if not pattern_item:
            # An empty string "matches" at every position and removing it
            # never shortens the text, which would keep findDates looping
            # forever; ignore it.
            continue
        found = string_.find(pattern_item)
        while found != -1:
            occurrences.append([found, pattern_item])
            # Resume one character later so overlapping occurrences count too.
            found = string_.find(pattern_item, found + 1)
    return occurrences


def _scorePermutation(permutation):
    """Score a combination: characters matched minus half the characters of gap between them."""
    matched = sum(len(item[1]) for item in permutation)
    span = permutation[-1][0] + len(permutation[-1][1]) - permutation[0][0]
    # The gap penalty is halved so that matched length matters more than distance.
    return matched - (span - matched) * .5


def findDates(string_, patterns_):
    """Repeatedly extract the best-scoring date-like combination from a string.

    Every item of every pattern group is searched for in ``string_``
    (case-sensitively). All in-order, non-overlapping combinations with one
    match per group are scored, the best one is recorded, its matched
    substrings are cut out of the working string, and the search starts
    over until no complete combination is left.

    Args:
        string_: The text to search.
        patterns_: A sequence of pattern groups (lists of strings), in the
            order the parts must appear, e.g. ``[MONTHS, DAYS, YEARS]``.

    Returns:
        A list of ``[score, matches]`` entries in the order they were
        extracted, where ``matches`` is a list of ``[index, text]`` pairs,
        one per pattern group. Empty when nothing matched or when
        ``patterns_`` is empty.

    Note:
        Because matched substrings are deleted between rounds, the indexes
        in every entry after the first refer to the already-shortened
        working string, not to the original ``string_``.
    """
    dates = []
    if not patterns_:
        # No groups means no combinations (previously crashed on lists_[0]).
        return dates

    while True:
        # One list of [index, text] matches per pattern group,
        # e.g. [[[15, "january"], [20, "may"]], [[5, "10"], [25, "10th"]]].
        elementlists = [_findOccurrences(string_, pattern) for pattern in patterns_]

        permutations = allOrderedPermutations(elementlists)
        if not permutations:  # absolutely no (more) dates were found
            return dates

        # Pick the highest score; on ties the later candidate wins, which is
        # what sorting by score and taking the last element used to do.
        best = None
        for permutation in permutations:
            score = _scorePermutation(permutation)
            if best is None or score >= best[0]:
                best = [score, permutation]
        dates.append(best)

        # Cut the winning substrings out back-to-front so that the earlier
        # indexes stay valid while we delete.
        for index, text in reversed(best[1]):
            string_ = string_[:index] + string_[index + len(text):]
# Demo: run the extractor over a sample paragraph and print every
# [score, matches] entry it returns, one per line.
# A shorter sample that can be tried instead:
#   findDates("We may set the date to january 5th, 1994. But also find february 2 2018", [MONTHS, DAYS, YEARS])
x = findDates(
    "Sitting at the table with these losers talking about bullshit I may not "
    "care about. Chocolate milk and french fri. Friday "
    "sometimes I like to take 10 apples and throw out 27. "
    "This happened on Wednesday, september 24, 2018. "
    "After the great disaster 2018.",
    [MONTHS, DAYS, YEARS],
)
for y in x:
    print(y)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment