Created
February 17, 2012 23:17
-
-
Save artursapek/1856075 to your computer and use it in GitHub Desktop.
A tolerant search function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
class finder(): | |
# I got these dicts from someone on Stackoverflow. They translate common letters to visually similar unicode keys. | |
# This will let us match an N with a tilde to a regular N, for example | |
global unicode_to_text_Matches | |
unicode_to_text_Matches = { 161: '!', 192: 'A', 193: 'A', 194: 'A', 195: 'A', 196: 'A', 197: 'A', 198: 'Ae', 199: 'C', 200: 'E', 201: 'E', 202: 'E', 203: 'E', 204: 'I', 205: 'I', | |
206: 'I', 207: 'I', 208: 'Th', 209: 'N', 210: 'O', 211: 'O', 212: 'O', 213: 'O', 214: 'O', 216: 'O', 217: 'U', 218: 'U', 219: 'U', 220: 'U', 221: 'Y', 222: 'th', 223: 'ss', 224: 'a', | |
225: 'a', 226: 'a', 227: 'a', 228: 'a', 229: 'a', 230: 'ae', 231: 'c', 232: 'e', 233: 'e', 234: 'e', 235: 'e', 236: 'i', 237: 'i', 238: 'i', 239: 'i', 240: 'th', 241: 'n', 242: 'o', | |
243: 'o', 244: 'o', 245: 'o', 246: 'o', 248: 'o', 249: 'u', 250: 'u', 251: 'u', 252: 'u', 253: 'y', 254: 'th', 255: 'y' } | |
global text_to_unicode_Matches | |
text_to_unicode_Matches = { '!': [161], 'A': [192, 193, 194, 195, 196, 197], 'C': [199], 'E': [200, 201, 202, 203], 'I': [204, 205, 206, 207], 'O': [210, 211, 212, 213, 214, 216], | |
'N': [209], 'U': [217, 218, 219, 220], 'Y': [221], 'e': [232, 233, 234, 235], 'a': [224, 225, 226, 227, 228, 229], 'c': [231], | |
'i': [236, 237, 238, 239], 'o': [242, 243, 244, 245, 246, 248], 'n': [241], 'ss': [223], 'u': [249, 250, 251, 252], 'y': [253, 255] } | |
global spaceReplace | |
spaceReplace = '[\\[\\]\\(\\)\\s\\.\\,\\-\\.\\_]+?' | |
# Pre: pass in any word or phrase | |
# Post: returns a regex that flexibly will match that with unicode chars, and different characters used for spaces. | |
# When run with listIt as True, returns a list of the characters individually regexified instead of a complete string for the query. | |
# You can ''.join() this list and it comes out the same as when listIt is False. | |
# This lets you iterate over it as the original characters. | |
def regexify(self, query, listIt=False): | |
questionOut = ['[', ']'] | |
change = { '. ':'\.?\s?', ' and ': '(\sand\s|\s&\s|\s&\s)?', ' & ': '(\sand\s|\s&\s|\s&\s)?', 'the ': '(the\s)?', ' ': spaceReplace} | |
if not listIt: # If a list wasn't request we just return a simple string that can be used as a regex for the query. | |
for q in questionOut: | |
query = query.replace(q, q + '?') | |
for c in change: | |
query = query.replace(c, change[c]) | |
r = u'' | |
for i in query: # Take care of unicode chars. | |
if unicode_to_text_Matches.has_key(ord(i)): | |
r += '[' + unicode_to_text_Matches[ord(i)] + i + ']' | |
if text_to_unicode_Matches.has_key(i): | |
r += '[' + i | |
for val in text_to_unicode_Matches[i]: | |
r += unichr( val ) | |
r += ']' | |
elif ord(i) >= 0x80: | |
pass | |
else: | |
r += i | |
return r | |
else: # if listIt is True... we take a slightly different route and use a list obviously | |
# First we go through the query again and see if we need to make changes based on the predefined 'change' dict up above | |
changed = [] # Here we will log exactly where we made these changes. This is necessary for later | |
for i,n in enumerate(query): # Iterating over the query character by character... | |
for ind,ch in enumerate(change): # ... we check if the query matches any of the 'change' keys from that character forward. | |
if query[i:i + len(ch)] == ch: # If so, we define the culprit... | |
culprit = query[i:i + len(ch)] | |
for duba in range(0, query.count(culprit)): | |
w = query.find(culprit) # I'm running out of variables | |
if w != -1: | |
query = query[:w] + change[ch] + query[w + len(ch):] # For each occurence of the culprit we split the query, insert the replacement, and stitch it back together | |
changed.append([w, len(change[ch])]) # We need to do it this way and not with str.replace() so we can keep track of the indeces where we did this for later | |
# Now we have an list like this [[1, 3], [5, 6]] of where we made these changes and how long they were | |
l = [] # The list we'll be returning | |
skip = 0 | |
for ind,i in enumerate(query): | |
r = '' | |
stop = False | |
if skip > 0: | |
skip -= 1 # If we're skipping this character just tack one off 'skip' and go again | |
else: # If we'r not, we carry on | |
for y,x in enumerate(changed): | |
if ind == changed[y][0]: # Check if this is the first character of a sequence we are meant to skip by checking that 'changed' list of indeces from earlier | |
l.append(query[ind:ind+changed[y][1]]) | |
skip = changed[y][1] - 1 # This is where that 'changed' list comes in. We skip over the characters inside the predefined changes we made. | |
changed.pop(y) # Forget about this one since we already skipped it | |
stop = True | |
if not stop: | |
if unicode_to_text_Matches.has_key(ord(i)): | |
r += '[' + unicode_to_text_Matches[ord(i)] + i + ']' # The same operation as above, replacing unicode characters... | |
if text_to_unicode_Matches.has_key(i): | |
r += '[' + i | |
for val in text_to_unicode_Matches[i]: | |
r += unichr( val ) | |
r += ']' | |
l.append( r ) | |
elif ord(i) >= 0x80: # Sysenter I think. We don't want this at all. | |
pass | |
else: | |
l.append( i ) | |
return l # The list of regexes for each character in the original query | |
# Pre: pass in query and source to search through | |
# Post: returns bool (if the query was found) and the exact match that was found (possibly corrected spelling) | |
def strict(self, query, source): | |
r = re.search(self.regexify(query), source, re.I) | |
if r: | |
return True, r.group(0) | |
return False, query | |
# Same as strict() but uses tolerance | |
def flexible(self, query, source, tolerance=2): | |
if re.search(self.regexify(query), source, re.I): | |
return True, query | |
for i in range(0, len(query)): | |
name = self.regexify(query, listIt = True) | |
for x in range(i, i + tolerance): | |
char = name[x] | |
if char[-1] != '?': | |
if len(char) == 1 or (char[-2] != '?' and name[x] != spaceReplace): | |
name[x] = name[x] + '?\w?\s?' # add a question mark | |
tolerant = ''.join(name) # After we insert the question marks put it all back together into a regex! | |
print tolerant | |
r = re.search(tolerant, str(source), re.I) | |
if r: | |
return True, r.group(0).strip() | |
return False, query # If all failed, I guess the search comes up negative :( |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment