Skip to content

Instantly share code, notes, and snippets.

@artursapek
Created February 17, 2012 23:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save artursapek/1856075 to your computer and use it in GitHub Desktop.
Save artursapek/1856075 to your computer and use it in GitHub Desktop.
A tolerant search function
import re
class finder():
# I got these dicts from someone on Stackoverflow. They translate common letters to visually similar unicode keys.
# This will let us match an N with a tilde to a regular N, for example
global unicode_to_text_Matches
unicode_to_text_Matches = { 161: '!', 192: 'A', 193: 'A', 194: 'A', 195: 'A', 196: 'A', 197: 'A', 198: 'Ae', 199: 'C', 200: 'E', 201: 'E', 202: 'E', 203: 'E', 204: 'I', 205: 'I',
206: 'I', 207: 'I', 208: 'Th', 209: 'N', 210: 'O', 211: 'O', 212: 'O', 213: 'O', 214: 'O', 216: 'O', 217: 'U', 218: 'U', 219: 'U', 220: 'U', 221: 'Y', 222: 'th', 223: 'ss', 224: 'a',
225: 'a', 226: 'a', 227: 'a', 228: 'a', 229: 'a', 230: 'ae', 231: 'c', 232: 'e', 233: 'e', 234: 'e', 235: 'e', 236: 'i', 237: 'i', 238: 'i', 239: 'i', 240: 'th', 241: 'n', 242: 'o',
243: 'o', 244: 'o', 245: 'o', 246: 'o', 248: 'o', 249: 'u', 250: 'u', 251: 'u', 252: 'u', 253: 'y', 254: 'th', 255: 'y' }
global text_to_unicode_Matches
text_to_unicode_Matches = { '!': [161], 'A': [192, 193, 194, 195, 196, 197], 'C': [199], 'E': [200, 201, 202, 203], 'I': [204, 205, 206, 207], 'O': [210, 211, 212, 213, 214, 216],
'N': [209], 'U': [217, 218, 219, 220], 'Y': [221], 'e': [232, 233, 234, 235], 'a': [224, 225, 226, 227, 228, 229], 'c': [231],
'i': [236, 237, 238, 239], 'o': [242, 243, 244, 245, 246, 248], 'n': [241], 'ss': [223], 'u': [249, 250, 251, 252], 'y': [253, 255] }
global spaceReplace
spaceReplace = '[\\[\\]\\(\\)\\s\\.\\,\\-\\.\\_]+?'
# Pre: pass in any word or phrase
# Post: returns a regex that flexibly will match that with unicode chars, and different characters used for spaces.
# When run with listIt as True, returns a list of the characters individually regexified instead of a complete string for the query.
# You can ''.join() this list and it comes out the same as when listIt is False.
# This lets you iterate over it as the original characters.
def regexify(self, query, listIt=False):
questionOut = ['[', ']']
change = { '. ':'\.?\s?', ' and ': '(\sand\s|\s&\s|\s&\s)?', ' & ': '(\sand\s|\s&\s|\s&\s)?', 'the ': '(the\s)?', ' ': spaceReplace}
if not listIt: # If a list wasn't request we just return a simple string that can be used as a regex for the query.
for q in questionOut:
query = query.replace(q, q + '?')
for c in change:
query = query.replace(c, change[c])
r = u''
for i in query: # Take care of unicode chars.
if unicode_to_text_Matches.has_key(ord(i)):
r += '[' + unicode_to_text_Matches[ord(i)] + i + ']'
if text_to_unicode_Matches.has_key(i):
r += '[' + i
for val in text_to_unicode_Matches[i]:
r += unichr( val )
r += ']'
elif ord(i) >= 0x80:
pass
else:
r += i
return r
else: # if listIt is True... we take a slightly different route and use a list obviously
# First we go through the query again and see if we need to make changes based on the predefined 'change' dict up above
changed = [] # Here we will log exactly where we made these changes. This is necessary for later
for i,n in enumerate(query): # Iterating over the query character by character...
for ind,ch in enumerate(change): # ... we check if the query matches any of the 'change' keys from that character forward.
if query[i:i + len(ch)] == ch: # If so, we define the culprit...
culprit = query[i:i + len(ch)]
for duba in range(0, query.count(culprit)):
w = query.find(culprit) # I'm running out of variables
if w != -1:
query = query[:w] + change[ch] + query[w + len(ch):] # For each occurence of the culprit we split the query, insert the replacement, and stitch it back together
changed.append([w, len(change[ch])]) # We need to do it this way and not with str.replace() so we can keep track of the indeces where we did this for later
# Now we have an list like this [[1, 3], [5, 6]] of where we made these changes and how long they were
l = [] # The list we'll be returning
skip = 0
for ind,i in enumerate(query):
r = ''
stop = False
if skip > 0:
skip -= 1 # If we're skipping this character just tack one off 'skip' and go again
else: # If we'r not, we carry on
for y,x in enumerate(changed):
if ind == changed[y][0]: # Check if this is the first character of a sequence we are meant to skip by checking that 'changed' list of indeces from earlier
l.append(query[ind:ind+changed[y][1]])
skip = changed[y][1] - 1 # This is where that 'changed' list comes in. We skip over the characters inside the predefined changes we made.
changed.pop(y) # Forget about this one since we already skipped it
stop = True
if not stop:
if unicode_to_text_Matches.has_key(ord(i)):
r += '[' + unicode_to_text_Matches[ord(i)] + i + ']' # The same operation as above, replacing unicode characters...
if text_to_unicode_Matches.has_key(i):
r += '[' + i
for val in text_to_unicode_Matches[i]:
r += unichr( val )
r += ']'
l.append( r )
elif ord(i) >= 0x80: # Sysenter I think. We don't want this at all.
pass
else:
l.append( i )
return l # The list of regexes for each character in the original query
# Pre: pass in query and source to search through
# Post: returns bool (if the query was found) and the exact match that was found (possibly corrected spelling)
def strict(self, query, source):
r = re.search(self.regexify(query), source, re.I)
if r:
return True, r.group(0)
return False, query
# Same as strict() but uses tolerance
def flexible(self, query, source, tolerance=2):
if re.search(self.regexify(query), source, re.I):
return True, query
for i in range(0, len(query)):
name = self.regexify(query, listIt = True)
for x in range(i, i + tolerance):
char = name[x]
if char[-1] != '?':
if len(char) == 1 or (char[-2] != '?' and name[x] != spaceReplace):
name[x] = name[x] + '?\w?\s?' # add a question mark
tolerant = ''.join(name) # After we insert the question marks put it all back together into a regex!
print tolerant
r = re.search(tolerant, str(source), re.I)
if r:
return True, r.group(0).strip()
return False, query # If all failed, I guess the search comes up negative :(
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment