artursapek/looseSearch.py

## looseSearch.py
import re

class finder():

    # I got these dicts from someone on Stackoverflow. They translate common letters to visually similar unicode keys.
    # This will let us match an N with a tilde to a regular N, for example
    global unicode_to_text_Matches
    unicode_to_text_Matches = { 161: '!', 192: 'A', 193: 'A', 194: 'A', 195: 'A', 196: 'A', 197: 'A', 198: 'Ae', 199: 'C', 200: 'E', 201: 'E', 202: 'E', 203: 'E', 204: 'I', 205: 'I',
                                        206: 'I', 207: 'I', 208: 'Th', 209: 'N', 210: 'O', 211: 'O', 212: 'O', 213: 'O', 214: 'O', 216: 'O', 217: 'U', 218: 'U', 219: 'U', 220: 'U', 221: 'Y', 222: 'th', 223: 'ss', 224: 'a',
                                        225: 'a', 226: 'a', 227: 'a', 228: 'a', 229: 'a', 230: 'ae', 231: 'c', 232: 'e', 233: 'e', 234: 'e', 235: 'e', 236: 'i', 237: 'i', 238: 'i', 239: 'i', 240: 'th', 241: 'n', 242: 'o',
                                        243: 'o', 244: 'o', 245: 'o', 246: 'o', 248: 'o', 249: 'u', 250: 'u', 251: 'u', 252: 'u', 253: 'y', 254: 'th', 255: 'y' }
    global text_to_unicode_Matches
    text_to_unicode_Matches = { '!': [161], 'A': [192, 193, 194, 195, 196, 197], 'C': [199], 'E': [200, 201, 202, 203], 'I': [204, 205, 206, 207], 'O': [210, 211, 212, 213, 214, 216],
                                'N': [209], 'U': [217, 218, 219, 220], 'Y': [221], 'e': [232, 233, 234, 235], 'a': [224, 225, 226, 227, 228, 229], 'c': [231],
                                'i': [236, 237, 238, 239], 'o': [242, 243, 244, 245, 246, 248], 'n': [241], 'ss': [223], 'u': [249, 250, 251, 252], 'y': [253, 255] }
    global spaceReplace
    spaceReplace = '[\\[\\]\\(\\)\\s\\.\\,\\-\\.\\_]+?'

    # Pre: pass in any word or phrase
    # Post: returns a regex that flexibly will match that with unicode chars, and different characters used for spaces.
        # When run with listIt as True, returns a list of the characters individually regexified instead of a complete string for the query.
        # You can ''.join() this list and it comes out the same as when listIt is False.
        # This lets you iterate over it as the original characters.
    def regexify(self, query, listIt=False):
        questionOut = ['[', ']']
        change = { '. ':'\.?\s?', ' and ': '(\sand\s|\s&\s|\s&amp;\s)?', ' & ': '(\sand\s|\s&\s|\s&amp;\s)?', 'the ': '(the\s)?', ' ': spaceReplace}

        if not listIt: # If a list wasn't request we just return a simple string that can be used as a regex for the query.
            for q in questionOut:
                query = query.replace(q, q + '?')
            for c in change:
                query = query.replace(c, change[c])
            r = u''
            for i in query: # Take care of unicode chars.
                if unicode_to_text_Matches.has_key(ord(i)):
                    r += '[' + unicode_to_text_Matches[ord(i)] + i + ']'
                if text_to_unicode_Matches.has_key(i):
                    r += '[' + i
                    for val in text_to_unicode_Matches[i]:
                        r += unichr( val )
                    r += ']'
                elif ord(i) >= 0x80:
                    pass
                else:
                    r += i
            return r
        else: # if listIt is True... we take a slightly different route and use a list obviously
              # First we go through the query again and see if we need to make changes based on the predefined 'change' dict up above
            changed = []                                # Here we will log exactly where we made these changes. This is necessary for later
            for i,n in enumerate(query):                # Iterating over the query character by character...
                for ind,ch in enumerate(change):        # ... we check if the query matches any of the 'change' keys from that character forward.
                    if query[i:i + len(ch)] == ch:      # If so, we define the culprit...
                        culprit = query[i:i + len(ch)]
                        for duba in range(0, query.count(culprit)):
                            w = query.find(culprit)     # I'm running out of variables
                            if w != -1:
                                query = query[:w] + change[ch] + query[w + len(ch):] # For each occurence of the culprit we split the query, insert the replacement, and stitch it back together
                                changed.append([w, len(change[ch])]) # We need to do it this way and not with str.replace() so we can keep track of the indeces where we did this for later
                        # Now we have an list like this [[1, 3], [5, 6]] of where we made these changes and how long they were
            l = [] # The list we'll be returning
            skip = 0
            for ind,i in enumerate(query):
                r = ''
                stop = False
                if skip > 0:
                    skip -= 1  # If we're skipping this character just tack one off 'skip' and go again
                else:          # If we'r not, we carry on
                    for y,x in enumerate(changed):
                        if ind == changed[y][0]: # Check if this is the first character of a sequence we are meant to skip by checking that 'changed' list of indeces from earlier
                            l.append(query[ind:ind+changed[y][1]])
                            skip = changed[y][1] - 1 # This is where that 'changed' list comes in. We skip over the characters inside the predefined changes we made.
                            changed.pop(y)           # Forget about this one since we already skipped it
                            stop = True
                    if not stop:
                        if unicode_to_text_Matches.has_key(ord(i)):
                            r +=  '[' + unicode_to_text_Matches[ord(i)] + i + ']' # The same operation as above, replacing unicode characters...
                        if text_to_unicode_Matches.has_key(i):
                            r += '[' + i
                            for val in text_to_unicode_Matches[i]:
                                r += unichr( val )
                            r +=  ']'
                            l.append( r )
                        elif ord(i) >= 0x80: # Sysenter I think. We don't want this at all.
                            pass
                        else:
                            l.append( i )
            return l # The list of regexes for each character in the original query

    # Pre: pass in query and source to search through
    # Post: returns bool (if the query was found) and the exact match that was found (possibly corrected spelling)

    def strict(self, query, source):
        r = re.search(self.regexify(query), source, re.I)
        if r:
            return True, r.group(0)
        return False, query

    # Same as strict() but uses tolerance

    def flexible(self, query, source, tolerance=2):
        if re.search(self.regexify(query), source, re.I):
            return True, query
        for i in range(0, len(query)):
            name = self.regexify(query, listIt = True)
            for x in range(i, i + tolerance):
                char = name[x]
                if char[-1] != '?':
                    if len(char) == 1 or (char[-2] != '?' and name[x] != spaceReplace):
                        name[x] = name[x] + '?\w?\s?' # add a question mark
            tolerant = ''.join(name) # After we insert the question marks put it all back together into a regex!
            print tolerant
            r = re.search(tolerant, str(source), re.I)
            if r:
                return True, r.group(0).strip()
        return False, query # If all failed, I guess the search comes up negative :(
	import re

	class finder():

	# I got these dicts from someone on Stackoverflow. They translate common letters to visually similar unicode keys.
	# This will let us match an N with a tilde to a regular N, for example
	global unicode_to_text_Matches
	unicode_to_text_Matches = { 161: '!', 192: 'A', 193: 'A', 194: 'A', 195: 'A', 196: 'A', 197: 'A', 198: 'Ae', 199: 'C', 200: 'E', 201: 'E', 202: 'E', 203: 'E', 204: 'I', 205: 'I',
	206: 'I', 207: 'I', 208: 'Th', 209: 'N', 210: 'O', 211: 'O', 212: 'O', 213: 'O', 214: 'O', 216: 'O', 217: 'U', 218: 'U', 219: 'U', 220: 'U', 221: 'Y', 222: 'th', 223: 'ss', 224: 'a',
	225: 'a', 226: 'a', 227: 'a', 228: 'a', 229: 'a', 230: 'ae', 231: 'c', 232: 'e', 233: 'e', 234: 'e', 235: 'e', 236: 'i', 237: 'i', 238: 'i', 239: 'i', 240: 'th', 241: 'n', 242: 'o',
	243: 'o', 244: 'o', 245: 'o', 246: 'o', 248: 'o', 249: 'u', 250: 'u', 251: 'u', 252: 'u', 253: 'y', 254: 'th', 255: 'y' }
	global text_to_unicode_Matches
	text_to_unicode_Matches = { '!': [161], 'A': [192, 193, 194, 195, 196, 197], 'C': [199], 'E': [200, 201, 202, 203], 'I': [204, 205, 206, 207], 'O': [210, 211, 212, 213, 214, 216],
	'N': [209], 'U': [217, 218, 219, 220], 'Y': [221], 'e': [232, 233, 234, 235], 'a': [224, 225, 226, 227, 228, 229], 'c': [231],
	'i': [236, 237, 238, 239], 'o': [242, 243, 244, 245, 246, 248], 'n': [241], 'ss': [223], 'u': [249, 250, 251, 252], 'y': [253, 255] }
	global spaceReplace
	spaceReplace = '[\\[\\]\\(\\)\\s\\.\\,\\-\\.\\_]+?'

	# Pre: pass in any word or phrase
	# Post: returns a regex that flexibly will match that with unicode chars, and different characters used for spaces.
	# When run with listIt as True, returns a list of the characters individually regexified instead of a complete string for the query.
	# You can ''.join() this list and it comes out the same as when listIt is False.
	# This lets you iterate over it as the original characters.
	def regexify(self, query, listIt=False):
	questionOut = ['[', ']']
	change = { '. ':'\.?\s?', ' and ': '(\sand\s\|\s&\s\|\s&\s)?', ' & ': '(\sand\s\|\s&\s\|\s&\s)?', 'the ': '(the\s)?', ' ': spaceReplace}

	if not listIt: # If a list wasn't request we just return a simple string that can be used as a regex for the query.
	for q in questionOut:
	query = query.replace(q, q + '?')
	for c in change:
	query = query.replace(c, change[c])
	r = u''
	for i in query: # Take care of unicode chars.
	if unicode_to_text_Matches.has_key(ord(i)):
	r += '[' + unicode_to_text_Matches[ord(i)] + i + ']'
	if text_to_unicode_Matches.has_key(i):
	r += '[' + i
	for val in text_to_unicode_Matches[i]:
	r += unichr( val )
	r += ']'
	elif ord(i) >= 0x80:
	pass
	else:
	r += i
	return r
	else: # if listIt is True... we take a slightly different route and use a list obviously
	# First we go through the query again and see if we need to make changes based on the predefined 'change' dict up above
	changed = [] # Here we will log exactly where we made these changes. This is necessary for later
	for i,n in enumerate(query): # Iterating over the query character by character...
	for ind,ch in enumerate(change): # ... we check if the query matches any of the 'change' keys from that character forward.
	if query[i:i + len(ch)] == ch: # If so, we define the culprit...
	culprit = query[i:i + len(ch)]
	for duba in range(0, query.count(culprit)):
	w = query.find(culprit) # I'm running out of variables
	if w != -1:
	query = query[:w] + change[ch] + query[w + len(ch):] # For each occurence of the culprit we split the query, insert the replacement, and stitch it back together
	changed.append([w, len(change[ch])]) # We need to do it this way and not with str.replace() so we can keep track of the indeces where we did this for later
	# Now we have an list like this [[1, 3], [5, 6]] of where we made these changes and how long they were
	l = [] # The list we'll be returning
	skip = 0
	for ind,i in enumerate(query):
	r = ''
	stop = False
	if skip > 0:
	skip -= 1 # If we're skipping this character just tack one off 'skip' and go again
	else: # If we'r not, we carry on
	for y,x in enumerate(changed):
	if ind == changed[y][0]: # Check if this is the first character of a sequence we are meant to skip by checking that 'changed' list of indeces from earlier
	l.append(query[ind:ind+changed[y][1]])
	skip = changed[y][1] - 1 # This is where that 'changed' list comes in. We skip over the characters inside the predefined changes we made.
	changed.pop(y) # Forget about this one since we already skipped it
	stop = True
	if not stop:
	if unicode_to_text_Matches.has_key(ord(i)):
	r += '[' + unicode_to_text_Matches[ord(i)] + i + ']' # The same operation as above, replacing unicode characters...
	if text_to_unicode_Matches.has_key(i):
	r += '[' + i
	for val in text_to_unicode_Matches[i]:
	r += unichr( val )
	r += ']'
	l.append( r )
	elif ord(i) >= 0x80: # Sysenter I think. We don't want this at all.
	pass
	else:
	l.append( i )
	return l # The list of regexes for each character in the original query

	# Pre: pass in query and source to search through
	# Post: returns bool (if the query was found) and the exact match that was found (possibly corrected spelling)

	def strict(self, query, source):
	r = re.search(self.regexify(query), source, re.I)
	if r:
	return True, r.group(0)
	return False, query

	# Same as strict() but uses tolerance

	def flexible(self, query, source, tolerance=2):
	if re.search(self.regexify(query), source, re.I):
	return True, query
	for i in range(0, len(query)):
	name = self.regexify(query, listIt = True)
	for x in range(i, i + tolerance):
	char = name[x]
	if char[-1] != '?':
	if len(char) == 1 or (char[-2] != '?' and name[x] != spaceReplace):
	name[x] = name[x] + '?\w?\s?' # add a question mark
	tolerant = ''.join(name) # After we insert the question marks put it all back together into a regex!
	print tolerant
	r = re.search(tolerant, str(source), re.I)
	if r:
	return True, r.group(0).strip()
	return False, query # If all failed, I guess the search comes up negative :(