Filter delimited profanity - does not handle character substitution
'''
Character substitution
====
Not handled at all. Should be reasonably straightforward to plug into the existing code.
Roughly:

    def expand_substitutions(wordlist, sub_map):
        for word in wordlist:
            for substitution in expand_word(word, sub_map):
                yield substitution

    def expand_word(word, sub_map):
        # yield all permutations of all possible substitutions
        # using sub_map (see the sketch below)
To plug this into the existing generate_patterns:

    def generate_patterns(wordlist, sub_map, delims, max_delims):
        wordlist = expand_substitutions(wordlist, sub_map)
        for word in wordlist:
            for pattern in compiled_permutations(word, delims, max_delims):
                yield pattern
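A minimal sketch of expand_word, assuming sub_map maps a character to a
list of substitute characters (e.g. {'a': ['@', '4']}):

    def expand_word(word, sub_map):
        # Each character expands to itself plus its substitutes; the
        # cartesian product of those choices enumerates every variant.
        choices = [[char] + sub_map.get(char, []) for char in word]
        for variant in itertools.product(*choices):
            yield "".join(variant)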
Performance improvements
====
The largest possible speed increase would come from either:
1) Combining all patterns s.t. generate_pattern_list returns one compiled regex
2) Grouping patterns s.t. generate_pattern_list returns a smaller list of compiled regexes
---
1) Makes is_profane an O(1) operation instead of O(n), but could easily run into internal parser limits. Probably requires some clever permutation/backtracking/grouping logic when generating all regexes for a word, or grouping along delimiter permutations.
2) is_profane remains O(k), where k is the number of regex groups: basically a constant-factor improvement over the existing O(n) implementation. The best grouping algorithm wouldn't necessarily be one that divides n evenly among the combined regexes, but one that minimizes the maximum execution time of any combined search. Alternatively, a hotspot reordering algorithm could move the faster, more commonly matching regexes to the front of the list over time; this could be applied regardless of grouping, as long as (1) isn't used. Depending on the average runtime of (1), this could give better practical performance in the matching case, with (likely) worse performance in the non-matching case. Sketches of both approaches follow.
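A minimal sketch of approach (1), assuming the combined pattern stays
under Python's internal regex limits:

    def generate_pattern(wordlist, delims, max_delims):
        # One alternation over every per-word pattern; is_profane then
        # collapses to a single .search() call.
        branches = (pattern
                    for word in wordlist
                    for pattern in regex_permutations(word, delims, max_delims))
        return re.compile("|".join(branches))

And one possible hotspot reordering for (2): a simple move-to-front
heuristic over the existing, ungrouped pattern list:

    def is_profane_reordered(msg, patternlist):
        for i, pattern in enumerate(patternlist):
            if pattern.search(msg) is not None:
                if i:
                    # promote the matching pattern so frequent matches
                    # are found sooner on later calls
                    patternlist.insert(0, patternlist.pop(i))
                return True
        return False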
'''
import re
import itertools
join = lambda x: "".join(x)
def inject_substring(string, substring):
    # Interleave substring between each pair of adjacent characters:
    # inject_substring("abc", "-") -> "a-b-c"
    sections = []
    for char in string:
        sections.append(char)
        sections.append(substring)
    sections.pop()  # drop the trailing substring
    return join(sections)
def permutations_with_repetition(iterable, r=None):
    # Unique permutations of an iterable that may contain repeated
    # elements; the set() collapses duplicate orderings.
    for unique in set(itertools.permutations(iterable, r)):
        yield unique
def anchored_permutation(string):
    # Permute only the interior characters, keeping the first and last
    # characters anchored in place ("word" -> "word", "wrod", ...).
    if len(string) < 3:
        # nothing to permute; yield the string itself so delimiters are
        # still injected between its characters
        yield string
    else:
        first = string[0]
        last = string[-1]
        for permutation in permutations_with_repetition(string[1:-1]):
            permute_str = join(permutation)
            yield join((first, permute_str, last))
# Matches 0..max_delims delimiter characters. Note that delims is dropped
# into a character class unescaped, so characters like ']' need escaping.
sub_pattern_fmt = "[{d}]{{0,{m}}}"

def regex_permutations(string, delims, max_delims):
    sub_pattern = sub_pattern_fmt.format(d=delims, m=max_delims)
    for string_permutation in anchored_permutation(string):
        yield inject_substring(string_permutation, sub_pattern)
def compiled_permutations(string, delims, max_delims):
    for permutation in regex_permutations(string, delims, max_delims):
        yield re.compile(permutation)
def generate_patterns(wordlist, delims, max_delims):
    for word in wordlist:
        for pattern in compiled_permutations(word, delims, max_delims):
            yield pattern
def is_profane(msg, patternlist):
    # O(n) in the number of patterns; see the docstring for notes on
    # collapsing the list into fewer compiled regexes.
    for pattern in patternlist:
        if pattern.search(msg) is not None:
            return True
    return False
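# Minimal usage sketch; the wordlist and delimiter set below are
# illustrative assumptions, not part of the original gist.
if __name__ == "__main__":
    patterns = list(generate_patterns(["dang"], delims=".*_ -", max_delims=3))
    print(is_profane("d.a.n.g it", patterns))   # True
    print(is_profane("hello world", patterns))  # False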