Skip to content

Instantly share code, notes, and snippets.

@forslund
Last active September 2, 2021 05:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save forslund/c01636a18ddef0a9bef84dae609f4511 to your computer and use it in GitHub Desktop.
Save forslund/c01636a18ddef0a9bef84dae609f4511 to your computer and use it in GitHub Desktop.
import re
import time
# Noise words that the removal functions strip from a phrase, listed
# longest first. Each word appears exactly once.
words = [
    'where',
    'which',
    'when',
    'what',
    'that',
    'will',
    'from',
    'also',
    'who',
    'how',
    'did',
    'and',
    'but',
    'the',
    'too',
    'why',
    'is',
    'it',
    'do',
    'or',
    'to',
    'a',
]
def str_replace(phrase):
    """Remove noise words from *phrase* using plain string replacement.

    The phrase is padded with one space on each side so that only whole,
    space-delimited words can match.

    Args:
        phrase: Text to filter.

    Returns:
        The phrase without any entry of ``words``, with whitespace
        collapsed to single spaces and no leading or trailing padding.
    """
    phrase = " " + phrase + " "
    for word in words:
        mtch = " " + word + " "
        # str.replace only substitutes non-overlapping matches, so in
        # " the the " the second word survives a single pass; repeat
        # until no match is left.
        while mtch in phrase:
            phrase = phrase.replace(mtch, " ")
    # Drop the padding added above and normalise the spacing.
    return " ".join(phrase.split())
def for_regex(phrase):
    """Strip noise words by running one regex substitution per word.

    Each entry of ``words`` is matched between word boundaries and
    replaced by the empty string; the leftover text is then re-joined
    with single spaces.
    """
    remaining = phrase
    for noise in words:
        pattern = r'\b' + noise + r'\b'
        remaining = re.sub(pattern, '', remaining)
    tokens = remaining.split()
    return " ".join(tokens)
def single_regex(phrase):
    """Strip noise words with one alternation regex built on every call.

    All entries of ``words`` are combined into a single
    word-boundary-delimited alternation; matches are deleted and the
    leftover text is re-joined with single spaces.
    """
    alternation = '|'.join(words)
    pattern = r'\b(' + alternation + r')\b'
    tokens = re.sub(pattern, '', phrase).split()
    return ' '.join(tokens)
# Alternation of all noise words between word boundaries, compiled once
# when the module is loaded.
REGEX = re.compile(r'\b({})\b'.format('|'.join(words)))
def precalculated_single_regex(phrase):
    """Strip noise words using the module-level compiled ``REGEX``.

    Matches are deleted and the leftover text is re-joined with single
    spaces.
    """
    without_noise = REGEX.sub('', phrase)
    tokens = without_noise.split()
    return ' '.join(tokens)
def list_join(phrase):
    """Strip noise words by filtering the whitespace-split tokens.

    Tokens found in ``words`` are dropped; the rest keep their order and
    are joined with single spaces.
    """
    kept = []
    for token in phrase.split():
        if token in words:
            continue
        kept.append(token)
    return ' '.join(kept)
def dont_preserve_order(phrase):
    """Return the distinct non-noise words of *phrase* in arbitrary order.

    The phrase is lower-cased, split on whitespace and reduced to a set,
    so repeated words appear once and the original word order is lost.

    Args:
        phrase: Text to filter.

    Returns:
        The remaining words joined by single spaces, or an empty string
        when nothing is left.
    """
    # split() without arguments never yields empty tokens, so no stray ''
    # element can end up in the set and produce a doubled space.
    remaining = set(phrase.lower().split()) - set(words)
    return ' '.join(remaining)
def _time_calls(func, phrase, iterations):
    """Return the seconds taken by *iterations* calls of ``func(phrase)``."""
    # perf_counter is the clock intended for measuring short durations.
    start = time.perf_counter()
    for _ in range(iterations):
        func(phrase)
    return time.perf_counter() - start


# Time every implementation on the same phrase and print one result line
# per function, labelled with the function's name.
for _func in (
    str_replace,
    for_regex,
    single_regex,
    precalculated_single_regex,
    list_join,
    dont_preserve_order,
):
    print(
        _func.__name__,
        _time_calls(_func, "his name is andy and he is cool", 100000),
    )
@ken-mycroft
Copy link

I think it was these routines:

def remove_noise_dont_preserve_order(self, phrase):
    """Return the distinct non-noise words of *phrase* in arbitrary order.

    The phrase is split on whitespace and reduced to a set, so repeated
    words appear once and the original word order is lost.

    Args:
        phrase: Text to filter.

    Returns:
        The words not listed in ``self.translated_noise_words`` joined by
        single spaces, or an empty string when nothing is left.
    """
    # split() without arguments never yields empty tokens, so no stray ''
    # element can end up in the set and produce a doubled space.
    remaining = set(phrase.split()) - set(self.translated_noise_words)
    return ' '.join(remaining)

def remove_noise_preserve_order(self, phrase):
    """Remove noise words from *phrase*, keeping the other words in order.

    Args:
        phrase: Text to filter.

    Returns:
        The phrase without any entry of ``self.translated_noise_words``,
        with whitespace collapsed to single spaces.
    """
    # Pad so that noise words at either end are space-delimited too.
    phrase = " " + phrase + " "
    for word in self.translated_noise_words:
        mtch = " " + word + " "
        # str.replace only substitutes non-overlapping matches, so in
        # " the the " the second word survives a single pass; repeat
        # until no match is left.
        while mtch in phrase:
            phrase = phrase.replace(mtch, " ")
    # split/join collapses runs of whitespace and drops the padding.
    return " ".join(phrase.split())

@forslund
Copy link
Author

forslund commented Sep 2, 2021

Thanks. Will update with the missing routines.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment