Skip to content

Instantly share code, notes, and snippets.

@attilaolah
Created January 31, 2013 09:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save attilaolah/4681755 to your computer and use it in GitHub Desktop.
Save attilaolah/4681755 to your computer and use it in GitHub Desktop.
import re
class WordFinder(object):
    r"""Searches text for words that conform to the given constraints.

    To use it, instantiate it with any number of strings. These strings will be
    used as constraints. For example, to find all words that contain the letter
    'X' and the number '6', you can do something like this::

    >>> find = WordFinder('X', '6')
    >>> find
    <WordFinder object (using 2 constraints)>

    When you use the finder, you will get a list of words that contain at least
    one character from each string you used when you constructed the finder::

    >>> find('DarkX 666 sixx6 XXX T6FX\tfoo\n\n66X')
    ['T6FX', '66X']

    Let's find the words that have both numbers and letters in them::

    >>> import string
    >>> find = WordFinder(string.digits, string.ascii_letters)
    >>> find('Konica Minolta DiMAGE Z3 4MP Digital Camera 200$ -30% off!')
    ['Z3', '4MP']

    If you also want the punctuation::

    >>> letters_punctuation = string.ascii_letters + string.punctuation
    >>> find = WordFinder(string.digits, letters_punctuation)
    >>> find('Konica Minolta DiMAGE Z3 4MP Digital Camera 200$ -30% off!')
    ['Z3', '4MP', '200$', '-30%']

    Every character of a constraint string is taken literally, so characters
    that are special inside a regular expression are safe to use::

    >>> WordFinder('^-')('a^b c-d ef')
    ['a^b', 'c-d']

    If you don't supply any constraints, all words will be returned::

    >>> find = WordFinder()
    >>> find('Konica Minolta DiMAGE Z3 4MP Digital Camera 200$ -30% off!')
    ... # doctest: +ELLIPSIS
    ['Konica', 'Minolta', 'DiMAGE', 'Z3', '4MP', 'Digital', 'Camera', ...]

    To use custom word separators, supply the `separator` keyword argument::

    >>> find = WordFinder(string.ascii_letters)
    >>> find('Foo, bar! Spam? eggs...')
    ['Foo,', 'bar!', 'Spam?', 'eggs...']
    >>> find = WordFinder(string.ascii_letters, separator=r'\W+')
    >>> find('Foo, bar! Spam? eggs...')
    ['Foo', 'bar', 'Spam', 'eggs']

    """

    def __init__(self, *strings, **kw):
        """Compile the regexes needed for matching.

        :param strings: constraint strings; a word matches a constraint when
            it contains at least one of that string's characters.
        :param separator: (keyword only) regular expression source used to
            split the text into words. Defaults to ``r'\\s+'``.
        """
        self.constraints = [self._char_class(chars) for chars in strings]
        self.separator = re.compile(kw.get('separator', r'\s+'))

    @staticmethod
    def _char_class(chars):
        """Return a compiled regex matching any single character of `chars`."""
        if not chars:
            # '[]' is not a valid pattern. No word can contain a character
            # from an empty string, so use a pattern that never matches.
            return re.compile(r'(?!)')
        # Escape so that ']', '^', '-' and '\' are matched literally instead of
        # being interpreted as character-class syntax.
        return re.compile('[%s]' % re.escape(chars))

    def __call__(self, text):
        """Return the list of words that conform to all the constraints.

        :param text: the string to split into words and filter.
        :returns: the matching words, in the order they appear in `text`.
        """
        return [
            word for word in self.separator.split(text)
            # split() produces '' for leading/trailing separators and None for
            # separator groups that did not participate; neither is a word.
            if word and all(rx.search(word) for rx in self.constraints)
        ]

    def __repr__(self):
        """Tell how many constraints we have."""
        return '<%s object (using %d constraints)>' % (
            self.__class__.__name__, len(self.constraints))
if __name__ == '__main__':
    # Run the doctests embedded in this module through a unittest runner.
    import doctest
    import unittest

    suite = unittest.TestSuite()
    # ELLIPSIS lets expected output abbreviate long results with '...'.
    suite.addTest(
        doctest.DocTestSuite(__name__, optionflags=doctest.ELLIPSIS))
    runner = unittest.TextTestRunner()
    runner.run(suite)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment