Created
January 31, 2013 09:50
-
-
Save attilaolah/4681755 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
class WordFinder(object):
    """Searches text for words that conform to the given constraints.

    To use it, instantiate it with any number of strings. These strings will be
    used as constraints. For example, to find all words that contain the letter
    'X' and the number '6', you can do something like this::

        >>> find = WordFinder('X', '6')
        >>> find
        <WordFinder object (using 2 constraints)>

    When you use the finder, you will get a list of words that contain at least
    one character from each string you used when you constructed the finder::

        >>> find('DarkX 666 sixx6 XXX T6FX\\tfoo\\n\\n66X')
        ['T6FX', '66X']

    Let's find the words that have both numbers and letters in them::

        >>> import string
        >>> find = WordFinder(string.digits, string.ascii_letters)
        >>> find('Konica Minolta DiMAGE Z3 4MP Digital Camera 200$ -30% off!')
        ['Z3', '4MP']

    If you also want the punctuation::

        >>> letters_punctuation = string.ascii_letters + string.punctuation
        >>> find = WordFinder(string.digits, letters_punctuation)
        >>> find('Konica Minolta DiMAGE Z3 4MP Digital Camera 200$ -30% off!')
        ['Z3', '4MP', '200$', '-30%']

    Every character of a constraint is taken literally, so characters that
    are special inside a regex character class ('-', '^', ']', backslash)
    are safe to use::

        >>> WordFinder('a-c')('b - xa')
        ['-', 'xa']

    If you don't supply any constraints, all words will be returned::

        >>> find = WordFinder()
        >>> find('Konica Minolta DiMAGE Z3 4MP Digital Camera 200$ -30% off!')
        ... # doctest: +ELLIPSIS
        ['Konica', 'Minolta', 'DiMAGE', 'Z3', '4MP', 'Digital', 'Camera', ...]

    To use custom word separators, supply the `separator` keyword argument::

        >>> find = WordFinder(string.ascii_letters)
        >>> find('Foo, bar! Spam? eggs...')
        ['Foo,', 'bar!', 'Spam?', 'eggs...']
        >>> find = WordFinder(string.ascii_letters, separator=r'\\W+')
        >>> find('Foo, bar! Spam? eggs...')
        ['Foo', 'bar', 'Spam', 'eggs']

    """

    def __init__(self, *strings, **kw):
        """Compile the regexes needed for matching.

        :param strings: constraint strings; a word matches when it contains
            at least one character from every one of them.
        :param separator: optional keyword argument, a regular expression
            (string) used to split the text into words. Defaults to runs of
            whitespace (``\\s+``).
        """
        self.constraints = [self._compile_constraint(chars)
                            for chars in strings]
        self.separator = re.compile(kw.get('separator', r'\s+'))

    @staticmethod
    def _compile_constraint(chars):
        """Return a regex matching any single character found in `chars`."""
        if not chars:
            # An empty character class ('[]') is not a valid pattern. No word
            # can contain "a character from the empty string", so use a
            # pattern that never matches instead of failing to compile.
            return re.compile(r'(?!)')
        # Escape the characters so that '-', '^', ']' and backslash are taken
        # literally rather than as character-class syntax.
        return re.compile('[%s]' % re.escape(chars))

    def __call__(self, text):
        """Return the list of words that conform to all the constraints.

        :param text: the string to split into words and search.
        :returns: the matching words, in their order of appearance.
        """
        # Splitting produces empty strings when the text starts or ends with
        # a separator; those are not words, so drop them.
        return [word for word in self.separator.split(text)
                if word and all(rx.search(word) for rx in self.constraints)]

    def __repr__(self):
        """Tell how many constraints we have."""
        return '<%s object (using %d constraints)>' % (
            self.__class__.__name__, len(self.constraints))
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module
    # through the unittest text runner. ELLIPSIS lets expected output use
    # "..." to abbreviate long results.
    import doctest
    import unittest

    suite = unittest.TestSuite(
        [doctest.DocTestSuite(__name__, optionflags=doctest.ELLIPSIS)])
    unittest.TextTestRunner().run(suite)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment