Created

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist
View gist:463cf925595874ba64b9
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
from nltk import regexp_tokenize
 
def tokenize(term):
    """Split ``term`` into tokens using a verbose, case-insensitive regex.

    The pattern is an ordered alternation: at each position the first
    alternative that matches wins, so the more specific forms (bill numbers,
    abbreviations, honorifics, numbers, times) are listed before the generic
    word and punctuation rules.

    Args:
        term: The text to tokenize.

    Returns:
        The result of ``regexp_tokenize(term, regex)``: the matched
        substrings, in the order they occur in ``term``.
    """
    # Adapted from "Natural Language Processing with Python".
    #
    # Every group below is non-capturing, i.e. (?:...).  regexp_tokenize is
    # built on re.findall, which returns the *groups* rather than the whole
    # match as soon as the pattern contains a capturing group, so plain
    # (...) groups would turn the tokens into tuples of fragments.
    #
    # (?x) ignores unescaped whitespace and treats '#' as a comment marker;
    # (?i) makes every letter class case-insensitive.
    regex = r'''(?xi)
          (?:H|S)\.\ ?(?:(?:J|R)\.\ )?(?:Con\.\ )?(?:Res\.\ )?\d+  # Bills
        | (?:[A-Z]\.)+                # Abbreviations (U.S.A., etc.)
        | (?:[A-Z]+\&[A-Z]+)          # Internal ampersands (AT&T, etc.)
        | (?:Mr\.|Dr\.|Mrs\.|Ms\.)    # Mr., Mrs., etc.
        | \d*\.\d+                    # Numbers with decimal points.
        | \d\d?:\d\d                  # Times.
        | \$?[,\.0-9]+\d              # Numbers with thousands separators (incl. currency).
        | (?:[ap]\.m\.)               # a.m., p.m., A.M., P.M. (case handled by the i flag)
        | \w+(?:(?:-|')\w+)*          # Words with optional internal hyphens or apostrophes.
        | \$?\d+(?:\.\d+)?%?          # Currency and percentages.
        | (?<=\b)\.\.\.(?=\b)         # Ellipses surrounded by word borders.
        | [][.,;"'?():_`-]            # Standalone punctuation; the hyphen is last so it is a literal, not a range.
    '''
    # Punctuation tokens are intentionally left in the result: per the
    # original author's note, callers test token length instead of stripping
    # punctuation here.
    return regexp_tokenize(term, regex)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.