Skip to content

Instantly share code, notes, and snippets.

@mjbommar
Last active August 29, 2015 14:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mjbommar/0d8d05f2ab455feaf329 to your computer and use it in GitHub Desktop.
Save mjbommar/0d8d05f2ab455feaf329 to your computer and use it in GitHub Desktop.
Fuzzy sentence matching in Python - Bommarito Consulting, LLC: http://bommaritollc.com/2014/06/fuzzy-match-sentences-in-python
# ## IPython Notebook for [Bommarito Consulting](http://bommaritollc.com/) Blog Post
# ### **Link**: [Fuzzy sentence matching in Python](http://bommaritollc.com/2014/06/fuzzy-match-sentences-in-python): http://bommaritollc.com/2014/06/fuzzy-match-sentences-in-python
# **Author**: [Michael J. Bommarito II](https://www.linkedin.com/in/bommarito/)
# Imports
import nltk.corpus
import nltk.tokenize.punkt
import nltk.stem.snowball
from nltk.corpus import wordnet
import string
# Build the stopword list used to filter tokens: NLTK's default English
# stopwords, plus every ASCII punctuation character, plus the empty string.
# The empty string is included so that a token which is reduced to nothing
# once its surrounding punctuation is stripped is also treated as a stopword.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(list(string.punctuation) + [''])
def get_wordnet_pos(pos_tag):
    """Translate a (token, tag) pair into a (token, WordNet POS) pair.

    pos_tag is a two-item sequence whose first item is the token and whose
    second item is a part-of-speech tag string, such as the pairs returned
    by nltk.pos_tag.  The tag's leading letter selects the WordNet constant:
    J -> adjective, V -> verb, N -> noun, R -> adverb.  Any other tag falls
    back to noun.
    """
    token, tag = pos_tag[0], pos_tag[1]

    # Checked in order; the first matching prefix wins.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return (token, wordnet_pos)

    # Unrecognised tags are treated as nouns.
    return (token, wordnet.NOUN)
# Create tokenizer and lemmatizer.
# PunktWordTokenizer was removed from NLTK in the 3.x series, so referencing
# it raises AttributeError on current installs.  Keep using it where it still
# exists and otherwise fall back to WordPunctTokenizer, which is regex-based,
# needs no downloaded model data and exposes the same .tokenize(text) method.
# It emits punctuation as separate tokens, which the matcher below discards
# through its punctuation stripping and stopword filter.
try:
    tokenizer = nltk.tokenize.punkt.PunktWordTokenizer()
except AttributeError:
    tokenizer = nltk.tokenize.WordPunctTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
def _content_lemmas(text):
    """Return the ordered lemmas of the non-stopword tokens in text."""
    lemmas = []
    tagged = nltk.pos_tag(tokenizer.tokenize(text))
    for token, pos in map(get_wordnet_pos, tagged):
        # Normalise once: lower-case, then trim leading/trailing punctuation.
        word = token.lower().strip(string.punctuation)
        if word not in stopwords:
            lemmas.append(lemmatizer.lemmatize(word, pos))
    return lemmas


def is_ci_token_stopword_lemma_match(a, b):
    """Check whether strings a and b match after normalisation.

    Each string is tokenized, part-of-speech tagged with nltk.pos_tag, and
    every token is lower-cased and stripped of surrounding punctuation.
    Tokens found in the module-level stopwords list are dropped, and the
    remaining tokens are lemmatized using their WordNet part of speech.

    Returns True when both strings yield the same lemmas in the same order
    (the comparison is a list equality, so order and repetition matter),
    and False otherwise.
    """
    return _content_lemmas(a) == _content_lemmas(b)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment