kyleziegler/shingling_jaccard.py

## shingling_jaccard.py
# Shared word-level tokenizer used by shingling_jaccard_similarity below.
# NOTE(review): no `import nltk` is visible in this file -- confirm the module
# is imported before this line runs.
tokenizer = nltk.tokenize.casual.TweetTokenizer()

def jaccard_similarity(list_x, list_y):
    """Return the Jaccard similarity of two collections of hashable items.

    Both inputs are reduced to sets, so repeated items count once. The score
    is the size of the intersection divided by the size of the union.
    Returns 0 when both inputs are empty, which avoids a division by zero.
    """
    unique_x, unique_y = set(list_x), set(list_y)
    shared = unique_x & unique_y
    combined = unique_x | unique_y

    # An empty union means both inputs were empty: nothing to compare.
    if not combined:
        return 0

    return len(shared) / len(combined)

def shingling_jaccard_similarity(text_x, text_y, n):
    """Return the Jaccard similarity of two texts compared as token n-grams.

    Each text is split into tokens with the module-level ``tokenizer``, the
    tokens are grouped into n-grams (shingles) of length ``n``, and the two
    shingle lists are scored with ``jaccard_similarity``.
    """
    # Tokenize first: building n-grams straight from the raw strings would
    # shingle individual characters rather than tokens.
    tokens_x = tokenizer.tokenize(text_x)
    tokens_y = tokenizer.tokenize(text_y)

    # Build the length-n shingles for each text.
    shingles_x = nltk.ngrams(list(tokens_x), n)
    shingles_y = nltk.ngrams(list(tokens_y), n)

    return jaccard_similarity(list(shingles_x), list(shingles_y))
	# NOTE(review): this tab-indented line repeats the module-level `tokenizer`
	# assignment near the top of the file and directly follows a `return`;
	# it looks like a duplicated paste -- confirm and remove if so.
	tokenizer = nltk.tokenize.casual.TweetTokenizer()

	def jaccard_similarity(list_x, list_y):
		"""Return the Jaccard similarity of two collections of hashable items.

		Both inputs are converted to sets, so only unique values are compared.
		The result is len(intersection) / len(union), or 0 when the union is
		empty (both inputs empty), which prevents a division by zero.
		"""
		# Convert to sets, capturing only unique values.
		set_x = set(list_x)
		set_y = set(list_y)
		intersection = set_x.intersection(set_y)
		union = set_x.union(set_y)

		# Guard against an empty union before dividing.
		return len(intersection) / len(union) if len(union) > 0 else 0

	def shingling_jaccard_similarity(text_x, text_y, n):
		"""Return the Jaccard similarity of two texts compared as token n-grams.

		Both texts are tokenized with ``tokenizer``, turned into n-grams of
		length ``n``, and the resulting lists are scored with
		``jaccard_similarity``.
		"""
		# Tokenize, then send to ngrams; otherwise the n-grams would be made at
		# the character level instead of the token level.
		x_tok = tokenizer.tokenize(text_x)
		y_tok = tokenizer.tokenize(text_y)

		# Make n-grams of the length n that is passed in.
		x = nltk.ngrams(list(x_tok), n)
		y = nltk.ngrams(list(y_tok), n)

		return jaccard_similarity(list(x), list(y))