gaulinmp/similarity_example.ipynb

## similarity_example.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              similarity_example.ipynb
            
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## similarity_example.py
# Get Tuple algorithms
import re
import math
import numpy as np
from itertools import chain
from collections import Counter
import nltk
from nltk.util import ngrams # This is the ngram magic.
from textblob import TextBlob

NGRAM = 4

re_sent_ends_naive = re.compile(r'[.\n]')
re_stripper_alpha = re.compile('[^a-zA-Z]+')
re_stripper_naive = re.compile('[^a-zA-Z\.\n]')

splitter_naive = lambda x: re_sent_ends_naive.split(re_stripper_naive.sub(' ', x))

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def get_tuples_nosentences(txt):
    """Get tuples that ignores all punctuation (including sentences)."""
    if not txt: return None
    ng = ngrams(re_stripper_alpha.sub(' ', txt).split(), NGRAM)
    return list(ng)

def get_tuples_manual_sentences(txt):
    """Naive get tuples that uses periods or newlines to denote sentences."""
    if not txt: return None
    sentences = (x.split() for x in splitter_naive(txt) if x)
    ng = (ngrams(x, NGRAM) for x in sentences if len(x) >= NGRAM)
    return list(chain(*ng))

def get_tuples_nltk_punkt_sentences(txt):
    """Get tuples that doesn't use textblob."""
    if not txt: return None
    sentences = (re_stripper_alpha.split(x) for x in sent_detector.tokenize(txt) if x)
    # Need to filter X because of empty 'words' from punctuation split
    ng = (ngrams(filter(None, x), NGRAM) for x in sentences if len(x) >= NGRAM)
    return list(chain(*ng))

def get_tuples_textblob_sentences(txt):
    """New get_tuples that does use textblob."""
    if not txt: return None
    tb = TextBlob(txt)
    ng = (ngrams(x.words, NGRAM) for x in tb.sentences if len(x.words) > NGRAM)
    return [item for sublist in ng for item in sublist]

def jaccard_distance(a, b):
    """Calculate the jaccard distance between sets A and B"""
    a = set(a)
    b = set(b)
    return 1.0 * len(a&b)/len(a|b)

def cosine_similarity_ngrams(a, b):
    vec1 = Counter(a)
    vec2 = Counter(b)

    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    return float(numerator) / denominator


def test():
    paragraph = """It was the best of times, it was the worst of times.
               It was the age of wisdom? It was the age of foolishness!
               I first met Dr. Frankenstein in Munich; his monster was, presumably, at home."""
    print(paragraph)
    _ = get_tuples_nosentences(paragraph);print("Number of N-grams (no sentences):", len(_));_

    _ = get_tuples_manual_sentences(paragraph);print("Number of N-grams (naive sentences):", len(_));_

    _ = get_tuples_nltk_punkt_sentences(paragraph);print("Number of N-grams (nltk sentences):", len(_));_

    _ = get_tuples_textblob_sentences(paragraph);print("Number of N-grams (TextBlob sentences):", len(_));_

    a = get_tuples_nosentences("It was the best of times.")
    b = get_tuples_nosentences("It was the worst of times.")
    print("Jaccard: {}   Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))

    a = get_tuples_nosentences("Above is a bad example of four-gram similarity.")
    b = get_tuples_nosentences("This is a better example of four-gram similarity.")
    print("Jaccard: {}   Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))

    a = get_tuples_nosentences("Jaccard Index ignores repetition repetition repetition repetition repetition.")
    b = get_tuples_nosentences("Cosine similarity weighs repetition repetition repetition repetition repetition.")
    print("Jaccard: {}   Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))
test()
	# Get Tuple algorithms
	import re
	import math
	import numpy as np
	from itertools import chain
	from collections import Counter
	import nltk
	from nltk.util import ngrams # This is the ngram magic.
	from textblob import TextBlob

	NGRAM = 4

	re_sent_ends_naive = re.compile(r'[.\n]')
	re_stripper_alpha = re.compile('[^a-zA-Z]+')
	re_stripper_naive = re.compile('[^a-zA-Z\.\n]')

	splitter_naive = lambda x: re_sent_ends_naive.split(re_stripper_naive.sub(' ', x))

	sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

	def get_tuples_nosentences(txt):
	"""Get tuples that ignores all punctuation (including sentences)."""
	if not txt: return None
	ng = ngrams(re_stripper_alpha.sub(' ', txt).split(), NGRAM)
	return list(ng)

	def get_tuples_manual_sentences(txt):
	"""Naive get tuples that uses periods or newlines to denote sentences."""
	if not txt: return None
	sentences = (x.split() for x in splitter_naive(txt) if x)
	ng = (ngrams(x, NGRAM) for x in sentences if len(x) >= NGRAM)
	return list(chain(*ng))

	def get_tuples_nltk_punkt_sentences(txt):
	"""Get tuples that doesn't use textblob."""
	if not txt: return None
	sentences = (re_stripper_alpha.split(x) for x in sent_detector.tokenize(txt) if x)
	# Need to filter X because of empty 'words' from punctuation split
	ng = (ngrams(filter(None, x), NGRAM) for x in sentences if len(x) >= NGRAM)
	return list(chain(*ng))

	def get_tuples_textblob_sentences(txt):
	"""New get_tuples that does use textblob."""
	if not txt: return None
	tb = TextBlob(txt)
	ng = (ngrams(x.words, NGRAM) for x in tb.sentences if len(x.words) > NGRAM)
	return [item for sublist in ng for item in sublist]

	def jaccard_distance(a, b):
	"""Calculate the jaccard distance between sets A and B"""
	a = set(a)
	b = set(b)
	return 1.0 * len(a&b)/len(a\|b)

	def cosine_similarity_ngrams(a, b):
	vec1 = Counter(a)
	vec2 = Counter(b)

	intersection = set(vec1.keys()) & set(vec2.keys())
	numerator = sum([vec1[x] * vec2[x] for x in intersection])

	sum1 = sum([vec1[x]**2 for x in vec1.keys()])
	sum2 = sum([vec2[x]**2 for x in vec2.keys()])
	denominator = math.sqrt(sum1) * math.sqrt(sum2)

	if not denominator:
	return 0.0
	return float(numerator) / denominator


	def test():
	paragraph = """It was the best of times, it was the worst of times.
	It was the age of wisdom? It was the age of foolishness!
	I first met Dr. Frankenstein in Munich; his monster was, presumably, at home."""
	print(paragraph)
	_ = get_tuples_nosentences(paragraph);print("Number of N-grams (no sentences):", len(_));_

	_ = get_tuples_manual_sentences(paragraph);print("Number of N-grams (naive sentences):", len(_));_

	_ = get_tuples_nltk_punkt_sentences(paragraph);print("Number of N-grams (nltk sentences):", len(_));_

	_ = get_tuples_textblob_sentences(paragraph);print("Number of N-grams (TextBlob sentences):", len(_));_

	a = get_tuples_nosentences("It was the best of times.")
	b = get_tuples_nosentences("It was the worst of times.")
	print("Jaccard: {} Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))

	a = get_tuples_nosentences("Above is a bad example of four-gram similarity.")
	b = get_tuples_nosentences("This is a better example of four-gram similarity.")
	print("Jaccard: {} Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))

	a = get_tuples_nosentences("Jaccard Index ignores repetition repetition repetition repetition repetition.")
	b = get_tuples_nosentences("Cosine similarity weighs repetition repetition repetition repetition repetition.")
	print("Jaccard: {} Cosine: {}".format(jaccard_distance(a,b), cosine_similarity_ngrams(a,b)))
	test()