Skip to content

Instantly share code, notes, and snippets.

@nithyadurai87
Created December 24, 2018 12:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nithyadurai87/f3fff58ab7272279ef069689fc391dec to your computer and use it in GitHub Desktop.
Save nithyadurai87/f3fff58ab7272279ef069689fc391dec to your computer and use it in GitHub Desktop.
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import euclidean_distances
corpus1 = [{'Gender': 'Male'},{'Gender': 'Female'},{'Gender': 'Transgender'},{'Gender': 'Male'},{'Gender': 'Female'}]
corpus2 = ['Bird is a Peacock Bird','Peacock dances very well','It eats variety of seeds','Cumin seed was eaten by it once']
vectors = [[2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1],
[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0],[0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0]]
# one-hot encoding
v1 = DictVectorizer()
print (v1.fit_transform(corpus1).toarray())
print (v1.vocabulary_)
# bag-of-words (term frequencies, binary frequencies)
v2 = CountVectorizer()
print (v2.fit_transform(corpus2).todense())
print (v2.vocabulary_)
print (TfidfVectorizer().fit_transform(corpus2).todense())
print (HashingVectorizer(n_features=6).transform(corpus2).todense())
print (euclidean_distances([vectors[0]],[vectors[1]]))
print (euclidean_distances([vectors[0]],[vectors[2]]))
print (euclidean_distances([vectors[0]],[vectors[3]]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment