# Amber0914/CountVectorizer_example2.py

## CountVectorizer_example2.py
from sklearn.feature_extraction.text import CountVectorizer

# Three short training documents used to build the vocabulary.
train_X = [
    "John likes to watch movies",
    "Mary likes movies too",
    "Joe only likes horror movies and action movies",
]

# Treat every run of word characters as a token; unlike CountVectorizer's
# default pattern, this also keeps single-character words.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

# Learn the vocabulary from the training documents and return the
# document-term count matrix.
train_vector = vectorizer.fit_transform(train_X)

# Vocabulary in column order:
# ['action', 'and', 'horror', 'joe', 'john', 'likes', 'mary', 'movies',
#  'only', 'to', 'too', 'watch']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # get_feature_names_out() returns an ndarray; convert to a plain list so
    # token_set keeps the type the old API returned.
    token_set = vectorizer.get_feature_names_out().tolist()
else:
    token_set = vectorizer.get_feature_names()

# Vectorize an unseen document with the already-fitted vocabulary.
test_X = ["Jay likes romantic movies"]
test_vector = vectorizer.transform(test_X)
print(test_vector)
# Expected stored entries, shown as (row, column) count:
#   (0, 5)	1    -> 'likes'
#   (0, 7)	1    -> 'movies'
# "jay" and "romantic" are not in the learned vocabulary, so they are ignored.
# NOTE: the exact print layout depends on the installed SciPy version.
	from sklearn.feature_extraction.text import CountVectorizer

	# Three short training documents used to build the vocabulary.
	train_X = [
		"John likes to watch movies",
		"Mary likes movies too",
		"Joe only likes horror movies and action movies",
	]

	# Treat every run of word characters as a token; unlike CountVectorizer's
	# default pattern, this also keeps single-character words.
	vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

	# Learn the vocabulary from the training documents and return the
	# document-term count matrix.
	train_vector = vectorizer.fit_transform(train_X)

	# Vocabulary in column order:
	# ['action', 'and', 'horror', 'joe', 'john', 'likes', 'mary', 'movies',
	#  'only', 'to', 'too', 'watch']
	# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
	# so prefer get_feature_names_out() and fall back only on older releases.
	if hasattr(vectorizer, "get_feature_names_out"):
		# get_feature_names_out() returns an ndarray; convert to a plain list so
		# token_set keeps the type the old API returned.
		token_set = vectorizer.get_feature_names_out().tolist()
	else:
		token_set = vectorizer.get_feature_names()

	# Vectorize an unseen document with the already-fitted vocabulary.
	test_X = ["Jay likes romantic movies"]
	test_vector = vectorizer.transform(test_X)
	print(test_vector)
	# Expected stored entries, shown as (row, column) count:
	#   (0, 5) 1    -> 'likes'
	#   (0, 7) 1    -> 'movies'
	# "jay" and "romantic" are not in the learned vocabulary, so they are ignored.
	# NOTE: the exact print layout depends on the installed SciPy version.