Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

# Learn a bag-of-words vocabulary from a tiny corpus with scikit-learn.
from sklearn.feature_extraction.text import CountVectorizer

# Corpus: a plain list of raw text documents.
text = ["this is test doc", "this is another test doc"]

# fit() tokenizes the corpus and builds the vocabulary; it returns the
# fitted transformer itself, so construction and fitting chain into one step.
vector = CountVectorizer().fit(text)
@sharma-ji
sharma-ji / gist:2b843c261df8b10d5b57c7261effce26
Created July 16, 2018 06:17
Pipeline for text cleaning
# Text-cleaning pipeline: tokenize, lowercase, and build a punctuation stripper.
import string

from nltk.tokenize import word_tokenize

# Tokenize the input and normalize every token to lower case in one pass.
# NOTE(review): word_tokenize expects a single string — confirm that `text`
# (defined by the producing snippet) is a string here, not a list of docs.
tokens = [w.lower() for w in word_tokenize(text)]

# Translation table that deletes every ASCII punctuation character;
# intended for use with str.translate() on each token.
table = str.maketrans('', '', string.punctuation)