Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

# Learn a bag-of-words vocabulary from a tiny corpus with scikit-learn.
from sklearn.feature_extraction.text import CountVectorizer

# Corpus: a plain list of raw text documents.
text = ["this is test doc", "this is another test doc"]

# fit() tokenizes the corpus and builds the vocabulary; it returns the
# fitted transformer itself, so construction and fitting chain into one step.
vector = CountVectorizer().fit(text)
@sharma-ji
sharma-ji / gist:2b843c261df8b10d5b57c7261effce26
Created July 16, 2018 06:17
Pipeline for text cleaning
# Text-cleaning pipeline: tokenize, lowercase, and build a punctuation stripper.
import string

from nltk.tokenize import word_tokenize

# Tokenize the input and normalize every token to lower case in one pass.
# NOTE(review): word_tokenize expects a single string — confirm that `text`
# (defined by the producing snippet) is a string here, not a list of docs.
tokens = [w.lower() for w in word_tokenize(text)]

# Translation table that deletes every ASCII punctuation character;
# intended for use with str.translate() on each token.
table = str.maketrans('', '', string.punctuation)