# DanielOX/NLP_Feature_Extraction_SKLearn.py

## NLP_Feature_Extraction_SKLearn.py
from sklearn.feature_extraction.text import CountVectorizer

# Sample text taken from https://en.wikipedia.org/wiki/Baseball, stored as a
# single string. It is written as one literal per sentence; Python joins
# adjacent string literals, so the value contains no line breaks.

corpus = (
    "Baseball is a bat-and-ball game played between two opposing teams who take turns batting and fielding. "
    "The game proceeds when a player on the fielding team, called the pitcher, throws a ball which a player on the batting team tries to hit with a bat. "
    'The objective of the offensive team (batting team) is to hit the ball into the field of play, allowing its players to run the bases, having them advance counter-clockwise around four bases to score what are called "runs". '
    "The objective of the defensive team (fielding team) is to prevent batters from becoming runners, and to prevent runners' advance around the bases.[2] "
    "A run is scored when a runner legally advances around the bases in order and touches home plate (the place where the player started as a batter). "
    "The team that scores the most runs by the end of the game is the winner."
)

# Split the corpus into a list of sentences: CountVectorizer expects an iterable of documents, and here each sentence is treated as one document

def sent_tokenize(corpus):
    """Split raw text into a list of sentence strings.

    Thin wrapper around ``nltk.sent_tokenize``.

    Args:
        corpus: The text to split, passed as a single string.

    Returns:
        A new list holding the sentences produced by
        ``nltk.sent_tokenize``, in their original order.
    """
    # nltk is imported here because it is used below but not imported at the
    # top of the file; without this import the call raises NameError.
    # NOTE(review): NLTK's sentence tokenizer presumably needs its Punkt
    # model data installed -- confirm against the NLTK documentation.
    import nltk

    return list(nltk.sent_tokenize(corpus))

# Replace the raw string with its list of sentences; each sentence becomes one
# document (one row) of the count matrix built below.

corpus = sent_tokenize(corpus)

# Create a CountVectorizer with its default settings, which already include
# basic preprocessing and tokenization of the input text.

vectorizer = CountVectorizer()

# Learn the vocabulary from the sentences and count each vocabulary term per
# sentence in a single step.

features = vectorizer.fit_transform(corpus)

# `features` is a SciPy sparse matrix of term counts, so it must be converted
# to a dense form before the full table can be printed. toarray() is a method
# of the sparse matrix (not of NumPy) and returns a regular ndarray; it is used
# instead of todense(), which returns the legacy numpy.matrix type.

print(features.toarray())
	from sklearn.feature_extraction.text import CountVectorizer

	# Sample text taken from https://en.wikipedia.org/wiki/Baseball, stored as a
	# single string. It is written as one literal per sentence; Python joins
	# adjacent string literals, so the value contains no line breaks.

	corpus = (
		"Baseball is a bat-and-ball game played between two opposing teams who take turns batting and fielding. "
		"The game proceeds when a player on the fielding team, called the pitcher, throws a ball which a player on the batting team tries to hit with a bat. "
		'The objective of the offensive team (batting team) is to hit the ball into the field of play, allowing its players to run the bases, having them advance counter-clockwise around four bases to score what are called "runs". '
		"The objective of the defensive team (fielding team) is to prevent batters from becoming runners, and to prevent runners' advance around the bases.[2] "
		"A run is scored when a runner legally advances around the bases in order and touches home plate (the place where the player started as a batter). "
		"The team that scores the most runs by the end of the game is the winner."
	)

	# Split the corpus into a list of sentences: CountVectorizer expects an iterable of documents, and here each sentence is treated as one document

	def sent_tokenize(corpus):
		"""Split raw text into a list of sentence strings.

		Thin wrapper around ``nltk.sent_tokenize``.

		Args:
			corpus: The text to split, passed as a single string.

		Returns:
			A new list holding the sentences produced by
			``nltk.sent_tokenize``, in their original order.
		"""
		# nltk is imported here because it is used below but not imported at
		# the top of the file; without this import the call raises NameError.
		# NOTE(review): NLTK's sentence tokenizer presumably needs its Punkt
		# model data installed -- confirm against the NLTK documentation.
		import nltk

		return list(nltk.sent_tokenize(corpus))

	# Replace the raw string with its list of sentences; each sentence becomes one
	# document (one row) of the count matrix built below.

	corpus = sent_tokenize(corpus)

	# Create a CountVectorizer with its default settings, which already include
	# basic preprocessing and tokenization of the input text.

	vectorizer = CountVectorizer()

	# Learn the vocabulary from the sentences and count each vocabulary term per
	# sentence in a single step.

	features = vectorizer.fit_transform(corpus)

	# `features` is a SciPy sparse matrix of term counts, so it must be converted
	# to a dense form before the full table can be printed. toarray() is a method
	# of the sparse matrix (not of NumPy) and returns a regular ndarray; it is used
	# instead of todense(), which returns the legacy numpy.matrix type.

	print(features.toarray())