Created
January 22, 2019 16:52
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Vectorization for data visualization
def vectorization(table):
    """Count how often each token occurs across all of ``table.tweet``.

    A ``CountVectorizer`` is fitted on the tweets, the resulting
    document-term count matrix is summed over documents, and the totals
    are returned alongside the tokens they belong to.

    Args:
        table: Object exposing a ``tweet`` attribute that holds the text
            documents (presumably a pandas DataFrame with a ``tweet``
            column -- confirm against the caller).

    Returns:
        A pandas DataFrame indexed by token, with a single column ``0``
        containing that token's total count over all documents.
    """
    # CountVectorizer converts the documents into a sparse matrix of
    # token counts (one row per document, one column per token).
    vector = CountVectorizer()
    frequency_matrix = vector.fit_transform(table.tweet)

    # Sum the counts of every token over all documents.
    sum_frequencies = np.sum(frequency_matrix, axis=0)

    # Flatten the 1 x n_tokens result to a plain 1-D array. ravel() is used
    # rather than squeeze() so that a vocabulary of a single token still
    # yields a length-1 array instead of a 0-d scalar.
    frequency = np.asarray(sum_frequencies).ravel()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name only on older releases.
    get_names = getattr(vector, "get_feature_names_out", None)
    if get_names is None:
        get_names = vector.get_feature_names
    words = get_names()

    # Build a one-row frame (tokens as columns) and transpose it so that
    # each token becomes a row label with its total in column 0.
    frequency_df = pd.DataFrame([frequency], columns=words).transpose()
    return frequency_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment