Created
February 24, 2021 12:32
-
-
Save jacKlinc/49dcfc737842dd0c66feec3b74f015fd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import CountVectorizer | |
def parse_txt(txt_file, encoding="utf-8"):
    """
    Read a text file and return its non-empty lines as a list of strings.

    Args:
        txt_file: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        A list with one string per line, line terminators removed. Empty
        lines are dropped; lines containing only whitespace are kept.
    """
    with open(txt_file, "r", encoding=encoding) as f:
        # splitlines() splits on line boundaries and strips the terminators
        lines = f.read().splitlines()
    # Blank lines come back as empty strings; keep only the non-empty ones
    return [line for line in lines if line]
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.

    Tokenising is delegated to a default-configured ``CountVectorizer``, so
    what counts as a "word" (case handling, minimum token length, ...) follows
    that class's defaults.

    Args:
        corpus: Iterable of text documents (strings). It is consumed in a
            single pass, so a one-shot iterable such as a generator works.
        n: Number of (word, count) pairs to return. ``None`` (the default)
            returns every word in the vocabulary.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        The sort is stable, so words with equal counts keep the order in
        which ``vocabulary_`` yields them.

    Example (order among equal counts may differ):

    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) ->
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vec = CountVectorizer()
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass instead of iterating over the corpus twice
    bag_of_words = vec.fit_transform(corpus)
    # Column sums of the document-term matrix: one total per vocabulary entry
    sum_words = bag_of_words.sum(axis=0)
    # int() turns the NumPy scalar into a plain Python int, as in the example above
    words_freq = [
        (word, int(sum_words[0, idx])) for word, idx in vec.vocabulary_.items()
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Example usage: read the lines of "my_file.txt" and print every word with its
# count, most frequent first (n is left at its default, so nothing is cut off).
words = parse_txt("my_file.txt")
top_words = get_top_n_words(words)
print(top_words)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment