@jacKlinc
Created February 24, 2021 12:32
from sklearn.feature_extraction.text import CountVectorizer


def parse_txt(txt_file):
    """
    Take a text file path and return a list with one element per non-empty line in the file.
    """
    with open(txt_file, "r") as f:
        # Read the file and split it into lines, dropping the newline characters
        words = f.read().splitlines()
    # Drop empty lines (empty strings are falsy, so filter(None, ...) removes them)
    return list(filter(None, words))


def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.

    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) ->
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    # Learn the vocabulary and count word occurrences per document
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Sum each word's count across all documents
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    # Sort by frequency, most frequent first, and keep the top n (all words if n is None)
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]


# Example usage: treats each non-empty line of "my_file.txt" as one document
words = parse_txt("my_file.txt")
print(get_top_n_words(words))
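
# A minimal end-to-end sketch (not part of the original gist): it generates a
# tiny sample file so the pipeline above can be exercised; the file name
# "sample.txt" and its contents are illustrative assumptions.
sample_lines = ["the cat sat on the mat", "", "the dog chased the cat"]
with open("sample.txt", "w") as f:
    f.write("\n".join(sample_lines))

lines = parse_txt("sample.txt")  # the blank line is filtered out, leaving two documents
print(get_top_n_words(lines, n=3))
# "the" occurs four times and "cat" twice; the third entry is one of the
# words that occur once, so its exact identity depends on tie order.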