jacKlinc/txt_word_counter.py

## txt_word_counter.py
from sklearn.feature_extraction.text import CountVectorizer

def parse_txt(txt_file):
    """
    Read a text file and return its non-empty lines as a list of strings.

    txt_file: path of the file to open (text mode, default encoding).
    Line endings are stripped; lines that are completely empty are dropped,
    while lines containing only whitespace are kept.
    """
    with open(txt_file, "r") as handle:
        contents = handle.read()
    # Keep only the lines that still hold text once the line breaks are gone
    return [line for line in contents.splitlines() if line]

def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.

    corpus: iterable of text documents (strings) handed to CountVectorizer.
    n: number of entries to return; None (the default) returns every word.

    Returns a list of (word, count) tuples sorted by descending count, with
    the counts as plain Python ints. Words with equal counts keep the
    iteration order of the vectorizer's vocabulary, so ties may appear in a
    different order than in the example below.

    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) ->
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vec = CountVectorizer()
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in a single pass; a separate fit() followed by transform() tokenizes the
    # corpus twice and yields all-zero counts for one-shot iterables
    # (e.g. generators), which are already exhausted after fit().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total number of occurrences of each vocabulary term
    sum_words = bag_of_words.sum(axis=0)
    # int() turns the NumPy scalar into a plain int, as the docstring shows
    words_freq = [
        (word, int(sum_words[0, idx])) for word, idx in vec.vocabulary_.items()
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

if __name__ == "__main__":
    # Run the demo only when executed as a script, so that importing this
    # module does not try to read "my_file.txt" or print anything.
    words = parse_txt("my_file.txt")
    print(get_top_n_words(words))
	from sklearn.feature_extraction.text import CountVectorizer

	def parse_txt(txt_file):
	    """
	    Pass text file location and returns a list with one element for each
	    non-empty line in the file
	    """
	    with open(txt_file, "r") as f:
	        # Reads the file and splits it into lines without their line endings
	        words = f.read().splitlines()
	        # Removes empty lines (empty strings are falsy)
	        return list(filter(None, words))

	def get_top_n_words(corpus, n=None):
	    """
	    List the top n words in a vocabulary according to occurrence in a text corpus.

	    corpus: iterable of text documents (strings) handed to CountVectorizer.
	    n: number of entries to return; None (the default) returns every word.

	    Returns a list of (word, count) tuples sorted by descending count, with
	    the counts as plain Python ints. Words with equal counts keep the
	    iteration order of the vectorizer's vocabulary, so ties may appear in a
	    different order than in the example below.

	    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) ->
	    [('python', 2),
	     ('world', 2),
	     ('love', 2),
	     ('hello', 1),
	     ('is', 1),
	     ('programming', 1),
	     ('the', 1),
	     ('language', 1)]
	    """
	    vec = CountVectorizer()
	    # fit_transform learns the vocabulary and builds the document-term matrix
	    # in a single pass; a separate fit() followed by transform() tokenizes the
	    # corpus twice and yields all-zero counts for one-shot iterables
	    # (e.g. generators), which are already exhausted after fit().
	    bag_of_words = vec.fit_transform(corpus)
	    # Column sums give the total number of occurrences of each vocabulary term
	    sum_words = bag_of_words.sum(axis=0)
	    # int() turns the NumPy scalar into a plain int, as the docstring shows
	    words_freq = [
	        (word, int(sum_words[0, idx])) for word, idx in vec.vocabulary_.items()
	    ]
	    words_freq.sort(key=lambda pair: pair[1], reverse=True)
	    return words_freq[:n]

	if __name__ == "__main__":
	    # Run the demo only when executed as a script, so that importing this
	    # module does not try to read "my_file.txt" or print anything.
	    words = parse_txt("my_file.txt")
	    print(get_top_n_words(words))