Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@nmpowell
Last active December 22, 2022 06:23
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save nmpowell/f3faf7496bf9d915579f to your computer and use it in GitHub Desktop.
Save nmpowell/f3faf7496bf9d915579f to your computer and use it in GitHub Desktop.
Count words in a text file, sort by frequency, and generate a histogram of the top N
#!/usr/bin/python
"""Python script to create a histogram of words in a text file.
Usage: python word_frequency.py -f "/path/to/file.txt" -n 200
Specify the path to the text file as above. Manually specify the top N words to report (default 100).
Text file can contain punctuation, new lines, etc., but special characters aren't handled well.
"""
import os
import sys
import string
import argparse
import operator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
__author__ = 'Nick Powell (PhD student, CMIC & CABI, UCL, UK), nicholas.powell.11@ucl.ac.uk'
__version__ = '0.2.20150303'
__created__ = '2014-12-18, Thursday'
def _count_words(text):
    """Return ``(word, count)`` pairs for *text*, most frequent first.

    The text is lower-cased, commas are treated as word separators, every
    other ASCII punctuation character is deleted, and the remainder is split
    on whitespace.  Words with equal counts keep their first-seen order.
    """
    punct = set(string.punctuation)
    # Commas become spaces (so "a,b" yields two words); all other punctuation
    # is removed outright (so "don't" becomes "dont").
    spaced = text.lower().replace(',', ' ')
    cleaned = ''.join(ch for ch in spaced if ch not in punct)
    return Counter(cleaned.split()).most_common()


def _plot_histogram(words, counts, top_n):
    """Show a bar chart of *counts*, labelled with *words*, titled with *top_n*."""
    indices = np.arange(len(words))
    width = 1
    fig = plt.figure()
    fig.suptitle('Word frequency histogram, top {0}'.format(top_n), fontsize=16)
    plt.xlabel('word', fontsize=12)
    plt.ylabel('count', fontsize=12)
    # Centre the bars on their indices explicitly so the tick labels line up
    # with the bars regardless of matplotlib's default bar alignment.
    plt.bar(indices, counts, width, align='center')
    plt.xticks(indices, list(words), rotation='vertical', fontsize=8)
    plt.show()


def main():
    """Count the words in a text file, report the top N and plot a histogram.

    Command-line arguments:
        -f / --filepath: path to the text file to analyse (required).
        -n / --number:   how many of the most frequent words to report and
                         plot (default 100).

    Prints a summary and a pandas DataFrame of the top N words to stdout,
    then opens a matplotlib window with the histogram.  If the file contains
    no words, a message is printed and nothing is plotted.
    """
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-f', '--filepath', dest='filepath', metavar='file path', help='Path to text input file to be analysed.', required=True)
    parser.add_argument('-n', '--number', dest='number', metavar='number', help='Most frequent n words will be displayed and plotted.', required=False, default=100, type=int)
    args = parser.parse_args()

    # Print a histogram containing the top N words, and print them and their counts.
    top_n = args.number

    # Load the file; the context manager guarantees the handle is closed.
    filepath = os.path.normpath(args.filepath)
    with open(filepath, 'r') as text_file:
        text = text_file.read()

    ranked = _count_words(text)
    if not ranked:
        # Nothing to report or plot (empty or punctuation-only file).
        print("No words found in the text file, {0}".format(filepath))
        return

    # Top N
    top = ranked[:top_n]
    words_sorted_top = tuple(word for word, _ in top)
    values_sorted_top = tuple(count for _, count in top)

    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    print("{0} unique words identified in the text file, {1}".format(len(ranked), filepath))
    print("The top {0} words are: \n{1}".format(top_n, words_sorted_top))
    print("... their respective frequencies: \n{0}".format(values_sorted_top))
    print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")

    # Pandas DataFrame just for visualisation
    df = pd.DataFrame({'count': values_sorted_top, 'word': words_sorted_top})
    print("{0}".format(df))
    sys.stdout.flush()

    # Histogram
    _plot_histogram(words_sorted_top, values_sorted_top, top_n)


if __name__ == '__main__':
    main()
# End
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment