Skip to content

Instantly share code, notes, and snippets.

@timgianitsos
Last active April 8, 2022 07:37
Show Gist options
  • Save timgianitsos/aa60a850012b0b64a45aecf2f316609f to your computer and use it in GitHub Desktop.
Save timgianitsos/aa60a850012b0b64a45aecf2f316609f to your computer and use it in GitHub Desktop.
Input a file to see how closely word frequencies correspond to the Zipf distribution
from collections import Counter
from string import punctuation
from os.path import split
import sys
__author__ = 'Tim Gianitsos'
def main(filename=None, num_display=30):
    """Report how closely a text's word frequencies follow a Zipf distribution.

    The file is split on whitespace; each token is lower-cased and stripped of
    ASCII punctuation and curly quotes, and only tokens that are purely
    alphabetic afterwards are counted. The frequency of each of the most
    common words is printed as a percentage of the most frequent word's
    count. When matplotlib can be imported, the same ratios are plotted next
    to the curve 1/rank.

    Args:
        filename: Path of the text file to analyse (read as UTF-8).
        num_display: Maximum number of top words to show. Anything accepted
            by ``int()`` works, so the string taken from the command line can
            be passed through unchanged.

    Returns:
        None. The report goes to stdout; a notice goes to stderr when there
        is nothing to display.
    """
    # str.replace() would search for this whole run of characters as a single
    # substring (and never find it); translate() deletes each one separately.
    strip_table = str.maketrans('', '', '“‘”’' + punctuation)
    with open(filename, encoding='utf-8') as text_file:
        tokens = text_file.read().split()
    c = Counter(
        word
        for token in tokens
        if (word := token.lower().translate(strip_table)).isalpha()
    )
    mc = c.most_common()
    # Clamp to [0, number of distinct words] so the slice below and the rank
    # axis of the plot always have the same length.
    num_display = max(0, min(int(num_display), len(mc)))
    mc = mc[:num_display]
    if not mc:
        # Nothing to normalise against: bail out before indexing mc[0].
        print(f'No words to display for {filename}', file=sys.stderr)
        return
    first_freq = mc[0][1]
    title = f'Frequencies of top {num_display} words as a ratio of most frequent word'
    print(title)
    print('\n'.join(f'{t[0]:13s} {t[1] / first_freq * 100:.2f}%' for t in mc))
    try:
        import matplotlib.pyplot as plt
        # Newer matplotlib releases renamed the bundled 'seaborn' style to
        # 'seaborn-v0_8'; use whichever name this installation provides.
        for style in ('seaborn-v0_8', 'seaborn'):
            if style in plt.style.available:
                plt.style.use(style)
                break
        ranks = range(1, num_display + 1)
        plt.gca().set_xticks(ranks)
        plt.gca().set_xticklabels([t[0] for t in mc], rotation=60)
        plt.plot(ranks, [1 / i for i in ranks], label='Unnormalized Zipf distribution')
        plt.plot(ranks, [t[1] / first_freq for t in mc], label=split(filename)[1])
        plt.legend()
        plt.title(title)
        plt.show()
    except ModuleNotFoundError:
        # Plotting is optional; the printed table above is the fallback.
        pass
if __name__ == '__main__':
    # Everything after the script name is forwarded to main() positionally:
    # the text file first, then an optional number of words to display.
    cli_args = sys.argv[1:]
    if cli_args:
        main(*cli_args)
    else:
        print(f'usage: python3.8 {__file__} name_of_text.txt', file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment