# timgianitsos/zipf.py

## zipf.py
from collections import Counter
from string import punctuation
from os.path import split
import sys

__author__ = 'Tim Gianitsos'

def main(filename=None, num_display=30):
	"""Print, and plot when matplotlib is available, a text's top word frequencies against Zipf's law.

	Every whitespace-separated token is lower-cased and has its punctuation
	characters (ASCII punctuation plus the curly quotes “ ‘ ” ’) removed; only
	tokens that are purely alphabetic afterwards are counted. The frequency of
	each of the most common words is printed as a percentage of the most
	frequent word's count, then plotted next to the curve 1/rank.

	Args:
		filename: Path of the text file to analyse.
		num_display: Maximum number of words to show. Converted with ``int()``,
			so a numeric string (e.g. from the command line) is accepted.

	Returns:
		None. Results go to stdout and, optionally, to a matplotlib window.

	Raises:
		OSError: If ``filename`` cannot be opened.
		ValueError: If ``num_display`` cannot be converted to an integer.
	"""
	# str.replace would only remove the whole multi-character string at once,
	# so delete the characters one by one with a translation table instead.
	strip_table = str.maketrans('', '', '“‘”’' + punctuation)
	with open(filename) as f:
		tokens = f.read().split()
	c = Counter(s for t in tokens if (s := t.lower().translate(strip_table)).isalpha())
	mc = c.most_common()
	# Clamp to [0, number of distinct words] so slicing and the title stay consistent.
	num_display = max(0, min(int(num_display), len(mc)))
	mc = mc[:num_display]
	if not mc:
		# Nothing was counted (or nothing was requested): there is no top frequency to divide by.
		print('nothing to display: no words were counted or num_display is less than 1', file=sys.stderr)
		return
	first_freq = mc[0][1]
	title = f'Frequencies of top {num_display} words as a ratio of most frequent word'
	print(title)
	print('\n'.join(f'{t[0]:13s} {t[1] / first_freq * 100:.2f}%' for t in mc))
	try:
		import matplotlib.pyplot as plt
		# Newer matplotlib releases ship the seaborn style as 'seaborn-v0_8';
		# use whichever name this installation provides.
		style = next((s for s in ('seaborn', 'seaborn-v0_8') if s in plt.style.available), None)
		if style is not None:
			plt.style.use(style)
		ranks = range(1, num_display + 1)
		ax = plt.gca()
		ax.set_xticks(ranks)
		ax.set_xticklabels([t[0] for t in mc], rotation=60)
		plt.plot(ranks, [1 / i for i in ranks], label='Unnormalized Zipf distribution')
		plt.plot(ranks, [t[1] / first_freq for t in mc], label=split(filename)[1])
		plt.legend()
		plt.title(title)
		plt.show()
	except ModuleNotFoundError:
		# Plotting is optional; the textual report above has already been printed.
		print('matplotlib is not installed; skipping the plot', file=sys.stderr)

# Script entry point. Usage: <script> name_of_text.txt [num_display]
if __name__ == '__main__':
	if len(sys.argv) <= 1:
		# No input file given: show how to invoke the script.
		print(f'usage: python3.8 {__file__} name_of_text.txt', file=sys.stderr)
	else:
		# Forward the command-line arguments positionally:
		# argv[1] -> filename, argv[2] (if present) -> num_display.
		main(*sys.argv[1:])