Skip to content

Instantly share code, notes, and snippets.

@fnielsen
Last active November 6, 2022 12:32
Show Gist options
  • Save fnielsen/7102991 to your computer and use it in GitHub Desktop.
Save fnielsen/7102991 to your computer and use it in GitHub Desktop.
Zipf plot from word counts in the Brown corpus
from __future__ import division
from itertools import *
from pylab import *
from nltk.corpus import brown
from string import lower
from collections import Counter
# The data: token counts from the Brown corpus
tokens_with_count = Counter(imap(lower, brown.words()))
counts = array(tokens_with_count.values())
tokens = tokens_with_count.keys()
# A Zipf plot
ranks = arange(1, len(counts)+1)
indices = argsort(-counts)
frequencies = counts[indices]
loglog(ranks, frequencies, marker=".")
title("Zipf plot for Brown corpus tokens")
xlabel("Frequency rank of token")
ylabel("Absolute frequency of token")
grid(True)
for n in list(logspace(-0.5, log10(len(counts)), 20).astype(int)):
dummy = text(ranks[n], frequencies[n], " " + tokens[indices[n]],
verticalalignment="bottom",
horizontalalignment="left")
show()
@Riyasharma-in
Copy link

The line `from string import lower` raises an error.

@fnielsen
Copy link
Author

@Riyasharma-in `string.lower` (like `itertools.imap`) was removed in Python 3. Try `tokens_with_count = Counter(map(str.lower, brown.words()))` instead.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment