# tmalsburg/lengths_of_words_brown_corpus.py

## lengths_of_words_brown_corpus.py
#
# Prints summary statistics (mean, standard deviation, median, mode) of
# word lengths in the Brown corpus: once over all tokens and once over
# the distinct (word, tag) types.
import nltk
from statistics import mean, stdev, median, mode

# tagged_words(tagset="universal") converts the corpus' native tags via
# NLTK's universal tagset mapping tables.  Those tables are a separate
# downloadable resource from the corpus itself, so fetch both.
nltk.download('brown')
nltk.download('universal_tagset')

tokens = nltk.corpus.brown.tagged_words(tagset="universal")
# dict.fromkeys keeps one entry per distinct (word, tag) pair, in order
# of first occurrence.
types = list(dict.fromkeys(tokens))

# Lengths of tokens / types but ignoring punctuation, numbers, and X
# which is mostly foreign words (German, French, Latin) but strangely
# also a small number of common English words:
excluded_tags = frozenset(['.', 'NUM', 'X'])
len_tokens = [len(w) for w, t in tokens if t not in excluded_tags]
len_types = [len(w) for w, t in types if t not in excluded_tags]

# Field 0 is the label; fields 1-4 are filled from `summaries` below in
# the same order (mean, stdev, median, mode).
template = """{0}:
  Mean:   {1:.2f} {2:.2f}
  Median: {3:.2f}
  Mode:   {4:.2f}"""

summaries = [mean, stdev, median, mode]
for label, lengths in [("Tokens", len_tokens), ("Types", len_types)]:
    print(template.format(label, *[f(lengths) for f in summaries]))