jeroenjanssens/topwords.R

## topwords.py
#!/usr/bin/env python
import re
import sys
from collections import Counter
def top_words(text, num_words):
    """Return the most frequent words in `text`.

    The text is lower-cased and a word is a maximal run of word
    characters (letters, digits, underscore).

    Args:
        text: The input text.
        num_words: Maximum number of (word, count) pairs to return.

    Returns:
        A list of (word, count) tuples, most frequent first.
    """
    # findall (rather than splitting on non-word runs) never produces the
    # empty-string token that split emits when the text starts or ends with
    # a separator, e.g. the trailing newline of almost any input.
    words = re.findall(r'\w+', text.lower())
    return Counter(words).most_common(num_words)


if __name__ == "__main__":
    # Usage: topwords.py NUM_WORDS < text
    for word, count in top_words(sys.stdin.read(), int(sys.argv[1])):
        # A parenthesised single-argument print works on Python 2 and 3.
        print("%8d %s" % (count, word))

## topwords.R
#!/usr/bin/env Rscript
# Print the N most frequent words read from standard input, one per line as
# "<count> <word>", where N is the first command-line argument.
num.words <- as.integer(commandArgs(trailingOnly = TRUE)[1])
if (is.na(num.words)) {
  stop("usage: topwords.R NUM_WORDS < text", call. = FALSE)
}

con <- file("stdin")
input.lines <- readLines(con)
close(con)

# Lower-case the whole input and extract every run of word characters.
full.text <- tolower(paste(input.lines, collapse = " "))
words.all <- regmatches(full.text, gregexpr("\\w+", full.text))[[1]]

# table() tallies each distinct word; work with plain vectors afterwards so
# no column lookup depends on `$` partial matching.
word.counts <- table(words.all)
freq <- as.integer(word.counts)
word <- names(word.counts)

# order() keeps the table's own ordering among words with equal counts.
top <- head(order(-freq), num.words)

# sprintf() is vectorised, so a single cat() writes all the lines.
cat(sprintf("%8d %s\n", freq[top], word[top]), sep = "")

## topwords.sh
#!/usr/bin/env bash
# Print the NUM_WORDS most frequent words read from standard input.
# Usage: topwords.sh NUM_WORDS < text
# Abort with a usage message when the count argument is missing or empty.
NUM_WORDS="${1:?usage: topwords.sh NUM_WORDS < text}"
# Lower-case the input, emit one word per line, count duplicates, then rank
# by count (descending) and keep the first NUM_WORDS lines.
tr '[:upper:]' '[:lower:]' |
grep -oE '\w+' |
sort |
uniq -c |
sort -nr |
head -n "$NUM_WORDS"
	#!/usr/bin/env python
	import re
	import sys
	from collections import Counter
	def top_words(text, num_words):
	    """Return up to `num_words` (word, count) tuples, most frequent first.
	    The text is lower-cased; a word is a maximal run of word characters.
	    """
	    # findall never produces the empty-string token that splitting on
	    # non-word runs emits when the text starts or ends with a separator.
	    words = re.findall(r'\w+', text.lower())
	    return Counter(words).most_common(num_words)
	if __name__ == "__main__":
	    # Usage: topwords.py NUM_WORDS < text
	    for word, count in top_words(sys.stdin.read(), int(sys.argv[1])):
	        # A parenthesised single-argument print works on Python 2 and 3.
	        print("%8d %s" % (count, word))
	#!/usr/bin/env Rscript
	# Print the N most frequent words read from standard input, one per line
	# as "<count> <word>", where N is the first command-line argument.
	num.words <- as.integer(commandArgs(trailingOnly = TRUE)[1])
	if (is.na(num.words)) {
	  stop("usage: topwords.R NUM_WORDS < text", call. = FALSE)
	}
	con <- file("stdin")
	input.lines <- readLines(con)
	close(con)
	# Lower-case the whole input and extract every run of word characters.
	full.text <- tolower(paste(input.lines, collapse = " "))
	words.all <- regmatches(full.text, gregexpr("\\w+", full.text))[[1]]
	# table() tallies each distinct word; plain vectors are used afterwards
	# so no column lookup depends on `$` partial matching.
	word.counts <- table(words.all)
	freq <- as.integer(word.counts)
	word <- names(word.counts)
	# order() keeps the table's own ordering among words with equal counts.
	top <- head(order(-freq), num.words)
	# sprintf() is vectorised, so a single cat() writes all the lines.
	cat(sprintf("%8d %s\n", freq[top], word[top]), sep = "")
	#!/usr/bin/env bash
	# Print the NUM_WORDS most frequent words read from standard input.
	# Usage: topwords.sh NUM_WORDS < text
	# Abort with a usage message when the count argument is missing or empty.
	NUM_WORDS="${1:?usage: topwords.sh NUM_WORDS < text}"
	# Lower-case the input, emit one word per line, count duplicates, then
	# rank by count (descending) and keep the first NUM_WORDS lines.
	tr '[:upper:]' '[:lower:]' |
	grep -oE '\w+' |
	sort |
	uniq -c |
	sort -nr |
	head -n "$NUM_WORDS"