Skip to content

Instantly share code, notes, and snippets.

@trevorc
Created April 19, 2010 03:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save trevorc/370732 to your computer and use it in GitHub Desktop.
Save trevorc/370732 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import collections
import sys
def tokens(f):
    """Yield each whitespace-separated token from an iterable of strings.

    ``f`` is iterated item by item (for a file object, line by line) and
    every item is split with ``str.split()``, so runs of whitespace act as
    a single separator and blank lines contribute no tokens.
    """
    for line in f:
        for t in line.split():
            yield t
def get_ngrams(stream, n):
    """Yield every run of ``n`` consecutive items of ``stream`` as a tuple.

    A sliding window of the most recent ``n`` items is kept in a bounded
    deque; once it is full, one tuple is emitted per incoming item, so a
    stream of ``k >= n`` items produces ``k - n + 1`` n-grams, including
    the final one. A stream with fewer than ``n`` items produces nothing.

    stream -- any iterable of items.
    n      -- window size, passed to ``collections.deque(maxlen=n)``.
    """
    window = collections.deque(maxlen=n)
    for t in stream:
        # Append first, then emit: this way the window that ends on the
        # last item of the stream is yielded too instead of being dropped.
        window.append(t)
        if len(window) == n:
            yield tuple(window)
def with_counts(it):
    """Count the items of ``it`` and return ``(count, item)`` pairs.

    ``it`` is consumed completely before this function returns; the result
    is a generator over one ``(count, item)`` pair per distinct item. Items
    must be hashable. The count comes first so that sorting the pairs
    orders them by frequency.
    """
    # iter() forces element-wise counting even if a mapping is passed;
    # Counter(mapping) would otherwise treat its values as the counts.
    seen = collections.Counter(iter(it))
    return ((count, item) for item, count in seen.items())
def main():
    """Print the bigram counts of the tokens read from standard input.

    One line is written per distinct bigram, formatted as the count
    followed by the two tokens, all separated by single spaces. Lines are
    ordered by descending count; bigrams with equal counts are in
    descending order of the bigram tuple.
    """
    ngrams = get_ngrams(tokens(sys.stdin), 2)
    counts = sorted(with_counts(ngrams), reverse=True)
    for count, ngram in counts:
        # A single pre-formatted argument prints the same text whether
        # print is the Python 2 statement or the Python 3 function.
        print('%s %s' % (count, ' '.join(ngram)))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment