Skip to content

Instantly share code, notes, and snippets.

@ConstantineLignos
Created March 22, 2013 20:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ConstantineLignos/5224537 to your computer and use it in GitHub Desktop.
Save ConstantineLignos/5224537 to your computer and use it in GitHub Desktop.
Sample items (i.e., words) from a list of items based on their frequencies. Assumes the input file contains lines of the form "<count> <item>".
#!/usr/bin/env python
"""Randomly sample items from a list of items with their frequencies."""
import sys
import random
from operator import itemgetter
def parse_countword_line(line):
"""Parse a line of the form '<count> <word>' into a tuple (word, count)."""
count, word = line.split()
return (word, int(count))
def load_word_counts(path):
"""Load a wordlist file into a dictionary of form {word: count}."""
with open(path, 'rU') as wordlist:
return dict(parse_countword_line(line) for line in wordlist)
def counts_to_freqs(countdict):
"""Convert a dictionary with counts as values to frequencies."""
# Convert total to float so dividing by it will produce a float
total = float(sum(countdict.itervalues()))
# Convert counts to frequencies
return {word: (count / total) for word, count in countdict.iteritems()}
def sample_n_keys(freq_dict, numkeys):
"""Sample numkeys keys from freq_dict, whose values are frequencies.
Uses Shannon-Miller-Selfridge sampling. It may be possible to speed
up sampling by using bisection instead of summing, but this is fast
enough."""
# Sort into word and freq lists. Sorting speeds up sampling by
# putting frequent items first.
words, freqs = zip(*sorted(freq_dict.items(), key=itemgetter(1),
reverse=True))
# Sampling loop, run numkeys times
for _ in range(numkeys):
# Draw a random number and sum frequencies until we hit it
rand = random.random()
freq_sum = 0.0
for idx, freq in enumerate(freqs):
freq_sum += freq
if freq_sum > rand:
yield words[idx]
# After yielding, return to the sampling loop
break
def main():
"""Parse arguments, get the frequencies, and print the samples."""
try:
wordlist_path = sys.argv[1]
n_samples = int(sys.argv[2])
except IndexError:
print "Usage: freqsampler wordlist nsamples"
sys.exit(1)
word_freqs = counts_to_freqs(load_word_counts(wordlist_path))
for key in sample_n_keys(word_freqs, n_samples):
print key
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment