Created
July 26, 2013 12:54
-
-
Save spacelis/6088623 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
File: stratified.py | |
Author: SpaceLis | |
Email: Wen.Li@tudelft.nl | |
Github: http://github.com/spacelis | |
Description: | |
Sampling in a stratified way, that is, sampling from each subpopulation to | |
make the sample set more representative than simple random sampling. For | |
example, when the population of places is not uniform across categories, we | |
need to ensure that each category has a place sampled and that the number of | |
samples from each category is proportional to its size. | |
""" | |
import numpy as np | |
from itertools import tee, izip, chain | |
from collections import Counter | |
def pairwise(iterable):
    """Yield consecutive overlapping pairs: s -> (s0, s1), (s1, s2), ...

    Written as a plain generator so that it does not rely on
    ``itertools.izip`` (which only exists on Python 2) while staying lazy.

    Args:
        iterable: Any iterable; it is consumed once, lazily.

    Yields:
        Tuples ``(previous, current)`` for each adjacent pair of items.
        Nothing is yielded when the input has fewer than two items.
    """
    iterator = iter(iterable)
    try:
        previous = next(iterator)
    except StopIteration:
        # Empty input: there are no pairs to produce.
        return
    for current in iterator:
        yield previous, current
        previous = current
def int_partition(K, percentages, minimum=1):
    """Split the integer ``K`` into integer parts that follow ``percentages``.

    Every part first receives ``minimum``.  What remains of ``K`` is shared
    out in proportion to ``percentages`` (fractions are truncated), and the
    rounding remainder is then handed out one unit at a time to the part
    that is furthest below its ideal share ``percentages[i] * K``.

    Args:
        K: Total amount to distribute.
        percentages: One fraction per part.  They are presumably meant to
            sum to 1; this is not checked.
        minimum: Amount every part receives before the proportional split.

    Returns:
        A NumPy integer array with one entry per percentage whose sum is
        ``K``.

    Raises:
        AssertionError: If ``K`` is not strictly greater than
            ``minimum * len(percentages)``.
    """
    assert K > minimum * len(percentages), 'K is too small for partitioning'
    # The builtin ``float``/``int`` replace the ``np.float``/``np.int``
    # aliases, which no longer exist in current NumPy releases.
    p = np.array(percentages, dtype=float)
    dist = np.ones(len(p), dtype=int) * minimum
    # Only what is left after the guaranteed minimums is split
    # proportionally; using ``K - len(p)`` here would over-allocate whenever
    # ``minimum`` is greater than 1.
    spare = K - minimum * len(p)
    # Truncate explicitly: adding a float array in place to an integer
    # array is rejected by NumPy's casting rules.
    dist += (spare * p).astype(int)
    left = K - dist.sum()
    while left > 0:
        # Give one unit to the part furthest below its ideal share.
        diff = p * K - dist
        dist[np.argmax(diff)] += 1
        left -= 1
    while left < 0:
        # Only reachable when the percentages sum to more than 1.  Take one
        # unit from the most over-allocated part, never from a part that is
        # already at ``minimum``.
        diff = p * K - dist
        diff[dist <= minimum] = np.inf
        dist[np.argmin(diff)] -= 1
        left += 1
    return dist
def stratified_samples(iterable, percentages, size, prop_sizes=True):
    """Draw a stratified random sample of the distinct items in ``iterable``.

    The distinct items are ranked by how often they occur (most frequent
    first) and the ranked list is cut into ``len(percentages)`` consecutive
    strata; ``percentages`` gives the share of *distinct items* that falls
    into each stratum.  Items are then drawn from every stratum without
    replacement using ``np.random.choice``.

    Args:
        iterable: Hashable items; repeated items raise an item's rank.
        percentages: One fraction per stratum, passed to ``int_partition``.
        size: Total number of items to draw across all strata.
        prop_sizes: When true, the number drawn per stratum follows
            ``percentages``; otherwise ``size`` is split evenly.

    Returns:
        A list with one list of sampled items per stratum, in stratum order.

    Raises:
        AssertionError: Propagated from ``int_partition`` when ``size`` or
            the number of distinct items is too small for the number of
            strata.
        ValueError: Raised by ``np.random.choice`` when a stratum is asked
            for more items than it contains.
    """
    partitions = len(percentages)
    # ``most_common()`` returns (item, count) pairs sorted by descending
    # count and, unlike ``iteritems()``, is not specific to Python 2.
    ranked = Counter(iterable).most_common()
    if prop_sizes:
        samplesize = int_partition(size, percentages)
    else:
        samplesize = int_partition(size, [1. / partitions] * partitions)
    # Cumulative stratum sizes give the boundaries between strata in the
    # ranked list; the leading 0 opens the first stratum.
    pivots = [0] + list(np.cumsum(int_partition(len(ranked), percentages)))
    chosen = []
    for n, (start, stop) in zip(samplesize, pairwise(pivots)):
        picked = np.random.choice(np.arange(start, stop), n, replace=False)
        chosen.append([ranked[i][0] for i in picked])
    return chosen
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment