riceluxs1t/heap_sample.py

## heap_sample.py
# -*- coding:utf-8 -*-
import random
from collections import defaultdict
def weighted_choice(choices, n):
    """
    n개의 에드네트워크 정보를 픽한다. 단 이미 선택된 에드네트워크는 고려 대상에서 제외된다 (sample without replacement).
    힙 자료구조를 이용한다. O(nlogn)

    :param choices: 에드네트워크 초이스 정보가 담긴 리스트. 각 초이스정보는 딕셔너리. 딕셔너리의 키는 weight, adnetwork id, api key.
    :param n: 픽 갯수.
    :return: n개의 에드네트워크 픽정보.

    """

    heap = make_heap(choices)
    res = []

    # pop_heap을 통해 n개의 에드네트워크를 순차적으로 샘플.
    for _ in xrange(n):
        res.append(pop_heap(heap))

    if len(res) < n:
        raise Exception("Shouldn't get here")
    return res

def make_heap(choices):
    """
        가중치에 비례해서 에드네트워크를 n개 픽하는데 사용 될 heap 자료구조를 만든다. heap은 배열로 엘레강스하게 표현 될 수 있다.
        heap의 index 속성.
        i번쨰 노드의 왼쪽 자식은 2i, 오른쪽 자식은 2i+1. 부모는 i/2
        이 속성이 성립하기 위해선 인덱스가 1부터 카운트 되야한다.

        heap의 각 노드는 @adnetwork_choice 클래스 객체를 사용한다.
        w : 가중치
        v:  픽정보 (딕셔너리)
        tw: 현재 노드를 기준으로 하위트리에 속한 노드(현재 노드 포함)들의 총 가중치합.

    :param choices: 에드네트워크 정보가 담긴 리스트. 각 정보 객체의 딕셔너리의 키는 weight, adnetwork id, api key.
    :return: 위 로직에 의해 만들어진 heap구조.

    """

    # 가독성을 위한 더미 클래스. (a,b,c) 형태의 tuple로 생각해도 무방하다.
    class adnetwork_choice:
        def __init__(self, w, v, tw):
            self.w, self.v, self.tw = w, v, tw

    # 0번째 인덱스에 더미노드를 추가한다.
    # h[i<<1] = 왼쪽 자식, h[(i<<1) + 1] = 오른쪽 자식
    h = [None]

    # build the heap
    for choice in choices:
        h.append(adnetwork_choice(choice['weight'], choice, choice['weight']))

    # leaf node에서 부터 올라가면서 가중치의 합을 구한다.
    for i in range(len(h) -1, 1, -1):

        # 부모 노드에 자식 노드의 tw을 더해 준다.
        h[i>>1].tw += h[i].tw
    for e in h[1:]:
        print e.v, e.tw, e.w
    return h
def pop_heap(h):
    """
        힙구조를 쓰지 않는다면 가중치에 비례해서 에드네트워크 하나를 픽하는 과정은 다음과 같다.
        1) [0 ~ 총 웨이트의 합] 구간의 숫자하나, r, 를 랜덤(uniformly random)하게 고른다.
        2) 가중치 배열을 돌면서 가중치의 누적합을 구한다. 예) weights = [2, 5, 7]. cumsum_weights = [2, 7, 14]
        3) 편의상 cumsum_weights[-1] = -infinity, cumsum_weights[n] = +infinity로 가정한다.
        4) cumsum_weights[i-1] < r <= cumsum_weights[i]인 i번째 에드네트워크를 리턴한다.

        힙구조를 쓰면 4)단계를 O(logn) 시간에 수행 할 수 있다. binary search를 해도 O(logn)에 수행 할 수 있지만, sample without
        replacement 이기 때문에 하나의 에드네트워크를 선택한 다음에는 불변식 생각하기가 까다롭다.

    :param h: heap
    :return: 에드네트워크 정보가 담긴 딕셔너리. 딕셔너리의 키는 weight, adnetwork id, api key.

    """

    # [0 ~ 총 웨이트의 합] 구간의 숫자하나를 랜덤하게 고른다.
    r = h[1].tw * random.random()

    # 1 = root. 1번 노드부터 시작해서 4)단계를 실행한다.
    i = 1
    while r > h[i].w:
        r -= h[i].w
        # 왼쪽 자식으로 내려가본다.
        i = i << 1
        #  왼쪽 자식으로 내려갔는데 하위 트리의 웨이트 합이 여전히 r보다 작다면
        #  오른쪽 자식으로 내려간다.
        if r > h[i].tw:
            r -= h[i].tw
            i += 1 #오른쪽 자식으로 이동.

    # 픽된 애의 가중치를 0 으로 조정 함으로 고려 대상에서 제외한다.
    adn_weight = h[i].w
    adn_picked = h[i].v
    h[i].w = 0
    # 픽된 노드에서 트리를 다시 올라가면서 가중치를 조정한다.
    while i:
        h[i].tw -= adn_weight
        i = i >> 1 # 부모로 이동. i/2 = 부모
    return adn_picked

def test():
    choices = [{'weight':10, 'name':'A'}, {'weight':20, 'name':'B'}, {'weight':30, 'name':'C'}]
    first_count = defaultdict(int)
    second_count = defaultdict(int)
    third_count = defaultdict(int)
    for _ in range(100000):
        res = weighted_choice(choices, len(choices))
        first_count[res[0]['name']] += 1
        second_count[res[1]['name']] += 1
        third_count[res[2]['name']] += 1

    print first_count
    print second_count
    print third_count
	# -- coding:utf-8 --
	import random
	from collections import defaultdict
	def weighted_choice(choices, n):
	"""
	n개의 에드네트워크 정보를 픽한다. 단 이미 선택된 에드네트워크는 고려 대상에서 제외된다 (sample without replacement).
	힙 자료구조를 이용한다. O(nlogn)

	:param choices: 에드네트워크 초이스 정보가 담긴 리스트. 각 초이스정보는 딕셔너리. 딕셔너리의 키는 weight, adnetwork id, api key.
	:param n: 픽 갯수.
	:return: n개의 에드네트워크 픽정보.

	"""

	heap = make_heap(choices)
	res = []

	# pop_heap을 통해 n개의 에드네트워크를 순차적으로 샘플.
	for _ in xrange(n):
	res.append(pop_heap(heap))

	if len(res) < n:
	raise Exception("Shouldn't get here")
	return res

	def make_heap(choices):
	"""
	가중치에 비례해서 에드네트워크를 n개 픽하는데 사용 될 heap 자료구조를 만든다. heap은 배열로 엘레강스하게 표현 될 수 있다.
	heap의 index 속성.
	i번쨰 노드의 왼쪽 자식은 2i, 오른쪽 자식은 2i+1. 부모는 i/2
	이 속성이 성립하기 위해선 인덱스가 1부터 카운트 되야한다.

	heap의 각 노드는 @adnetwork_choice 클래스 객체를 사용한다.
	w : 가중치
	v: 픽정보 (딕셔너리)
	tw: 현재 노드를 기준으로 하위트리에 속한 노드(현재 노드 포함)들의 총 가중치합.

	:param choices: 에드네트워크 정보가 담긴 리스트. 각 정보 객체의 딕셔너리의 키는 weight, adnetwork id, api key.
	:return: 위 로직에 의해 만들어진 heap구조.

	"""

	# 가독성을 위한 더미 클래스. (a,b,c) 형태의 tuple로 생각해도 무방하다.
	class adnetwork_choice:
	def __init__(self, w, v, tw):
	self.w, self.v, self.tw = w, v, tw

	# 0번째 인덱스에 더미노드를 추가한다.
	# h[i<<1] = 왼쪽 자식, h[(i<<1) + 1] = 오른쪽 자식
	h = [None]

	# build the heap
	for choice in choices:
	h.append(adnetwork_choice(choice['weight'], choice, choice['weight']))

	# leaf node에서 부터 올라가면서 가중치의 합을 구한다.
	for i in range(len(h) -1, 1, -1):

	# 부모 노드에 자식 노드의 tw을 더해 준다.
	h[i>>1].tw += h[i].tw
	for e in h[1:]:
	print e.v, e.tw, e.w
	return h
	def pop_heap(h):
	"""
	힙구조를 쓰지 않는다면 가중치에 비례해서 에드네트워크 하나를 픽하는 과정은 다음과 같다.
	1) [0 ~ 총 웨이트의 합] 구간의 숫자하나, r, 를 랜덤(uniformly random)하게 고른다.
	2) 가중치 배열을 돌면서 가중치의 누적합을 구한다. 예) weights = [2, 5, 7]. cumsum_weights = [2, 7, 14]
	3) 편의상 cumsum_weights[-1] = -infinity, cumsum_weights[n] = +infinity로 가정한다.
	4) cumsum_weights[i-1] < r <= cumsum_weights[i]인 i번째 에드네트워크를 리턴한다.

	힙구조를 쓰면 4)단계를 O(logn) 시간에 수행 할 수 있다. binary search를 해도 O(logn)에 수행 할 수 있지만, sample without
	replacement 이기 때문에 하나의 에드네트워크를 선택한 다음에는 불변식 생각하기가 까다롭다.

	:param h: heap
	:return: 에드네트워크 정보가 담긴 딕셔너리. 딕셔너리의 키는 weight, adnetwork id, api key.

	"""

	# [0 ~ 총 웨이트의 합] 구간의 숫자하나를 랜덤하게 고른다.
	r = h[1].tw * random.random()

	# 1 = root. 1번 노드부터 시작해서 4)단계를 실행한다.
	i = 1
	while r > h[i].w:
	r -= h[i].w
	# 왼쪽 자식으로 내려가본다.
	i = i << 1
	# 왼쪽 자식으로 내려갔는데 하위 트리의 웨이트 합이 여전히 r보다 작다면
	# 오른쪽 자식으로 내려간다.
	if r > h[i].tw:
	r -= h[i].tw
	i += 1 #오른쪽 자식으로 이동.

	# 픽된 애의 가중치를 0 으로 조정 함으로 고려 대상에서 제외한다.
	adn_weight = h[i].w
	adn_picked = h[i].v
	h[i].w = 0
	# 픽된 노드에서 트리를 다시 올라가면서 가중치를 조정한다.
	while i:
	h[i].tw -= adn_weight
	i = i >> 1 # 부모로 이동. i/2 = 부모
	return adn_picked

	def test():
	choices = [{'weight':10, 'name':'A'}, {'weight':20, 'name':'B'}, {'weight':30, 'name':'C'}]
	first_count = defaultdict(int)
	second_count = defaultdict(int)
	third_count = defaultdict(int)
	for _ in range(100000):
	res = weighted_choice(choices, len(choices))
	first_count[res[0]['name']] += 1
	second_count[res[1]['name']] += 1
	third_count[res[2]['name']] += 1

	print first_count
	print second_count
	print third_count