sajal/magiclist.py

## magiclist.py
from collections import defaultdict
import math

class MagicList():
    """
    An attempt to make a python object which could potentially hold
    unlimited values with finite distinct values.
    All items are not stored in memory, only the count of each distinct item is.
    So if you want to hold say 1 million numbers where each number is between 1 to 5000,
    this should be a better choice than a list.
    If you have [1,2,3,1,1,1,1,1,1,1,1,1,1,1,1,2,3], internally it will be stored as :-
    {1: 13, 2: 2, 3: 2}
    """
    def __init__(self, fromlist=None, fromdict=None):
        """
        if none of fromlist and fromdict is supplied, it makes blank list
        """
        self._realobj = defaultdict(int)
        self.count = 0

        if fromlist:
            self.insert_list(fromlist)
        elif fromdict:
            self.insert_dict(fromdict)

    def append(self, num, val=1):
        """
        Insert a number into magiclist.
        val is is the number of times this current number is to be added
        """
        num = int(num)
        val = int(val)
        self._realobj[num] += val
        self.count += val

    def insert_list(self, fromlist):
        """
        Give it a list type object/iterable, it will add in values to existing MagicList
        """
        for item in fromlist:
            self.append(item)

    def insert_dict(self, fromdict):
        """
        Give it a dict type object and it inserts it to current magiclist
        Useful if u serialize MagicList and wanna unserialize it, or to join 2
        """
        for num in fromdict.keys():
            self.append(num, fromdict[num])

    def as_list(self):
        """
        Get normal list representation of the MagicList
        """
        output = []
        for key in self._realobj:
            for i in range(self._realobj[key]):
                output += [key]
        return output

    def as_dict(self):
        return dict(self._realobj)

    def get_sum(self):
        """
        Sum of all items in magiclist
        """
        total = 0
        for key in self._realobj:
            total += self._realobj[key] * key
        return total

    def get_mean(self, precise=False):
        """
        Mean items in magiclist.
        Returns a float
        """
        if self.count < 1:
            return 0
        elif precise:
            return (self.get_sum() * 1.0) / self.count
        else:
            return int((self.get_sum() * 1.0) / self.count)

    def get_median(self):
        """
        Returns median
        """
        return self.get_percentile(50)

    def get_percentile(self, percentile):
        """
        Gets percentile, use 50 for median...
        Returns a float
        Logic shamelessly stollen from scipy
        """
        #first check rank
        if self.count < 1:
            #Dont ask me why. I like this
            return 0
        else:
            idx = percentile /100. * (self.count - 1)
            if (idx % 1 == 0):
                #simple.. get idxth numner
                return self._get_num_from_idx(int(idx))
            else:
                #do weight
                num1 = self._get_num_from_idx(int(idx))
                num2 = self._get_num_from_idx(int(idx) + 1)
                return int(num1 + (num2 - num1) * (idx % 1))
            return rank

    def _get_num_from_idx(self, idx):
        """
        Returns number at position idx of the supposed sorted list
        """
        numbers = sorted(self._realobj.keys())
        runningcount = 0
        for num in numbers:
            if idx >= runningcount and idx < runningcount + self._realobj[num] :
                return num
            runningcount += self._realobj[num]

    def get_std(self):
        """
        TODO: Return standard deviation
        """
        if self.count < 1:
            return 0
        else:
            mean = self.get_mean(precise=True)
            if mean == 0:
                return 0
            else:
                variance = 0
                for k in self._realobj.keys():
                    variance += (math.pow(k, 2) * self._realobj[k]) / mean
                return int(math.sqrt(variance))


    def get_histogram(self, binwidth=25):
        """
        Return histogram bins
        """
        bins = {
            "binwidth": binwidth
            }
        hist = defaultdict(int)
        for k in self._realobj.keys():
            binkey = str(int(k / binwidth) * binwidth)
            hist[binkey] += self._realobj[k]
        bins["histo"] = dict(hist)
        return bins


    def get_aggrigations(self):
        output = {}
        output["median"] = self.get_median()
        output["mean"] = self.get_mean()
        output["percentiles"] = {}
        for p in [10, 20, 30, 40, 50, 60, 70, 80, 90]:
            output["percentiles"][str(p)] = self.get_percentile(p)
        output["std"] = self.get_std()
        output["histogram"] = self.get_histogram(binwidth=25)
        return output

if __name__ == "__main__":
    from scipy import stats
    input = [5,100,1,2,3,4,2,2,3,4,2]
    print input
    ml = MagicList()
    for item in input:
        ml.append(item)
    print ml.count
    print ml.as_list()
    print ml.get_sum()
    print ml.get_mean()
    print ml.get_percentile(50)
    for i in range(0, 100):
        if stats.scoreatpercentile(input, i) != ml.get_percentile(i):
            raise Exception("WTF!")
    print ml.as_dict()
	from collections import defaultdict
	import math

	class MagicList():
	"""
	An attempt to make a python object which could potentially hold
	unlimited values with finite distinct values.
	All items are not stored in memory, only the count of each distinct item is.
	So if you want to hold say 1 million numbers where each number is between 1 to 5000,
	this should be a better choice than a list.
	If you have [1,2,3,1,1,1,1,1,1,1,1,1,1,1,1,2,3], internally it will be stored as :-
	{1: 13, 2: 2, 3: 2}
	"""
	def __init__(self, fromlist=None, fromdict=None):
	"""
	if none of fromlist and fromdict is supplied, it makes blank list
	"""
	self._realobj = defaultdict(int)
	self.count = 0

	if fromlist:
	self.insert_list(fromlist)
	elif fromdict:
	self.insert_dict(fromdict)

	def append(self, num, val=1):
	"""
	Insert a number into magiclist.
	val is is the number of times this current number is to be added
	"""
	num = int(num)
	val = int(val)
	self._realobj[num] += val
	self.count += val

	def insert_list(self, fromlist):
	"""
	Give it a list type object/iterable, it will add in values to existing MagicList
	"""
	for item in fromlist:
	self.append(item)

	def insert_dict(self, fromdict):
	"""
	Give it a dict type object and it inserts it to current magiclist
	Useful if u serialize MagicList and wanna unserialize it, or to join 2
	"""
	for num in fromdict.keys():
	self.append(num, fromdict[num])

	def as_list(self):
	"""
	Get normal list representation of the MagicList
	"""
	output = []
	for key in self._realobj:
	for i in range(self._realobj[key]):
	output += [key]
	return output

	def as_dict(self):
	return dict(self._realobj)

	def get_sum(self):
	"""
	Sum of all items in magiclist
	"""
	total = 0
	for key in self._realobj:
	total += self._realobj[key] * key
	return total

	def get_mean(self, precise=False):
	"""
	Mean items in magiclist.
	Returns a float
	"""
	if self.count < 1:
	return 0
	elif precise:
	return (self.get_sum() * 1.0) / self.count
	else:
	return int((self.get_sum() * 1.0) / self.count)

	def get_median(self):
	"""
	Returns median
	"""
	return self.get_percentile(50)

	def get_percentile(self, percentile):
	"""
	Gets percentile, use 50 for median...
	Returns a float
	Logic shamelessly stollen from scipy
	"""
	#first check rank
	if self.count < 1:
	#Dont ask me why. I like this
	return 0
	else:
	idx = percentile /100. * (self.count - 1)
	if (idx % 1 == 0):
	#simple.. get idxth numner
	return self._get_num_from_idx(int(idx))
	else:
	#do weight
	num1 = self._get_num_from_idx(int(idx))
	num2 = self._get_num_from_idx(int(idx) + 1)
	return int(num1 + (num2 - num1) * (idx % 1))
	return rank

	def _get_num_from_idx(self, idx):
	"""
	Returns number at position idx of the supposed sorted list
	"""
	numbers = sorted(self._realobj.keys())
	runningcount = 0
	for num in numbers:
	if idx >= runningcount and idx < runningcount + self._realobj[num] :
	return num
	runningcount += self._realobj[num]

	def get_std(self):
	"""
	TODO: Return standard deviation
	"""
	if self.count < 1:
	return 0
	else:
	mean = self.get_mean(precise=True)
	if mean == 0:
	return 0
	else:
	variance = 0
	for k in self._realobj.keys():
	variance += (math.pow(k, 2) * self._realobj[k]) / mean
	return int(math.sqrt(variance))


	def get_histogram(self, binwidth=25):
	"""
	Return histogram bins
	"""
	bins = {
	"binwidth": binwidth
	}
	hist = defaultdict(int)
	for k in self._realobj.keys():
	binkey = str(int(k / binwidth) * binwidth)
	hist[binkey] += self._realobj[k]
	bins["histo"] = dict(hist)
	return bins


	def get_aggrigations(self):
	output = {}
	output["median"] = self.get_median()
	output["mean"] = self.get_mean()
	output["percentiles"] = {}
	for p in [10, 20, 30, 40, 50, 60, 70, 80, 90]:
	output["percentiles"][str(p)] = self.get_percentile(p)
	output["std"] = self.get_std()
	output["histogram"] = self.get_histogram(binwidth=25)
	return output

	if __name__ == "__main__":
	from scipy import stats
	input = [5,100,1,2,3,4,2,2,3,4,2]
	print input
	ml = MagicList()
	for item in input:
	ml.append(item)
	print ml.count
	print ml.as_list()
	print ml.get_sum()
	print ml.get_mean()
	print ml.get_percentile(50)
	for i in range(0, 100):
	if stats.scoreatpercentile(input, i) != ml.get_percentile(i):
	raise Exception("WTF!")
	print ml.as_dict()