Instantly share code, notes, and snippets.

# sajal/magiclist.py Created Oct 23, 2012

Memory efficient list...
 from collections import defaultdict import math class MagicList(): """ An attempt to make a python object which could potentially hold unlimited values with finite distinct values. All items are not stored in memory, only the count of each distinct item is. So if you want to hold say 1 million numbers where each number is between 1 to 5000, this should be a better choice than a list. If you have [1,2,3,1,1,1,1,1,1,1,1,1,1,1,1,2,3], internally it will be stored as :- {1: 13, 2: 2, 3: 2} """ def __init__(self, fromlist=None, fromdict=None): """ if none of fromlist and fromdict is supplied, it makes blank list """ self._realobj = defaultdict(int) self.count = 0 if fromlist: self.insert_list(fromlist) elif fromdict: self.insert_dict(fromdict) def append(self, num, val=1): """ Insert a number into magiclist. val is is the number of times this current number is to be added """ num = int(num) val = int(val) self._realobj[num] += val self.count += val def insert_list(self, fromlist): """ Give it a list type object/iterable, it will add in values to existing MagicList """ for item in fromlist: self.append(item) def insert_dict(self, fromdict): """ Give it a dict type object and it inserts it to current magiclist Useful if u serialize MagicList and wanna unserialize it, or to join 2 """ for num in fromdict.keys(): self.append(num, fromdict[num]) def as_list(self): """ Get normal list representation of the MagicList """ output = [] for key in self._realobj: for i in range(self._realobj[key]): output += [key] return output def as_dict(self): return dict(self._realobj) def get_sum(self): """ Sum of all items in magiclist """ total = 0 for key in self._realobj: total += self._realobj[key] * key return total def get_mean(self, precise=False): """ Mean items in magiclist. Returns a float """ if self.count < 1: return 0 elif precise: return (self.get_sum() * 1.0) / self.count else: return int((self.get_sum() * 1.0) / self.count) def get_median(self): """ Returns median """ return self.get_percentile(50) def get_percentile(self, percentile): """ Gets percentile, use 50 for median... Returns a float Logic shamelessly stollen from scipy """ #first check rank if self.count < 1: #Dont ask me why. I like this return 0 else: idx = percentile /100. * (self.count - 1) if (idx % 1 == 0): #simple.. get idxth numner return self._get_num_from_idx(int(idx)) else: #do weight num1 = self._get_num_from_idx(int(idx)) num2 = self._get_num_from_idx(int(idx) + 1) return int(num1 + (num2 - num1) * (idx % 1)) return rank def _get_num_from_idx(self, idx): """ Returns number at position idx of the supposed sorted list """ numbers = sorted(self._realobj.keys()) runningcount = 0 for num in numbers: if idx >= runningcount and idx < runningcount + self._realobj[num] : return num runningcount += self._realobj[num] def get_std(self): """ TODO: Return standard deviation """ if self.count < 1: return 0 else: mean = self.get_mean(precise=True) if mean == 0: return 0 else: variance = 0 for k in self._realobj.keys(): variance += (math.pow(k, 2) * self._realobj[k]) / mean return int(math.sqrt(variance)) def get_histogram(self, binwidth=25): """ Return histogram bins """ bins = { "binwidth": binwidth } hist = defaultdict(int) for k in self._realobj.keys(): binkey = str(int(k / binwidth) * binwidth) hist[binkey] += self._realobj[k] bins["histo"] = dict(hist) return bins def get_aggrigations(self): output = {} output["median"] = self.get_median() output["mean"] = self.get_mean() output["percentiles"] = {} for p in [10, 20, 30, 40, 50, 60, 70, 80, 90]: output["percentiles"][str(p)] = self.get_percentile(p) output["std"] = self.get_std() output["histogram"] = self.get_histogram(binwidth=25) return output if __name__ == "__main__": from scipy import stats input = [5,100,1,2,3,4,2,2,3,4,2] print input ml = MagicList() for item in input: ml.append(item) print ml.count print ml.as_list() print ml.get_sum() print ml.get_mean() print ml.get_percentile(50) for i in range(0, 100): if stats.scoreatpercentile(input, i) != ml.get_percentile(i): raise Exception("WTF!") print ml.as_dict()