Created
October 23, 2012 22:02
-
-
Save sajal/3941935 to your computer and use it in GitHub Desktop.
Memory efficient list...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict | |
import math | |
class MagicList(): | |
""" | |
An attempt to make a python object which could potentially hold | |
unlimited values with finite distinct values. | |
All items are not stored in memory, only the count of each distinct item is. | |
So if you want to hold say 1 million numbers where each number is between 1 to 5000, | |
this should be a better choice than a list. | |
If you have [1,2,3,1,1,1,1,1,1,1,1,1,1,1,1,2,3], internally it will be stored as :- | |
{1: 13, 2: 2, 3: 2} | |
""" | |
def __init__(self, fromlist=None, fromdict=None): | |
""" | |
if none of fromlist and fromdict is supplied, it makes blank list | |
""" | |
self._realobj = defaultdict(int) | |
self.count = 0 | |
if fromlist: | |
self.insert_list(fromlist) | |
elif fromdict: | |
self.insert_dict(fromdict) | |
def append(self, num, val=1): | |
""" | |
Insert a number into magiclist. | |
val is is the number of times this current number is to be added | |
""" | |
num = int(num) | |
val = int(val) | |
self._realobj[num] += val | |
self.count += val | |
def insert_list(self, fromlist): | |
""" | |
Give it a list type object/iterable, it will add in values to existing MagicList | |
""" | |
for item in fromlist: | |
self.append(item) | |
def insert_dict(self, fromdict): | |
""" | |
Give it a dict type object and it inserts it to current magiclist | |
Useful if u serialize MagicList and wanna unserialize it, or to join 2 | |
""" | |
for num in fromdict.keys(): | |
self.append(num, fromdict[num]) | |
def as_list(self): | |
""" | |
Get normal list representation of the MagicList | |
""" | |
output = [] | |
for key in self._realobj: | |
for i in range(self._realobj[key]): | |
output += [key] | |
return output | |
def as_dict(self): | |
return dict(self._realobj) | |
def get_sum(self): | |
""" | |
Sum of all items in magiclist | |
""" | |
total = 0 | |
for key in self._realobj: | |
total += self._realobj[key] * key | |
return total | |
def get_mean(self, precise=False): | |
""" | |
Mean items in magiclist. | |
Returns a float | |
""" | |
if self.count < 1: | |
return 0 | |
elif precise: | |
return (self.get_sum() * 1.0) / self.count | |
else: | |
return int((self.get_sum() * 1.0) / self.count) | |
def get_median(self): | |
""" | |
Returns median | |
""" | |
return self.get_percentile(50) | |
def get_percentile(self, percentile): | |
""" | |
Gets percentile, use 50 for median... | |
Returns a float | |
Logic shamelessly stollen from scipy | |
""" | |
#first check rank | |
if self.count < 1: | |
#Dont ask me why. I like this | |
return 0 | |
else: | |
idx = percentile /100. * (self.count - 1) | |
if (idx % 1 == 0): | |
#simple.. get idxth numner | |
return self._get_num_from_idx(int(idx)) | |
else: | |
#do weight | |
num1 = self._get_num_from_idx(int(idx)) | |
num2 = self._get_num_from_idx(int(idx) + 1) | |
return int(num1 + (num2 - num1) * (idx % 1)) | |
return rank | |
def _get_num_from_idx(self, idx): | |
""" | |
Returns number at position idx of the supposed sorted list | |
""" | |
numbers = sorted(self._realobj.keys()) | |
runningcount = 0 | |
for num in numbers: | |
if idx >= runningcount and idx < runningcount + self._realobj[num] : | |
return num | |
runningcount += self._realobj[num] | |
def get_std(self): | |
""" | |
TODO: Return standard deviation | |
""" | |
if self.count < 1: | |
return 0 | |
else: | |
mean = self.get_mean(precise=True) | |
if mean == 0: | |
return 0 | |
else: | |
variance = 0 | |
for k in self._realobj.keys(): | |
variance += (math.pow(k, 2) * self._realobj[k]) / mean | |
return int(math.sqrt(variance)) | |
def get_histogram(self, binwidth=25): | |
""" | |
Return histogram bins | |
""" | |
bins = { | |
"binwidth": binwidth | |
} | |
hist = defaultdict(int) | |
for k in self._realobj.keys(): | |
binkey = str(int(k / binwidth) * binwidth) | |
hist[binkey] += self._realobj[k] | |
bins["histo"] = dict(hist) | |
return bins | |
def get_aggrigations(self): | |
output = {} | |
output["median"] = self.get_median() | |
output["mean"] = self.get_mean() | |
output["percentiles"] = {} | |
for p in [10, 20, 30, 40, 50, 60, 70, 80, 90]: | |
output["percentiles"][str(p)] = self.get_percentile(p) | |
output["std"] = self.get_std() | |
output["histogram"] = self.get_histogram(binwidth=25) | |
return output | |
if __name__ == "__main__": | |
from scipy import stats | |
input = [5,100,1,2,3,4,2,2,3,4,2] | |
print input | |
ml = MagicList() | |
for item in input: | |
ml.append(item) | |
print ml.count | |
print ml.as_list() | |
print ml.get_sum() | |
print ml.get_mean() | |
print ml.get_percentile(50) | |
for i in range(0, 100): | |
if stats.scoreatpercentile(input, i) != ml.get_percentile(i): | |
raise Exception("WTF!") | |
print ml.as_dict() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment