Skip to content

Instantly share code, notes, and snippets.

@sajal
Created October 23, 2012 22:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sajal/3941935 to your computer and use it in GitHub Desktop.
Save sajal/3941935 to your computer and use it in GitHub Desktop.
Memory efficient list...
from collections import defaultdict
import math
class MagicList():
"""
An attempt to make a python object which could potentially hold
unlimited values with finite distinct values.
All items are not stored in memory, only the count of each distinct item is.
So if you want to hold say 1 million numbers where each number is between 1 to 5000,
this should be a better choice than a list.
If you have [1,2,3,1,1,1,1,1,1,1,1,1,1,1,1,2,3], internally it will be stored as :-
{1: 13, 2: 2, 3: 2}
"""
def __init__(self, fromlist=None, fromdict=None):
"""
if none of fromlist and fromdict is supplied, it makes blank list
"""
self._realobj = defaultdict(int)
self.count = 0
if fromlist:
self.insert_list(fromlist)
elif fromdict:
self.insert_dict(fromdict)
def append(self, num, val=1):
"""
Insert a number into magiclist.
val is is the number of times this current number is to be added
"""
num = int(num)
val = int(val)
self._realobj[num] += val
self.count += val
def insert_list(self, fromlist):
"""
Give it a list type object/iterable, it will add in values to existing MagicList
"""
for item in fromlist:
self.append(item)
def insert_dict(self, fromdict):
"""
Give it a dict type object and it inserts it to current magiclist
Useful if u serialize MagicList and wanna unserialize it, or to join 2
"""
for num in fromdict.keys():
self.append(num, fromdict[num])
def as_list(self):
"""
Get normal list representation of the MagicList
"""
output = []
for key in self._realobj:
for i in range(self._realobj[key]):
output += [key]
return output
def as_dict(self):
return dict(self._realobj)
def get_sum(self):
"""
Sum of all items in magiclist
"""
total = 0
for key in self._realobj:
total += self._realobj[key] * key
return total
def get_mean(self, precise=False):
"""
Mean items in magiclist.
Returns a float
"""
if self.count < 1:
return 0
elif precise:
return (self.get_sum() * 1.0) / self.count
else:
return int((self.get_sum() * 1.0) / self.count)
def get_median(self):
"""
Returns median
"""
return self.get_percentile(50)
def get_percentile(self, percentile):
"""
Gets percentile, use 50 for median...
Returns a float
Logic shamelessly stollen from scipy
"""
#first check rank
if self.count < 1:
#Dont ask me why. I like this
return 0
else:
idx = percentile /100. * (self.count - 1)
if (idx % 1 == 0):
#simple.. get idxth numner
return self._get_num_from_idx(int(idx))
else:
#do weight
num1 = self._get_num_from_idx(int(idx))
num2 = self._get_num_from_idx(int(idx) + 1)
return int(num1 + (num2 - num1) * (idx % 1))
return rank
def _get_num_from_idx(self, idx):
"""
Returns number at position idx of the supposed sorted list
"""
numbers = sorted(self._realobj.keys())
runningcount = 0
for num in numbers:
if idx >= runningcount and idx < runningcount + self._realobj[num] :
return num
runningcount += self._realobj[num]
def get_std(self):
"""
TODO: Return standard deviation
"""
if self.count < 1:
return 0
else:
mean = self.get_mean(precise=True)
if mean == 0:
return 0
else:
variance = 0
for k in self._realobj.keys():
variance += (math.pow(k, 2) * self._realobj[k]) / mean
return int(math.sqrt(variance))
def get_histogram(self, binwidth=25):
"""
Return histogram bins
"""
bins = {
"binwidth": binwidth
}
hist = defaultdict(int)
for k in self._realobj.keys():
binkey = str(int(k / binwidth) * binwidth)
hist[binkey] += self._realobj[k]
bins["histo"] = dict(hist)
return bins
def get_aggrigations(self):
output = {}
output["median"] = self.get_median()
output["mean"] = self.get_mean()
output["percentiles"] = {}
for p in [10, 20, 30, 40, 50, 60, 70, 80, 90]:
output["percentiles"][str(p)] = self.get_percentile(p)
output["std"] = self.get_std()
output["histogram"] = self.get_histogram(binwidth=25)
return output
if __name__ == "__main__":
from scipy import stats
input = [5,100,1,2,3,4,2,2,3,4,2]
print input
ml = MagicList()
for item in input:
ml.append(item)
print ml.count
print ml.as_list()
print ml.get_sum()
print ml.get_mean()
print ml.get_percentile(50)
for i in range(0, 100):
if stats.scoreatpercentile(input, i) != ml.get_percentile(i):
raise Exception("WTF!")
print ml.as_dict()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment