Skip to content

Instantly share code, notes, and snippets.

@ChingChuan-Chen
Last active June 18, 2019 15:45
Show Gist options
  • Save ChingChuan-Chen/c887846ef14474ccf5158fa4d0f41fd3 to your computer and use it in GitHub Desktop.
DiscretizationMDLP
# put in discretization/
from .discretization import *
#!/usr/bin/env bash
# Build the Cython extension in place.
# Run from the repository root (the directory containing discretization/).
set -euo pipefail  # abort immediately if cd fails or the build errors out
cd discretization
python setup.py build_ext --inplace
# put in discretization/
from libc.math cimport log, pow
from libcpp cimport bool
from libcpp.map cimport map as cppmap
from libcpp.vector cimport vector as cppvector
from libcpp.stack cimport stack as cppstack
from libcpp.set cimport set as cppset
from libcpp.iterator cimport iterator as cppiterator
from libcpp.algorithm cimport sort as cppsort
from libcpp.algorithm cimport upper_bound
from libcpp.pair cimport pair as cpppair
# Comparator passed to std::upper_bound: returns True when element `data`
# should come strictly after the searched value `v`.
# NOTE(review): standard upper_bound semantics use strict `v < element`;
# `>=` here behaves like lower_bound on ties.  In this module the searched
# values are midpoints between two distinct keys, so a tie cannot occur and
# the two forms coincide — confirm if reused elsewhere.
cdef bool cmp(double v, cpppair[double, long] data):
    return data.first >= v
cdef class DiscretizationMDLP:
    """Discretize a continuous feature against class labels with the MDLP
    (Minimum Description Length Principle) stopping criterion.

    Data is stored as (value, label) pairs sorted by value, so every
    candidate partition is a contiguous index range [low, upp).
    """
    cdef:
        # (feature value, class label) pairs, sorted ascending by value
        cppvector[cpppair[double, long]] data

    def __init__(self, cppvector[double] x, cppvector[long] y):
        """Pair up x (feature values) with y (labels) and sort by value.

        Raises Exception when the two inputs differ in length.
        """
        if x.size() != y.size():
            raise Exception("The length of x must be equal to the length of y.")
        cdef size_t i
        for i in range(x.size()):
            self.data.push_back(cpppair[double, long](x[i], y[i]))
        # std::pair sorts lexicographically, i.e. by feature value first
        cppsort(self.data.begin(), self.data.end())

    def get_data(self):
        # Expose the sorted (value, label) pairs (converted to Python objects).
        return self.data

    def get_entropy(self, size_t low, size_t upp):
        """Return (Shannon entropy in nats, number of distinct labels)
        for the label slice data[low:upp]."""
        cdef:
            cppmap[long, long] freq
            size_t n = upp-low, i
        # count label frequencies over the interval
        for i in range(low, upp):
            freq[self.data[i].second] += 1;
        cdef:
            double ent = 0.0, temp
        # ent accumulates sum(p * log p); negated on return
        for _, v in freq:
            temp = <double> v / n
            ent += temp * log(temp)
        return -ent, freq.size()

    def get_cut(self, size_t low, size_t upp):
        """Scan data[low:upp) for the boundary that minimizes the weighted
        entropy of the two resulting halves.

        Returns (best entropy, absolute cut index, cut value, stop_cut)
        where stop_cut is True when no candidate exists or the MDLP
        criterion rejects the best cut.
        """
        cdef:
            size_t i, n = upp - low, left_y_cnt, right_y_cnt, k1, k2
            long curr_cut, prev_cut = -1
            # prev_ent starts at a sentinel larger than any realistic entropy
            double whole_ent, prev_cut_value, curr_cut_value, prev_ent = 9999.0, weight, curr_ent
            double left_ent, right_ent, entropy1, entropy2, w
        whole_ent, k = self.get_entropy(low, upp)
        for i in range(low, upp-1):
            # only boundaries between two distinct values are candidate cuts
            if self.data[i].first != self.data[i+1].first:
                curr_cut_value = (self.data[i].first + self.data[i+1].first) / 2.0
                # midpoint lies strictly inside (low, upp), so searching the
                # whole sorted vector yields the same index as the sub-range
                curr_cut = upper_bound(self.data.begin(), self.data.end(),
                                       curr_cut_value, cmp) - self.data.begin()
                weight = <double> (curr_cut-low)/n
                left_ent, left_y_cnt = self.get_entropy(low, curr_cut)
                right_ent, right_y_cnt = self.get_entropy(curr_cut, upp)
                curr_ent = weight * left_ent + (1.-weight) * right_ent
                if curr_ent < prev_ent:
                    prev_ent, prev_cut, prev_cut_value = curr_ent, curr_cut, curr_cut_value
                    # retained for the MDLP stopping criterion below
                    entropy1, entropy2, k1, k2, w = left_ent, right_ent, left_y_cnt, right_y_cnt, weight
        if prev_cut == -1:
            # no distinct adjacent values: nothing to cut
            return 0.0, 0, 0.0, True
        else:
            # MDLP stop rule (Fayyad & Irani): accept the cut only when the
            # information gain exceeds (log(n-1) + delta) / n
            gain = whole_ent - (w * entropy1 + (1.-w) * entropy2)
            delta = log(pow(3, <double> k) - 2.) - (<double> k * whole_ent -
                                                    <double> k1 * entropy1 - <double> k2 * entropy2)
            stop_cut = gain <= 1. / (<double> n) * (log(<double> n - 1.) + delta)
            return prev_ent, prev_cut, prev_cut_value, stop_cut

    def get_partition_points(self):
        """Recursively split [0, len(data)) with get_cut until the MDLP
        criterion stops; return (sorted cut indices, sorted cut values)."""
        cdef:
            cppvector[size_t] cut_locations
            cppvector[double] cut_points
            # explicit stack replaces recursion over the intervals
            cppstack[cpppair[size_t, size_t]] search_intervals
            long cut
            double entropy, cut_value
            bool stop_cut
        search_intervals.push(cpppair[size_t, size_t](0, self.data.size()))
        while search_intervals.size() > 0:
            low, upp = search_intervals.top()
            search_intervals.pop()
            entropy, cut, cut_value, stop_cut = self.get_cut(low, upp)
            if stop_cut:
                continue
            # accepted cut: recurse into both halves
            search_intervals.push(cpppair[size_t, size_t](low, <size_t> cut))
            search_intervals.push(cpppair[size_t, size_t](<size_t> cut, upp))
            cut_locations.push_back(cut)
            cut_points.push_back(cut_value)
        cppsort(cut_locations.begin(), cut_locations.end())
        cppsort(cut_points.begin(), cut_points.end())
        return cut_locations, cut_points
from math import log
import bisect
from collections import Counter
class DiscretizationMDLP(object):
    """Discretize a continuous feature against class labels with the MDLP
    (Minimum Description Length Principle) stopping criterion of
    Fayyad & Irani.

    Data is stored as (value, label) pairs sorted by value, so every
    candidate partition is a contiguous index range [low, upp).
    """

    def __init__(self, x, y):
        """Pair up x (feature values) with y (labels) and sort by value.

        Raises Exception when the two inputs differ in length.
        """
        if len(x) != len(y):
            raise Exception("The length of x must be equal to the length of y.")
        # sort by feature value so every interval is a contiguous slice
        self.data = sorted(zip(x, y), key=lambda pair: pair[0])
        # sorted feature values kept separately for binary search
        self.keys = [key for key, _ in self.data]

    def _get_entropy(self, y):
        """Return (Shannon entropy of y in nats, number of distinct labels)."""
        counter_y = Counter(y)
        total = len(y)
        ps = [count / total for count in counter_y.values()]
        if len(ps) == 1:
            # single class: entropy is exactly zero (avoid log round-off)
            return 0, len(counter_y)
        return -sum(p * log(p) for p in ps), len(counter_y)

    def get_cut(self, low, upp):
        """Find the entropy-minimizing cut inside data[low:upp].

        Returns (best entropy, absolute cut index, cut value, stop_cut);
        stop_cut is True when no candidate cut exists or the MDLP
        criterion rejects the best one.
        """
        # distinct values present in the interval; midpoints between
        # consecutive distinct values are the candidate cut points
        support = sorted(set(self.keys[low:upp]))
        n = upp - low
        # +inf sentinel so the first candidate always wins
        prev_ent, prev_cut, prev_cut_value = float("inf"), None, None
        entropy1, entropy2, k1, k2, w = None, None, None, None, None
        whole_ent, k = self._get_entropy([y for _, y in self.data[low:upp]])
        for i in range(len(support) - 1):
            curr_cut_value = (support[i] + support[i + 1]) / 2
            # search self.keys in place with lo/hi bounds instead of
            # slicing an O(n) copy for every candidate cut
            curr_cut = bisect.bisect_right(self.keys, curr_cut_value, low, upp) - low
            weight = curr_cut / n
            # entropies of the two partitions induced by the cut
            left_ent, left_y_cnt = self._get_entropy([y for _, y in self.data[low:low + curr_cut]])
            right_ent, right_y_cnt = self._get_entropy([y for _, y in self.data[low + curr_cut:upp]])
            curr_ent = weight * left_ent + (1 - weight) * right_ent
            if curr_ent < prev_ent:
                prev_ent, prev_cut, prev_cut_value = curr_ent, curr_cut, curr_cut_value
                # retained for the stopping criterion below
                entropy1, entropy2, k1, k2, w = left_ent, right_ent, left_y_cnt, right_y_cnt, weight
        if prev_cut is None:
            # no distinct adjacent values: nothing to cut
            return None, None, None, True
        # MDLP stop rule: accept only when the information gain exceeds
        # (log(n - 1) + delta) / n
        gain = whole_ent - (w * entropy1 + (1 - w) * entropy2)
        delta = log(pow(3, k) - 2) - (k * whole_ent - k1 * entropy1 - k2 * entropy2)
        stop_cut = gain <= 1 / n * (log(n - 1) + delta)
        return prev_ent, low + prev_cut, prev_cut_value, stop_cut

    def get_partition_points(self):
        """Recursively split [0, len(data)) until MDLP stops; return
        (sorted cut indices, sorted cut values)."""
        cut_location = []
        cut_points = []
        # explicit stack replaces recursion over the intervals
        search_intervals = [(0, len(self.data))]
        while search_intervals:
            low, upp = search_intervals.pop()
            entropy, cut, cut_value, stop_cut = self.get_cut(low, upp)
            if stop_cut:
                continue
            # accepted cut: recurse into both halves
            search_intervals.append((low, cut))
            search_intervals.append((cut, upp))
            cut_location.append(cut)
            cut_points.append(cut_value)
        return sorted(cut_location), sorted(cut_points)
# Benchmark: time the PyPI mdlp-discretization package, the pure-Python
# class, and the compiled Cython extension on the same data (50 runs each).
import timeit
# Toy data set of 16 observations replicated 1000x; shared by all setups.
data_str = """
x = [139., 139., 139., 139., 1490., 1490., 1490., 32456., 32456.,
33444., 33444., 33444., 35666., 35666., 35666., 35666.] * 1000
y = [1, 1, 1, 1, 2, 2, 2, 3, 4, 3, 3, 4, 3, 3, 4, 4] * 1000
"""
# Setup for the third-party package, which expects numpy arrays
# (x reshaped to a single-feature 2-D array).
module_setup = """
from mdlp.discretization import MDLP
import numpy as np
""" + data_str + """
x = np.array(x).reshape([len(x), -1])
y = np.array(y)
"""
# Setups importing the pure-Python module and the Cython extension.
python_setup = "import DiscretizationMDLP" + data_str
cython_setup = "import discretization" + data_str
print("mdlp-discretization: ", timeit.timeit("mdlpfit = MDLP()\nmdlpfit.fit(x, y)",
                                             setup=module_setup, number=50), "seconds")
print("Python code: ", timeit.timeit('DiscretizationMDLP.DiscretizationMDLP(x, y).get_partition_points()',
                                     setup=python_setup, number=50), "seconds")
print("Cython code: ", timeit.timeit('discretization.DiscretizationMDLP(x, y).get_partition_points()',
                                     setup=cython_setup, number=50), "seconds")
# Timings reported by the gist author:
# mdlp-discretization: 53.94980424300002 seconds
# Python code: 0.9393027749999945 seconds
# Cython code: 0.053218557000001 seconds
# put in discretization/
# Build script for the Cython extension:
#   python setup.py build_ext --inplace
# NOTE: distutils was deprecated by PEP 632 and removed in Python 3.12;
# setuptools provides drop-in setup/Extension replacements and is the
# build backend recommended by the Cython documentation.
from setuptools import setup, Extension
from Cython.Build import cythonize

setup(
    ext_modules=cythonize(
        Extension(
            name="discretization",
            sources=["discretization.pyx"],
            language="c++",  # required for the libcpp containers used in the .pyx
        )
    )
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment