Last active
June 18, 2019 15:45
-
-
Save ChingChuan-Chen/c887846ef14474ccf5158fa4d0f41fd3 to your computer and use it in GitHub Desktop.
DiscretizationMDLP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# put in discretization/
# Package __init__: re-export every public name from the compiled
# `discretization` extension module so callers can simply do
# `import discretization` and use DiscretizationMDLP directly.
from .discretization import *
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Build the Cython extension in place: compiles discretization.pyx via
# setup.py so the resulting shared object lands next to the sources and
# is importable without installation.
cd discretization
python setup.py build_ext --inplace
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# put in discretization/ | |
from libc.math cimport log, pow | |
from libcpp cimport bool | |
from libcpp.map cimport map as cppmap | |
from libcpp.vector cimport vector as cppvector | |
from libcpp.stack cimport stack as cppstack | |
from libcpp.set cimport set as cppset | |
from libcpp.iterator cimport iterator as cppiterator | |
from libcpp.algorithm cimport sort as cppsort | |
from libcpp.algorithm cimport upper_bound | |
from libcpp.pair cimport pair as cpppair | |
# Comparator for std::upper_bound(value, element): compares the scalar
# cut value against an (x, y) pair by its x component (pair.first).
# NOTE(review): uses `>=` rather than `>`, which matters only when the
# searched value equals an element's x; the callers search midpoints
# between distinct x values, so equality should not occur — verify.
cdef bool cmp(double v, cpppair[double, long] data):
    return data.first >= v
cdef class DiscretizationMDLP:
    # MDLP supervised discretization (Fayyad & Irani style): recursively
    # pick the entropy-minimizing binary cut of a sorted interval and
    # stop when the minimum-description-length criterion rejects the split.
    cdef:
        # (x, y) pairs kept sorted by x; .first is the continuous value,
        # .second the integer class label
        cppvector[cpppair[double, long]] data
    def __init__(self, cppvector[double] x, cppvector[long] y):
        # x: continuous feature values; y: class labels of equal length.
        # Raises Exception when the lengths differ.
        if x.size() != y.size():
            raise Exception("The length of x must be equal to the length of y.")
        cdef size_t i
        for i in range(x.size()):
            self.data.push_back(cpppair[double, long](x[i], y[i]))
        # std::pair orders by .first, so this sorts the pairs by x value
        cppsort(self.data.begin(), self.data.end())
    def get_data(self):
        # Return the sorted pairs (auto-converted to a Python list of tuples).
        return self.data
    def get_entropy(self, size_t low, size_t upp):
        # Return (entropy, distinct-label count) of data[low:upp).
        cdef:
            cppmap[long, long] freq
            size_t n = upp-low, i
        for i in range(low, upp):
            freq[self.data[i].second] += 1;
        cdef:
            double ent = 0.0, temp
        # iterate the (label, count) pairs of the frequency map
        for _, v in freq:
            temp = <double> v / n
            ent += temp * log(temp)
        return -ent, freq.size()
    def get_cut(self, size_t low, size_t upp):
        # Find the best binary cut of data[low:upp).
        # Returns (best entropy, absolute cut index, cut value, stop flag);
        # the stop flag is True when no valid cut exists or the MDLP
        # criterion says the gain does not justify splitting.
        cdef:
            size_t i, n = upp - low, left_y_cnt, right_y_cnt, k1, k2
            long curr_cut, prev_cut = -1
            double whole_ent, prev_cut_value, curr_cut_value, prev_ent = 9999.0, weight, curr_ent
            double left_ent, right_ent, entropy1, entropy2, w
        whole_ent, k = self.get_entropy(low, upp)
        for i in range(low, upp-1):
            # only boundaries between distinct x values are candidate cuts
            if self.data[i].first != self.data[i+1].first:
                curr_cut_value = (self.data[i].first + self.data[i+1].first) / 2.0
                # locate the first pair whose x is past the midpoint; `cmp`
                # compares the scalar midpoint against pair.first
                curr_cut = upper_bound(self.data.begin(), self.data.end(),
                                       curr_cut_value, cmp) - self.data.begin()
                weight = <double> (curr_cut-low)/n
                left_ent, left_y_cnt = self.get_entropy(low, curr_cut)
                right_ent, right_y_cnt = self.get_entropy(curr_cut, upp)
                # weighted class entropy of the two partitions
                curr_ent = weight * left_ent + (1.-weight) * right_ent
                if curr_ent < prev_ent:
                    prev_ent, prev_cut, prev_cut_value = curr_ent, curr_cut, curr_cut_value
                    # retained for the MDLP stopping criterion below
                    entropy1, entropy2, k1, k2, w = left_ent, right_ent, left_y_cnt, right_y_cnt, weight
        if prev_cut == -1:
            # fewer than two distinct x values in the interval: nothing to split
            return 0.0, 0, 0.0, True
        else:
            # information gain of the best split
            gain = whole_ent - (w * entropy1 + (1.-w) * entropy2)
            # MDL penalty term (delta of the Fayyad & Irani stopping rule)
            delta = log(pow(3, <double> k) - 2.) - (<double> k * whole_ent -
                                                    <double> k1 * entropy1 - <double> k2 * entropy2)
            stop_cut = gain <= 1. / (<double> n) * (log(<double> n - 1.) + delta)
            return prev_ent, prev_cut, prev_cut_value, stop_cut
    def get_partition_points(self):
        # Recursive partitioning driven by an explicit stack (depth-first).
        # Returns (sorted cut indices, sorted cut values).
        cdef:
            cppvector[size_t] cut_locations
            cppvector[double] cut_points
            cppstack[cpppair[size_t, size_t]] search_intervals
            long cut
            double entropy, cut_value
            bool stop_cut
        search_intervals.push(cpppair[size_t, size_t](0, self.data.size()))
        while search_intervals.size() > 0:
            low, upp = search_intervals.top()
            search_intervals.pop()
            entropy, cut, cut_value, stop_cut = self.get_cut(low, upp)
            if stop_cut:
                continue
            # accepted cut: recurse into both halves around it
            search_intervals.push(cpppair[size_t, size_t](low, <size_t> cut))
            search_intervals.push(cpppair[size_t, size_t](<size_t> cut, upp))
            cut_locations.push_back(cut)
            cut_points.push_back(cut_value)
        cppsort(cut_locations.begin(), cut_locations.end())
        cppsort(cut_points.begin(), cut_points.end())
        return cut_locations, cut_points
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from math import log | |
import bisect | |
from collections import Counter | |
class DiscretizationMDLP(object):
    """Supervised discretization of a continuous feature via the MDLP
    criterion (Fayyad & Irani): recursively choose the entropy-minimizing
    binary cut point and stop splitting an interval when the
    minimum-description-length test rejects the gain."""

    def __init__(self, x, y):
        """Store (x, y) pairs sorted by x.

        x: sequence of continuous values.
        y: sequence of class labels, same length as x.
        Raises Exception when the lengths differ.
        """
        if len(x) != len(y):
            raise Exception("The length of x must be equal to the length of y.")
        # sort by the continuous value so every candidate interval is a
        # contiguous slice of self.data
        self.data = sorted(zip(x, y), key=lambda pair: pair[0])
        # parallel key list so cut values can be located by binary search
        self.keys = [pair[0] for pair in self.data]

    def _get_entropy(self, y):
        """Return (entropy of y, number of distinct labels in y)."""
        counter_y = Counter(y)
        total = len(y)
        if len(counter_y) == 1:
            # a pure partition has zero entropy; skip the log calls
            return 0, 1
        return (-sum(c / total * log(c / total) for c in counter_y.values()),
                len(counter_y))

    def get_cut(self, low, upp):
        """Find the best binary cut of data[low:upp].

        Returns (best entropy, absolute cut index, cut value, stop flag).
        The stop flag is True when no valid cut exists or the MDLP
        criterion says the gain does not justify splitting; in the former
        case the other three values are None.
        """
        support = sorted(set(self.keys[low:upp]))
        n = upp - low
        # labels of the interval, extracted once and re-sliced per candidate
        labels = [y for _, y in self.data[low:upp]]
        prev_ent, prev_cut, prev_cut_value = 9999, None, None
        entropy1 = entropy2 = k1 = k2 = w = None
        whole_ent, k = self._get_entropy(labels)
        for left_val, right_val in zip(support, support[1:]):
            curr_cut_value = (left_val + right_val) / 2
            # search the shared key list in place via lo/hi bounds instead
            # of copying self.keys[low:upp] on every candidate (the copy
            # made each get_cut call accidentally quadratic)
            curr_cut = bisect.bisect_right(self.keys, curr_cut_value, low, upp) - low
            weight = curr_cut / n
            left_ent, left_y_cnt = self._get_entropy(labels[:curr_cut])
            right_ent, right_y_cnt = self._get_entropy(labels[curr_cut:])
            # weighted class entropy of the two partitions
            curr_ent = weight * left_ent + (1 - weight) * right_ent
            if curr_ent < prev_ent:
                prev_ent, prev_cut, prev_cut_value = curr_ent, curr_cut, curr_cut_value
                # retained for the MDLP stopping criterion below
                entropy1, entropy2, k1, k2, w = left_ent, right_ent, left_y_cnt, right_y_cnt, weight
        if prev_cut is None:
            # fewer than two distinct x values: nothing to split
            return None, None, None, True
        # information gain of the best split
        gain = whole_ent - (w * entropy1 + (1 - w) * entropy2)
        # MDL penalty term (delta of the Fayyad & Irani stopping rule)
        delta = log(pow(3, k) - 2) - (k * whole_ent - k1 * entropy1 - k2 * entropy2)
        stop_cut = gain <= (log(n - 1) + delta) / n
        return prev_ent, low + prev_cut, prev_cut_value, stop_cut

    def get_partition_points(self):
        """Return (sorted cut indices, sorted cut values) for the data.

        Recursive partitioning driven by an explicit stack (depth-first).
        """
        cut_location = []
        cut_points = []
        search_intervals = [(0, len(self.data))]
        while search_intervals:
            low, upp = search_intervals.pop()
            entropy, cut, cut_value, stop_cut = self.get_cut(low, upp)
            if stop_cut:
                continue
            # accepted cut: recurse into both halves around it
            search_intervals.append((low, cut))
            search_intervals.append((cut, upp))
            cut_location.append(cut)
            cut_points.append(cut_value)
        return sorted(cut_location), sorted(cut_points)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import timeit

# Data snippet shared by every timeit setup string: 16 hand-picked
# (x, y) pairs tiled 1000x.
data_str = """
x = [139., 139., 139., 139., 1490., 1490., 1490., 32456., 32456.,
    33444., 33444., 33444., 35666., 35666., 35666., 35666.] * 1000
y = [1, 1, 1, 1, 2, 2, 2, 3, 4, 3, 3, 4, 3, 3, 4, 4] * 1000
"""

# The mdlp-discretization package wants numpy arrays with a 2-D x.
module_setup = """
from mdlp.discretization import MDLP
import numpy as np
""" + data_str + """
x = np.array(x).reshape([len(x), -1])
y = np.array(y)
"""

# (label, timed statement, setup) for each of the three implementations.
benchmarks = [
    ("mdlp-discretization: ",
     "mdlpfit = MDLP()\nmdlpfit.fit(x, y)",
     module_setup),
    ("Python code: ",
     'DiscretizationMDLP.DiscretizationMDLP(x, y).get_partition_points()',
     "import DiscretizationMDLP" + data_str),
    ("Cython code: ",
     'discretization.DiscretizationMDLP(x, y).get_partition_points()',
     "import discretization" + data_str),
]

for label, stmt, setup in benchmarks:
    print(label, timeit.timeit(stmt, setup=setup, number=50), "seconds")

# mdlp-discretization: 53.94980424300002 seconds
# Python code: 0.9393027749999945 seconds
# Cython code: 0.053218557000001 seconds
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# put in discretization/
# Build configuration: compiles discretization.pyx with Cython as C++
# (required by the libcpp cimports) into an extension module named
# `discretization`. Invoked by build.sh via `python setup.py build_ext
# --inplace`.
# NOTE(review): distutils is deprecated and removed in Python 3.12;
# consider switching to setuptools for newer interpreters.
from distutils.core import setup, Extension
from Cython.Build import cythonize
setup(ext_modules = cythonize(Extension(name="discretization",
                                        sources=["discretization.pyx"],
                                        language='c++')))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment