Skip to content

Instantly share code, notes, and snippets.

@ChingChuan-Chen
Last active June 18, 2019 15:45
Show Gist options
  • Save ChingChuan-Chen/c887846ef14474ccf5158fa4d0f41fd3 to your computer and use it in GitHub Desktop.
DiscretizationMDLP
# put in discretization/
from .discretization import *
#!/usr/bin/env bash
# Build the Cython extension in place.
# Run from the repository root (the directory containing discretization/).
set -euo pipefail  # abort immediately if cd fails or the build errors out
cd discretization
python setup.py build_ext --inplace
# put in discretization/
from libc.math cimport log, pow
from libcpp cimport bool
from libcpp.map cimport map as cppmap
from libcpp.vector cimport vector as cppvector
from libcpp.stack cimport stack as cppstack
from libcpp.set cimport set as cppset
from libcpp.iterator cimport iterator as cppiterator
from libcpp.algorithm cimport sort as cppsort
from libcpp.algorithm cimport upper_bound
from libcpp.pair cimport pair as cpppair
# Comparator passed to std::upper_bound: returns True when element `data`
# should come strictly after the searched value `v`.
# NOTE(review): standard upper_bound semantics use strict `v < element`;
# `>=` here behaves like lower_bound on ties.  In this module the searched
# values are midpoints between two distinct keys, so a tie cannot occur and
# the two forms coincide — confirm if reused elsewhere.
cdef bool cmp(double v, cpppair[double, long] data):
    return data.first >= v
cdef class DiscretizationMDLP:
    """Discretize a continuous feature against class labels with the MDLP
    (Minimum Description Length Principle) stopping criterion.

    Data is stored as (value, label) pairs sorted by value, so every
    candidate partition is a contiguous index range [low, upp).
    """
    cdef:
        # (feature value, class label) pairs, sorted ascending by value
        cppvector[cpppair[double, long]] data

    def __init__(self, cppvector[double] x, cppvector[long] y):
        """Pair up x (feature values) with y (labels) and sort by value.

        Raises Exception when the two inputs differ in length.
        """
        if x.size() != y.size():
            raise Exception("The length of x must be equal to the length of y.")
        cdef size_t i
        for i in range(x.size()):
            self.data.push_back(cpppair[double, long](x[i], y[i]))
        # std::pair sorts lexicographically, i.e. by feature value first
        cppsort(self.data.begin(), self.data.end())

    def get_data(self):
        # Expose the sorted (value, label) pairs (converted to Python objects).
        return self.data

    def get_entropy(self, size_t low, size_t upp):
        """Return (Shannon entropy in nats, number of distinct labels)
        for the label slice data[low:upp]."""
        cdef:
            cppmap[long, long] freq
            size_t n = upp-low, i
        # count label frequencies over the interval
        for i in range(low, upp):
            freq[self.data[i].second] += 1;
        cdef:
            double ent = 0.0, temp
        # ent accumulates sum(p * log p); negated on return
        for _, v in freq:
            temp = <double> v / n
            ent += temp * log(temp)
        return -ent, freq.size()

    def get_cut(self, size_t low, size_t upp):
        """Scan data[low:upp) for the boundary that minimizes the weighted
        entropy of the two resulting halves.

        Returns (best entropy, absolute cut index, cut value, stop_cut)
        where stop_cut is True when no candidate exists or the MDLP
        criterion rejects the best cut.
        """
        cdef:
            size_t i, n = upp - low, left_y_cnt, right_y_cnt, k1, k2
            long curr_cut, prev_cut = -1
            # prev_ent starts at a sentinel larger than any realistic entropy
            double whole_ent, prev_cut_value, curr_cut_value, prev_ent = 9999.0, weight, curr_ent
            double left_ent, right_ent, entropy1, entropy2, w
        whole_ent, k = self.get_entropy(low, upp)
        for i in range(low, upp-1):
            # only boundaries between two distinct values are candidate cuts
            if self.data[i].first != self.data[i+1].first:
                curr_cut_value = (self.data[i].first + self.data[i+1].first) / 2.0
                # midpoint lies strictly inside (low, upp), so searching the
                # whole sorted vector yields the same index as the sub-range
                curr_cut = upper_bound(self.data.begin(), self.data.end(),
                                       curr_cut_value, cmp) - self.data.begin()
                weight = <double> (curr_cut-low)/n
                left_ent, left_y_cnt = self.get_entropy(low, curr_cut)
                right_ent, right_y_cnt = self.get_entropy(curr_cut, upp)
                curr_ent = weight * left_ent + (1.-weight) * right_ent
                if curr_ent < prev_ent:
                    prev_ent, prev_cut, prev_cut_value = curr_ent, curr_cut, curr_cut_value
                    # retained for the MDLP stopping criterion below
                    entropy1, entropy2, k1, k2, w = left_ent, right_ent, left_y_cnt, right_y_cnt, weight
        if prev_cut == -1:
            # no distinct adjacent values: nothing to cut
            return 0.0, 0, 0.0, True
        else:
            # MDLP stop rule (Fayyad & Irani): accept the cut only when the
            # information gain exceeds (log(n-1) + delta) / n
            gain = whole_ent - (w * entropy1 + (1.-w) * entropy2)
            delta = log(pow(3, <double> k) - 2.) - (<double> k * whole_ent -
                                                    <double> k1 * entropy1 - <double> k2 * entropy2)
            stop_cut = gain <= 1. / (<double> n) * (log(<double> n - 1.) + delta)
            return prev_ent, prev_cut, prev_cut_value, stop_cut

    def get_partition_points(self):
        """Recursively split [0, len(data)) with get_cut until the MDLP
        criterion stops; return (sorted cut indices, sorted cut values)."""
        cdef:
            cppvector[size_t] cut_locations
            cppvector[double] cut_points
            # explicit stack replaces recursion over the intervals
            cppstack[cpppair[size_t, size_t]] search_intervals
            long cut
            double entropy, cut_value
            bool stop_cut
        search_intervals.push(cpppair[size_t, size_t](0, self.data.size()))
        while search_intervals.size() > 0:
            low, upp = search_intervals.top()
            search_intervals.pop()
            entropy, cut, cut_value, stop_cut = self.get_cut(low, upp)
            if stop_cut:
                continue
            # accepted cut: recurse into both halves
            search_intervals.push(cpppair[size_t, size_t](low, <size_t> cut))
            search_intervals.push(cpppair[size_t, size_t](<size_t> cut, upp))
            cut_locations.push_back(cut)
            cut_points.push_back(cut_value)
        cppsort(cut_locations.begin(), cut_locations.end())
        cppsort(cut_points.begin(), cut_points.end())
        return cut_locations, cut_points
from math import log
import bisect
from collections import Counter
class DiscretizationMDLP(object):
    """Discretize a continuous feature against class labels with the MDLP
    (Minimum Description Length Principle) stopping criterion of
    Fayyad & Irani.

    Data is stored as (value, label) pairs sorted by value, so every
    candidate partition is a contiguous index range [low, upp).
    """

    def __init__(self, x, y):
        """Pair up x (feature values) with y (labels) and sort by value.

        Raises Exception when the two inputs differ in length.
        """
        if len(x) != len(y):
            raise Exception("The length of x must be equal to the length of y.")
        # sort by feature value so every interval is a contiguous slice
        self.data = sorted(zip(x, y), key=lambda pair: pair[0])
        # sorted feature values kept separately for binary search
        self.keys = [key for key, _ in self.data]

    def _get_entropy(self, y):
        """Return (Shannon entropy of y in nats, number of distinct labels)."""
        counter_y = Counter(y)
        total = len(y)
        ps = [count / total for count in counter_y.values()]
        if len(ps) == 1:
            # single class: entropy is exactly zero (avoid log round-off)
            return 0, len(counter_y)
        return -sum(p * log(p) for p in ps), len(counter_y)

    def get_cut(self, low, upp):
        """Find the entropy-minimizing cut inside data[low:upp].

        Returns (best entropy, absolute cut index, cut value, stop_cut);
        stop_cut is True when no candidate cut exists or the MDLP
        criterion rejects the best one.
        """
        # distinct values present in the interval; midpoints between
        # consecutive distinct values are the candidate cut points
        support = sorted(set(self.keys[low:upp]))
        n = upp - low
        # +inf sentinel so the first candidate always wins
        prev_ent, prev_cut, prev_cut_value = float("inf"), None, None
        entropy1, entropy2, k1, k2, w = None, None, None, None, None
        whole_ent, k = self._get_entropy([y for _, y in self.data[low:upp]])
        for i in range(len(support) - 1):
            curr_cut_value = (support[i] + support[i + 1]) / 2
            # search self.keys in place with lo/hi bounds instead of
            # slicing an O(n) copy for every candidate cut
            curr_cut = bisect.bisect_right(self.keys, curr_cut_value, low, upp) - low
            weight = curr_cut / n
            # entropies of the two partitions induced by the cut
            left_ent, left_y_cnt = self._get_entropy([y for _, y in self.data[low:low + curr_cut]])
            right_ent, right_y_cnt = self._get_entropy([y for _, y in self.data[low + curr_cut:upp]])
            curr_ent = weight * left_ent + (1 - weight) * right_ent
            if curr_ent < prev_ent:
                prev_ent, prev_cut, prev_cut_value = curr_ent, curr_cut, curr_cut_value
                # retained for the stopping criterion below
                entropy1, entropy2, k1, k2, w = left_ent, right_ent, left_y_cnt, right_y_cnt, weight
        if prev_cut is None:
            # no distinct adjacent values: nothing to cut
            return None, None, None, True
        # MDLP stop rule: accept only when the information gain exceeds
        # (log(n - 1) + delta) / n
        gain = whole_ent - (w * entropy1 + (1 - w) * entropy2)
        delta = log(pow(3, k) - 2) - (k * whole_ent - k1 * entropy1 - k2 * entropy2)
        stop_cut = gain <= 1 / n * (log(n - 1) + delta)
        return prev_ent, low + prev_cut, prev_cut_value, stop_cut

    def get_partition_points(self):
        """Recursively split [0, len(data)) until MDLP stops; return
        (sorted cut indices, sorted cut values)."""
        cut_location = []
        cut_points = []
        # explicit stack replaces recursion over the intervals
        search_intervals = [(0, len(self.data))]
        while search_intervals:
            low, upp = search_intervals.pop()
            entropy, cut, cut_value, stop_cut = self.get_cut(low, upp)
            if stop_cut:
                continue
            # accepted cut: recurse into both halves
            search_intervals.append((low, cut))
            search_intervals.append((cut, upp))
            cut_location.append(cut)
            cut_points.append(cut_value)
        return sorted(cut_location), sorted(cut_points)
# Benchmark: time the PyPI mdlp-discretization package, the pure-Python
# class, and the compiled Cython extension on the same data (50 runs each).
import timeit
# Toy data set of 16 observations replicated 1000x; shared by all setups.
data_str = """
x = [139., 139., 139., 139., 1490., 1490., 1490., 32456., 32456.,
33444., 33444., 33444., 35666., 35666., 35666., 35666.] * 1000
y = [1, 1, 1, 1, 2, 2, 2, 3, 4, 3, 3, 4, 3, 3, 4, 4] * 1000
"""
# Setup for the third-party package, which expects numpy arrays
# (x reshaped to a single-feature 2-D array).
module_setup = """
from mdlp.discretization import MDLP
import numpy as np
""" + data_str + """
x = np.array(x).reshape([len(x), -1])
y = np.array(y)
"""
# Setups importing the pure-Python module and the Cython extension.
python_setup = "import DiscretizationMDLP" + data_str
cython_setup = "import discretization" + data_str
print("mdlp-discretization: ", timeit.timeit("mdlpfit = MDLP()\nmdlpfit.fit(x, y)",
                                             setup=module_setup, number=50), "seconds")
print("Python code: ", timeit.timeit('DiscretizationMDLP.DiscretizationMDLP(x, y).get_partition_points()',
                                     setup=python_setup, number=50), "seconds")
print("Cython code: ", timeit.timeit('discretization.DiscretizationMDLP(x, y).get_partition_points()',
                                     setup=cython_setup, number=50), "seconds")
# Timings reported by the gist author:
# mdlp-discretization: 53.94980424300002 seconds
# Python code: 0.9393027749999945 seconds
# Cython code: 0.053218557000001 seconds
# put in discretization/
# Build script for the Cython extension:
#   python setup.py build_ext --inplace
# NOTE: distutils was deprecated by PEP 632 and removed in Python 3.12;
# setuptools provides drop-in setup/Extension replacements and is the
# build backend recommended by the Cython documentation.
from setuptools import setup, Extension
from Cython.Build import cythonize

setup(
    ext_modules=cythonize(
        Extension(
            name="discretization",
            sources=["discretization.pyx"],
            language="c++",  # required for the libcpp containers used in the .pyx
        )
    )
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment