Skip to content

Instantly share code, notes, and snippets.

@eeddaann
Last active August 2, 2018 20:20
Show Gist options
  • Save eeddaann/b006d82120da4430ae974c519f48d56e to your computer and use it in GitHub Desktop.
Save eeddaann/b006d82120da4430ae974c519f48d56e to your computer and use it in GitHub Desktop.

hellinger-distance-criterion

virtualenv

Assumes that Python 3.5 is installed on the machine (at /usr/bin/python3.5).

# Fetch the criterion sources and work from the repository root.
git clone https://github.com/EvgeniDubov/hellinger-distance-criterion.git
cd hellinger-distance-criterion/
# Create and activate an isolated Python 3.5 environment. Isolation from the
# system site-packages is virtualenv's default, and the old
# --no-site-packages flag is rejected by virtualenv >= 20, so it is omitted.
virtualenv dev -p /usr/bin/python3.5
source ./dev/bin/activate
pip install jupyter Cython scikit-learn numpy pandas
# The Cython build needs sklearn's _criterion.pxd next to the installed
# package. Download it from the tag of the scikit-learn release that pip just
# installed (not from master) so the declarations match the compiled library.
SKLEARN_VERSION=$(python -c "import sklearn; print(sklearn.__version__)")
wget "https://raw.githubusercontent.com/scikit-learn/scikit-learn/${SKLEARN_VERSION}/sklearn/tree/_criterion.pxd" -P ./dev/lib/python3.5/site-packages/sklearn/tree/
# NOTE(review): this copies the sources into ./dev, but the build below still
# runs in the repository root - confirm whether a `cd dev` was intended.
cp -r ./{setup.py,hellinger_distance_criterion.pyx,example} ./dev
# Compile the Cython extension in place so it can be imported from this directory.
python setup.py build_ext --inplace

Docker

Dockerfile:

# Image that builds the Hellinger distance criterion extension and runs example.py.
FROM python:3.5
RUN git clone https://github.com/EvgeniDubov/hellinger-distance-criterion.git
RUN pip install jupyter Cython scikit-learn numpy pandas scipy
# The Cython build needs sklearn's _criterion.pxd next to the installed package.
# Fetch it from the tag of the installed scikit-learn release (not from master)
# so the declarations match the compiled library.
RUN wget "https://raw.githubusercontent.com/scikit-learn/scikit-learn/$(python -c 'import sklearn; print(sklearn.__version__)')/sklearn/tree/_criterion.pxd" -P /usr/local/lib/python3.5/site-packages/sklearn/tree/
WORKDIR /hellinger-distance-criterion
RUN python3 setup.py build_ext --inplace
# Trailing slash makes the destination unambiguously a directory.
COPY ./example.py /hellinger-distance-criterion/
# Exec form: python runs as PID 1 and receives stop signals directly.
CMD ["python", "example.py"]

example.py

import numpy as np
import pandas as pd
from hellinger_distance_criterion import HellingerDistanceCriterion
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

def compare_rf(X_train, y_train, X_test, y_test):
    """Print test scores of random forests built with three split criteria.

    A forest (``max_depth=4``, ``n_estimators=100``) is fitted on the training
    split once per criterion -- 'gini', 'entropy' and a
    ``HellingerDistanceCriterion`` instance, in that order -- and the value of
    its ``score`` on the test split is printed after each fit.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        None. Results are written to stdout only.
    """
    def fit_and_report(label, criterion):
        # One fit/score/print cycle; only the criterion differs between runs.
        forest = RandomForestClassifier(criterion=criterion, max_depth=4, n_estimators=100)
        forest.fit(X_train, y_train)
        print(label, forest.score(X_test, y_test))

    fit_and_report('gini score: ', 'gini')
    fit_and_report('entropy score: ', 'entropy')

    # Presumably (n_outputs=1, n_classes=[2]), i.e. a single binary target --
    # TODO confirm against the HellingerDistanceCriterion constructor.
    hellinger = HellingerDistanceCriterion(1, np.array([2], dtype='int64'))
    fit_and_report('hellinger distance score: ', hellinger)

def compare_dt(X_train, y_train, X_test, y_test):
    """Print test scores of single decision trees built with three split criteria.

    A tree with ``max_depth=4`` is fitted on the training split once per
    criterion -- 'gini', 'entropy' and a ``HellingerDistanceCriterion``
    instance, in that order -- and the value of its ``score`` on the test
    split is printed after each fit.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        None. Results are written to stdout only.
    """
    def evaluate(criterion):
        # Fit a fresh depth-limited tree and return its score on the test split.
        tree = DecisionTreeClassifier(criterion=criterion, max_depth=4)
        tree.fit(X_train, y_train)
        return tree.score(X_test, y_test)

    gini_score = evaluate('gini')
    print('gini score: ', gini_score)

    entropy_score = evaluate('entropy')
    print('entropy score: ', entropy_score)

    # Presumably (n_outputs=1, n_classes=[2]), i.e. a single binary target --
    # TODO confirm against the HellingerDistanceCriterion constructor.
    hellinger = HellingerDistanceCriterion(1, np.array([2], dtype='int64'))
    hellinger_score = evaluate(hellinger)
    print('hellinger distance score: ', hellinger_score)

# Experiment 1: scikit-learn's bundled breast cancer dataset, with 30% of the
# samples held out for testing. The split is not seeded, so the printed scores
# vary from run to run.
bc = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(bc.data, bc.target, test_size=0.3)
compare_rf(X_train, y_train, X_test, y_test)
compare_dt(X_train, y_train, X_test, y_test)

# Experiment 2: a synthetic two-class problem generated with class weights
# 0.05 / 0.95 (heavily imbalanced), 40 features of which 5 are informative.
# Data generation is seeded (random_state=1); the 60/40 train/test split below
# is not, so scores still vary between runs.
X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05,0.95], random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
compare_rf(X_train, y_train, X_test, y_test)
compare_dt(X_train, y_train, X_test, y_test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment