Skip to content

Instantly share code, notes, and snippets.

@zarch
Created October 9, 2014 04:54
Show Gist options
  • Save zarch/abb3993c55fb61b834d1 to your computer and use it in GitHub Desktop.
Save zarch/abb3993c55fb61b834d1 to your computer and use it in GitHub Desktop.
Sequential forward floating feature selection with Jeffries-Matusita Distance
# -*- coding: utf-8 -*-
"""
Sequential forward floating feature selection with Jeffries-Matusita Distance
==============================================================================
Reference: Pudil, P.; Novovicová, J. & Kittler, J.
Floating search methods in feature selection Pattern recognition letters,
Elsevier, 1994, 15, 1119-1125
Authors: Michele Dalponte & Hans Ole Ørka & Pietro Zambelli
Date: Oct, 2014
Usage
------
Suppose to have a CSV file that is spaced separated, then you can select the features with:
$ python2 fs_pudil.py -m min data.csv
"""
from __future__ import print_function
from itertools import combinations
from math import log, exp, sqrt
import numpy as np
from numpy.linalg import inv, det
VERBOSE = True
def read(finput, delimiter=None, skip_header=1):
"""Read a file of input and return classes and data"""
sdata = np.genfromtxt(finput, dtype=str, delimiter=delimiter,
skip_header=skip_header)
return sdata.T[-1], sdata[:, :-1].astype(float)
def fgroup(classes, data, func, labels=None):
labels = labels if labels else sorted(set(classes))
res = {}
for label in labels:
res[label] = func(data[classes == label])
return res
def mean(data):
return np.mean(data, axis=0)
def cov(data):
return np.cov(data.T)
def v2m(arr, id0, id1):
"""Extract values from a covariance matrix. ::
>>> arr = np.arange(1, 10).reshape((3, 3))
>>> arr
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
>>> v2m(arr, (0, 1), (0, 1))
array([[1, 2],
[4, 5]])
>>> v2m(arr, (0, 2), (0, 2))
array([[1, 3],
[7, 9]])
"""
x, y = np.array([(i0, i1) for i0 in id0 for i1 in id1]).T
return arr[x, y].reshape((len(id0), len(id1)))
def verbose(numb_of_features, dist, fs):
if VERBOSE:
print("Feature to select: %d" % (numb_of_features))
print("Maximum minimum distance: %.4f" % dist)
print("Features selected: %r" % fs)
print()
def bhat1(f_id, mua, cva, mub, cvb):
mu_a, cv_a = mua[f_id], cva[f_id, f_id]
mu_b, cv_b = mub[f_id], cvb[f_id, f_id]
mu_diff = mu_a - mu_b
cv_sum = cv_a + cv_b
return (1. / 8. * mu_diff * 1./(cv_sum * 0.5) * mu_diff +
0.5 * log((cv_sum * 0.5)/sqrt(cv_a * cv_b)))
def bhatN(f_id, mua, cva, mub, cvb):
mu_a, cv_a = mua[f_id], v2m(cva, f_id, f_id)
mu_b, cv_b = mub[f_id], v2m(cvb, f_id, f_id)
mu_diff = mu_a - mu_b
cv_sum = cv_a + cv_b
return (np.dot(np.dot(mu_diff, inv(cv_sum * 0.5)), mu_diff)/8. +
0.5 * np.log(det(cv_sum * 0.5) / np.sqrt(det(cv_a) * det(cv_b))))
def jm(f_id, mu, cv, strategy, comb=None, bhat=bhatN):
comb = comb if comb else combinations(sorted(mu.keys()), 2)
return strategy([sqrt(2.*(1-exp(-bhat(f_id, mu[a], cv[a], mu[b], cv[b]))))
for a, b in comb])
def backword(fs, mu, cv, strategy, comb=None):
comb = comb if comb else combinations(sorted(mu.keys()), 2)
change = list(combinations(fs, len(fs) - 1))[::-1]
distance = np.array([jm(np.array(f_id), mu, cv, strategy, comb, bhatN)
for f_id in change])
idistmax = distance.argmax()
return -1 if (idistmax == len(fs) - 1) else idistmax
def seq_forward_floating_fs(classes, data, strategy=np.mean, precision=6):
"""Sequential Forward Floating Feature Selection with
Jeffries-Matusita Distance.
"""
labels = sorted(set(classes))
nrows, ncols = data.shape
mu = fgroup(classes, data, mean)
cv = fgroup(classes, data, cov)
##########################################################################
# STRART
##########################################################################
# computing JM for single features
classes_comb = list(combinations(sorted(mu.keys()), 2))
print('Compute single features distances')
dist = np.array([jm(f_id, mu, cv, strategy, classes_comb, bhat1)
for f_id in range(ncols)])
idistmax = dist.argmax()
res = {1: dict(features=idistmax, distance=dist[idistmax])}
verbose(1, dist[idistmax], [idistmax, ])
# computing JM for two features
features_comb = np.array(list(combinations(range(ncols), 2)))
print('Compute couple features distances')
dist = np.array([jm(f_id, mu, cv, strategy, classes_comb, bhatN)
for f_id in features_comb])
idistmax = dist.argmax()
fs = features_comb[idistmax]
res[2] = dict(features=fs, distance=dist[idistmax])
verbose(2, dist[idistmax], fs)
# computing JM for N features
i = 3
nfeat = ncols - i
check = round(sqrt(2), precision)
while (i < nfeat):
fslist = list(fs)
features_comb = np.array([fslist + [j, ]
for j in range(ncols) if j not in fs])
dist = np.array([jm(f_id, mu, cv, strategy, classes_comb, bhatN)
for f_id in features_comb])
if dist.max() is np.NaN:
return res
idistmax = dist.argmax()
fs = features_comb[idistmax]
verbose(i, dist[idistmax], fs)
bw = backword(fs, mu, cv, strategy, classes_comb)
if bw != -1:
fs = np.array([f for e, f in enumerate(fs) if e != bw])
res[i-1] = dict(features=fs,
distance=jm(fs, mu, cv, strategy, classes_comb))
else:
i += 1
res[i] = dict(features=fs, distance=dist[idistmax])
if check == round(dist[idistmax], precision):
return res
return res
if __name__ == "__main__":
import argparse
# Define the parser options
parser = argparse.ArgumentParser(description='Sequential Forward Floating'
'Feature Selection with '
'Jeffries-Matusita Distance')
parser.add_argument('data', type=argparse.FileType('r'),
help='CSV file with features data')
parser.add_argument('-m', '--method', choices=['mean', 'min', 'median'],
default='mean', dest='method',
help='Feature selection method')
parser.add_argument('-d', '--delimiter', type=str, default=' ',
dest='delimiter', help='CSV delimiter')
parser.add_argument('-s', '--skip-header', type=int, default=1,
dest='skip_header', help='Skip header rows')
args = parser.parse_args()
classes, data = read(args.data, args.delimiter, args.skip_header)
seq_forward_floating_fs(classes, data, strategy=getattr(np, args.method))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment