A test implementation of MLSMOTE in Python (Charte, F., Rivera Rivas, A. J., Del Jesus, M. J., & Herrera, F. (2015). MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation. Knowledge-Based Systems. 10.1016/j.knosys.2015.07.019). Since I am no Python programmer, you will find a lot of Stack Overflow…
import numpy as np
import collections
import random
class MLSMOTE:
    """Multilabel SMOTE (Charte et al., 2015): generates synthetic samples
    for labels whose imbalance ratio is worse than the dataset's mean."""
    def __init__(self, k):
        self.k = k
        self.full_label_set = []
        self.labels = []
        self.features = []
    def fit_resample(self, X, y, k=None):
        # k defaults to the value from __init__; passing it here overrides
        # self.k so that neighbour search and label ranking use the same k
        if k is not None:
            self.k = k
        self.full_label_set = np.unique(np.array([a for x in y for a in (x if isinstance(x, list) else [x])]))
        self.labels = np.array([np.array(xi) for xi in y], dtype=object)  # label sets may differ in length
        self.features = X
        X_synth = []
        y_synth = []
        mean_ir = self.get_mean_imbalance_ratio()
        for label in self.full_label_set:
            irlbl = self.get_imbalance_ratio_per_label(label)
            if irlbl > mean_ir:
                # the label is rarer than average: oversample its instances
                min_bag = self.get_all_instances_of_label(label)
                for sample in min_bag:
                    distances = self.calc_distances(sample, min_bag)
                    distances = np.sort(distances, order='distance')
                    # skip index 0, which is the sample itself at distance 0
                    neighbours = distances[1:self.k + 1]
                    if len(neighbours) == 0:
                        continue  # a single-instance bag has no neighbours
                    ref_neigh = np.random.choice(neighbours, 1)[0]
                    X_new, y_new = self.create_new_sample(sample, ref_neigh[1], [x[1] for x in neighbours])
                    X_synth.append(X_new)
                    y_synth.append(y_new)
        # note: only the synthetic samples are returned, not X/y plus synthetics
        return np.array(X_synth), np.array(y_synth)
    def create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids):
        sample = self.features[sample_id]
        sample_labels = self.labels[sample_id]
        synth_sample = np.zeros(sample.shape[0])
        ref_neigh = self.features[ref_neigh_id]
        neighbours_labels = []
        for ni in neighbour_ids:
            neighbours_labels.append(self.labels[ni].tolist())
        for i in range(synth_sample.shape[0]):
            # features are assumed numeric; todo: implement nominal support
            diff = ref_neigh[i] - sample[i]
            offset = diff * random.uniform(0, 1)
            synth_sample[i] = sample[i] + offset
        # label ranking: keep each label that appears in at least half of the
        # instances considered (the seed sample plus its neighbours)
        labels = sample_labels.tolist()
        labels += [a for x in neighbours_labels for a in (x if isinstance(x, list) else [x])]
        occurrences = collections.Counter(labels)
        head_index = int((self.k + 1) / 2)
        y = [label for label, count in occurrences.items() if count >= head_index]
        X = synth_sample
        return X, y
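    # Ranking example (illustrative): with k = 5, head_index = int(6 / 2) = 3,
    # so a label survives into the synthetic label set only if it appears in
    # at least 3 of the 6 instances considered (seed sample + 5 neighbours).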
    def calc_distances(self, sample, min_bag):
        distances = []
        for bag_sample in min_bag:
            # if f is numeric; todo: implement nominal support
            distances.append((np.linalg.norm(self.features[sample] - self.features[bag_sample]), bag_sample))
        dtype = np.dtype([('distance', float), ('index', int)])
        return np.array(distances, dtype=dtype)
    def get_all_instances_of_label(self, label):
        instance_ids = []
        for i, label_set in enumerate(self.labels):
            if label in label_set:
                instance_ids.append(i)
        return np.array(instance_ids)
    def get_mean_imbalance_ratio(self):
        # MeanIR: the mean of IRLbl over all labels in the dataset
        ratio_sum = np.sum(np.array(list(map(self.get_imbalance_ratio_per_label, self.full_label_set))))
        return ratio_sum / self.full_label_set.shape[0]
    def get_imbalance_ratio_per_label(self, l):
        # IRLbl(l): count of the most frequent label divided by the count of
        # label l; equals 1.0 for the majority label and grows for rarer ones
        sum_array = np.array(list(map(self.sum_h, self.full_label_set)))
        return sum_array.max() / self.sum_h(l)
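    # Worked example (illustrative): for the label sets
    # [0], [0, 1], [0, 1], [0, 2] the counts are 0 -> 4, 1 -> 2, 2 -> 1, so
    # IRLbl(0) = 4/4 = 1.0, IRLbl(1) = 4/2 = 2.0, IRLbl(2) = 4/1 = 4.0 and
    # MeanIR = (1.0 + 2.0 + 4.0) / 3 ~= 2.33; only label 2 exceeds MeanIR,
    # so only its instances would be oversampled.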
    def sum_h(self, l):
        # number of instances whose label set contains label l
        h_sum = 0
        for label_set in self.labels:
            if l in label_set:
                h_sum += 1
        return h_sum
    def get_value_counts(self, labels):
        # helper that is currently unused: occurrence counts per unique label
        count_map = np.array(np.unique(labels, return_counts=True)).T
        counts = np.array([x[1] for x in count_map])
        return counts
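For reference, a minimal usage sketch with made-up data (the feature matrix, label lists, and k below are illustrative assumptions, not part of the original gist; label 2 is deliberately rare so that MLSMOTE oversamples it):

import numpy as np

X = np.random.rand(20, 4)                 # 20 instances, 4 numeric features
y = [[0], [0, 1]] * 9 + [[0, 2], [0, 2]]  # label 2 occurs in only 2 instances
mlsmote = MLSMOTE(k=3)
X_synth, y_synth = mlsmote.fit_resample(X, y)
X_resampled = np.vstack([X, X_synth])     # callers stack synthetics themselves
y_resampled = list(y) + [list(labels) for labels in y_synth]
print(len(X_synth), 'synthetic samples generated')

Note that fit_resample returns only the synthetic samples, so they have to be appended to the original data by the caller, unlike scikit-learn-style resamplers that return the full resampled set.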
Hi,
Thank you for implementing this method in Python, but could you please provide an example with a real dataset, such as the toxic comments dataset from Kaggle?
Thank you