Skip to content

Instantly share code, notes, and snippets.

@SimonErm
Last active March 11, 2022 01:48
Show Gist options
  • Save SimonErm/b06c236cafdeb79fdf7adb90aef04fec to your computer and use it in GitHub Desktop.
Save SimonErm/b06c236cafdeb79fdf7adb90aef04fec to your computer and use it in GitHub Desktop.
A test implementation of MLSMOTE in Python (Charte, F., Rivera Rivas, Antonio, Del Jesus, María José & Herrera, Francisco (2015). MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation. Knowledge-Based Systems. doi:10.1016/j.knosys.2015.07.019). Since I am not a Python programmer, you will find a lot of stackoverf…
import numpy as np
import itertools
import collections
import random
class MLSMOTE:
    """Multi-label SMOTE oversampler (after Charte et al., 2015, MLSMOTE).

    For every label whose imbalance ratio exceeds the mean imbalance ratio,
    one synthetic instance is generated per instance carrying that label.
    Features are interpolated between the instance and a randomly chosen
    near neighbour from the same minority bag; the synthetic label set keeps
    the labels that occur in at least half of the instances considered
    (the seed instance plus its neighbours).

    Only numeric features are supported.  ``y`` is expected to be a sequence
    with one collection of hashable labels (or a single scalar label) per
    sample.
    """

    # Record layout of the array returned by ``calc_distances``.
    _DISTANCE_DTYPE = np.dtype([('distance', float), ('index', int)])

    def __init__(self, k, random_state=None):
        """Create an oversampler.

        Args:
            k: Default number of nearest neighbours used by ``fit_resample``.
            random_state: ``None`` to draw from NumPy's global random state
                (so ``np.random.seed`` keeps working), an ``int`` seed, or an
                existing ``np.random.RandomState`` instance.
        """
        self.k = k
        self.random_state = random_state
        if random_state is None:
            self._rng = np.random
        elif isinstance(random_state, np.random.RandomState):
            self._rng = random_state
        else:
            self._rng = np.random.RandomState(random_state)
        self.full_label_set = []
        self.labels = []
        self.features = []

    @staticmethod
    def _as_label_list(label_set):
        """Return ``label_set`` as a de-duplicated list, keeping first-seen order.

        Arrays, lists, tuples and sets are treated as collections of labels;
        anything else (including a string) is treated as one scalar label.
        """
        if isinstance(label_set, np.ndarray):
            # A 0-d array yields a scalar here and falls through to the
            # single-label case below.
            label_set = label_set.tolist()
        if isinstance(label_set, (list, tuple, set, frozenset)):
            return list(dict.fromkeys(label_set))
        return [label_set]

    @staticmethod
    def _pack_label_sets(label_sets):
        """Pack label lists into an array without tripping over ragged input.

        Equal-length label sets give a regular 2-D array; otherwise a 1-D
        object array holding one list per synthetic sample is returned.
        """
        if len({len(label_set) for label_set in label_sets}) == 1:
            return np.array(label_sets)
        packed = np.empty(len(label_sets), dtype=object)
        # Element-wise assignment stops NumPy from trying to broadcast the
        # inner lists into extra dimensions.
        for position, label_set in enumerate(label_sets):
            packed[position] = label_set
        return packed

    def _count_labels(self):
        """Count, for every label, the number of instances that carry it."""
        counts = collections.Counter()
        for label_set in self.labels:
            counts.update(self._as_label_list(label_set))
        return counts

    def _imbalance_ratios(self):
        """Return the imbalance ratio of each label, aligned with ``full_label_set``."""
        counts = self._count_labels()
        totals = [counts.get(label, 0) for label in self.full_label_set]
        if not totals:
            return []
        largest = max(totals)
        return [largest / total if total else float('inf') for total in totals]

    def fit_resample(self, X, y, k=None):
        """Generate synthetic samples for the minority labels of ``(X, y)``.

        Args:
            X: Array-like of shape (n_samples, n_features) with numeric values.
            y: Sequence of length n_samples; each item is a collection of
                labels (or a single scalar label).
            k: Number of nearest neighbours to use; defaults to ``self.k``.

        Returns:
            Tuple ``(X_synth, y_synth)``.  ``X_synth`` is a float array of
            shape (n_synthetic, n_features).  ``y_synth`` holds one label set
            per synthetic sample (2-D array when all label sets have the same
            length, 1-D object array of lists otherwise).  Only the synthetic
            samples are returned, not the original data.

        Raises:
            ValueError: If ``k`` is smaller than 1, ``X`` is not 2-D, or
                ``X`` and ``y`` differ in length.
        """
        if k is None:
            k = self.k
        if k < 1:
            raise ValueError("k must be a positive integer")

        features = np.asarray(X, dtype=float)
        if features.ndim == 1 and features.size == 0:
            features = features.reshape(0, 0)
        if features.ndim != 2:
            raise ValueError(
                "X must be a 2-D array-like of shape (n_samples, n_features)")
        label_sets = [self._as_label_list(label_set) for label_set in y]
        if len(label_sets) != features.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.features = features
        self.labels = label_sets
        self.full_label_set = np.unique(
            np.array(list(itertools.chain.from_iterable(label_sets))))

        X_synth = []
        y_synth = []
        # Ratios are computed once; recomputing them per label would rescan
        # the whole data set for every pair of labels.
        ratios = self._imbalance_ratios()
        mean_ir = sum(ratios) / len(ratios) if ratios else float('nan')

        for label, ratio in zip(self.full_label_set, ratios):
            if not ratio > mean_ir:
                continue
            min_bag = self.get_all_instances_of_label(label)
            for sample in min_bag:
                distances = np.sort(
                    self.calc_distances(sample, min_bag), order='distance')
                # The sample is always its own closest match; using it as a
                # neighbour would only produce exact duplicates.
                candidates = distances[distances['index'] != sample][:k]
                if candidates.size == 0:
                    continue
                neighbour_ids = candidates['index'].tolist()
                ref_neigh_id = neighbour_ids[
                    self._rng.randint(len(neighbour_ids))]
                X_new, y_new = self.create_new_sample(
                    int(sample), ref_neigh_id, neighbour_ids)
                X_synth.append(X_new)
                y_synth.append(y_new)

        if not X_synth:
            return (np.empty((0, features.shape[1])),
                    self._pack_label_sets(y_synth))
        return np.array(X_synth), self._pack_label_sets(y_synth)

    def create_new_sample(self, sample_id, ref_neigh_id, neighbour_ids):
        """Build one synthetic instance from a seed sample and its neighbours.

        Args:
            sample_id: Index of the seed instance in ``self.features``.
            ref_neigh_id: Index of the neighbour to interpolate towards.
            neighbour_ids: Indices of all neighbours used for label ranking.

        Returns:
            Tuple ``(features, labels)``: a 1-D float array and a list of the
            labels carried by at least half of the considered instances
            (seed plus neighbours), in first-seen order.
        """
        sample = np.asarray(self.features[sample_id], dtype=float)
        ref_neigh = np.asarray(self.features[ref_neigh_id], dtype=float)
        rng = getattr(self, '_rng', np.random)

        # TODO: only numeric features are handled; nominal support is missing.
        # An independent interpolation factor is drawn for every feature.
        gaps = rng.uniform(0.0, 1.0, size=sample.shape)
        synth_sample = sample + (ref_neigh - sample) * gaps

        considered = [sample_id, *neighbour_ids]
        counts = collections.Counter()
        for instance_id in considered:
            counts.update(self._as_label_list(self.labels[instance_id]))
        threshold = len(considered) / 2
        labels = [label for label, count in counts.items()
                  if count >= threshold]
        return synth_sample, labels

    def calc_distances(self, sample, min_bag):
        """Return Euclidean distances from ``sample`` to every bag member.

        Args:
            sample: Index of the reference instance.
            min_bag: Indices of the instances to measure against.

        Returns:
            Structured array with fields ``distance`` (float) and ``index``
            (int), one record per entry of ``min_bag``, in the same order.
        """
        features = np.asarray(self.features, dtype=float)
        bag = np.asarray(min_bag, dtype=int)
        distances = np.empty(bag.shape[0], dtype=self._DISTANCE_DTYPE)
        distances['index'] = bag
        # TODO: only numeric features are handled; nominal support is missing.
        distances['distance'] = np.linalg.norm(
            features[bag] - features[sample], axis=1)
        return distances

    def get_all_instances_of_label(self, label):
        """Return the indices of all instances whose label set contains ``label``."""
        return np.array(
            [index for index, label_set in enumerate(self.labels)
             if label in self._as_label_list(label_set)],
            dtype=int)

    def get_mean_imbalance_ratio(self):
        """Return the mean imbalance ratio over ``full_label_set``.

        Returns ``nan`` when no labels are known.
        """
        ratios = self._imbalance_ratios()
        if not ratios:
            return float('nan')
        return sum(ratios) / len(ratios)

    def get_imbalance_ratio_per_label(self, l):
        """Return (count of the most frequent label) / (count of ``l``).

        Returns ``inf`` when ``l`` does not occur in ``self.labels``.
        """
        counts = self._count_labels()
        largest = max(
            (counts.get(label, 0) for label in self.full_label_set), default=0)
        own = counts.get(l, 0)
        return largest / own if own else float('inf')

    def sum_h(self, l):
        """Return the number of instances whose label set contains ``l``."""
        return sum(
            1 for label_set in self.labels
            if l in self._as_label_list(label_set))

    def get_value_counts(self, labels):
        """Return the occurrence count of each distinct value in ``labels``.

        Counts are ordered like the sorted unique values and stay integers
        even when the labels are strings.
        """
        _, counts = np.unique(labels, return_counts=True)
        return counts
@rajae-Bens
Copy link

Hi,

Thank u for implementing this method in python but can u provide plz an example with real datasets such as toxic comments from kaggle?

Thank u

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment