Commit 9bbfb766 by Stalin Munoz

undersampling added

parent 678bfe85
......@@ -5,6 +5,8 @@ from matplotlib.pyplot \
import figure,plot,title,ylabel,xlabel,legend,savefig,ioff
from numpy import expand_dims as dims
from numpy import unique
from random import shuffle
from functools import reduce
from prec_recall import create_pr, avg_pr
from sklearn.metrics import confusion_matrix
......@@ -13,6 +15,76 @@ from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE,ADASYN
from tensorflow.keras.utils import to_categorical
def undesample(X,y):
    """
    Balance the data by discarding samples of the more represented classes.

    Every class is cut down to the size of the least represented class.
    The per-class index lists are shuffled before truncation, and the
    selected indices are shuffled again before indexing, so both the
    samples kept and their final order are random.

    Parameters
    ----------
    X : numpy array
        Input data, indexed by sample along its first axis.
    y : numpy array
        class membership. Labels are used as dictionary keys, so they
        must be hashable (presumably a 1-D array of labels -- confirm
        against callers).

    Returns
    -------
    numpy array
        Balanced input.
    numpy array
        Balanced output.

    Raises
    ------
    ValueError
        If ``y`` is empty, since there is no smallest class to balance to.
    """
    locs = find_classes(y)
    shuffle_members(locs)
    # maximum number of samples to keep cardinality of classes balanced
    n = min(len(members) for members in locs.values())
    print('Undersampling to %d samples'%n)
    # Flatten the first n indices of every class in one pass; repeated
    # list concatenation would copy the growing list once per class.
    indices = [i for members in locs.values() for i in members[:n]]
    shuffle(indices)
    return X[indices],y[indices]
def shuffle_members(cm):
    """
    Shuffle the members of each class in place and print each class size.

    Parameters
    ----------
    cm : dictionary
        keys are classes, values are lists of indices of their members.
        Each list is reordered in place; the dictionary itself is not
        otherwise modified.

    Returns
    -------
    None
    """
    for c, members in cm.items():
        shuffle(members)
        # The label is formatted with %s so that non-integer class labels
        # (e.g. strings) are reported instead of raising TypeError.
        print("cardinality of class %s is %d"%(c,len(members)))
def find_classes(x):
    """
    Find the indices of members of each class

    Parameters
    ----------
    x : iterable
        heterogeneous memberships. Each element is used as a dictionary
        key, so it must be hashable.

    Returns
    -------
    locs : dictionary
        maps each class to the list of positions (in increasing order)
        at which it occurs in ``x``. Classes appear in order of first
        occurrence.
    """
    locs = {}
    for i,c in enumerate(x):
        # setdefault creates the list on first sight of a class, avoiding
        # a separate membership test followed by a second lookup.
        locs.setdefault(c, []).append(i)
    return locs
def encode(output,N=4):
"""
One hot encoding of the input
......@@ -66,6 +138,9 @@ def balance(X,y,method):
print('CLASS WEIGHTS')
weights = class_weight.compute_class_weight('balanced',unique(y),y)
return (X,y),weights
elif method == 'undersampling':
print('UNDERSAMPLING')
return undesample(X,y),None
class CNN_Antifrag:
"""
......
......@@ -14,11 +14,11 @@ o belongs to {0,1,2} and represents the balancing method
"""
# Experiment grid. Where a name is assigned twice below, the later
# assignment overrides the earlier one, so only the second value is in
# effect when the product is built.
c_ = [0,1,2,3]
b_ = [1]
e_ = [0]
o_ = [0,1,2]
b_ = [0,1]
e_ = [0,1]
# o_ holds the balancing-method codes; presumably 3 selects
# 'undersampling' -- confirm against choose_balancing_method.
o_ = [0,1,2,3]
n_experiences = 100
n_experiences = 10000
# Cartesian product: one (c, b, e, o) tuple per combination of the lists above.
combinations = itertools.product(c_,b_,e_,o_)
......@@ -54,5 +54,4 @@ plot_confusion_matrix(cm, labels)
# Write the accuracy and AUC summary lines to disk. The context manager
# closes the file even if one of the writes raises, so no explicit
# close() call is needed.
with open("out/acc_auc.txt", 'w+') as f:
    f.write("Average accuracy: " + str(acc)+"\n")
    f.write("Average area under the curve: " + str(auc))
\ No newline at end of file
......@@ -80,4 +80,6 @@ def choose_balancing_method(o):
elif o == 1:
return 'adasyn'
elif o == 2:
return 'class_weight'
\ No newline at end of file
return 'class_weight'
elif o == 3:
return 'undersampling'
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment