Commit 9bbfb766 by Stalin Munoz

undersampling added

parent 678bfe85
...@@ -5,6 +5,8 @@ from matplotlib.pyplot \ ...@@ -5,6 +5,8 @@ from matplotlib.pyplot \
import figure,plot,title,ylabel,xlabel,legend,savefig,ioff import figure,plot,title,ylabel,xlabel,legend,savefig,ioff
from numpy import expand_dims as dims from numpy import expand_dims as dims
from numpy import unique from numpy import unique
from random import shuffle
from functools import reduce
from prec_recall import create_pr, avg_pr from prec_recall import create_pr, avg_pr
from sklearn.metrics import confusion_matrix from sklearn.metrics import confusion_matrix
...@@ -13,6 +15,76 @@ from sklearn.utils import class_weight ...@@ -13,6 +15,76 @@ from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE,ADASYN from imblearn.over_sampling import SMOTE,ADASYN
from tensorflow.keras.utils import to_categorical from tensorflow.keras.utils import to_categorical
def undesample(X,y):
    """
    Balance the data by randomly discarding samples from the larger classes.

    Every class is cut down to the size of the least represented class.
    Which samples are kept, and the order of the returned samples, are
    random (driven by the ``random`` module's global state).

    Parameters
    ----------
    X : numpy array
        Input data, selected by sample index.
    y : numpy array
        Class membership of each sample.

    Returns
    -------
    numpy array
        Balanced input.
    numpy array
        Balanced output.

    Raises
    ------
    ValueError
        If y contains no samples.
    """
    locs = find_classes(y)
    if not locs:
        # without this guard min() below fails with an opaque message
        raise ValueError('cannot undersample: y contains no samples')
    # shuffle inside each class so the kept prefix is a random subset
    shuffle_members(locs)
    # size of the least represented class: the most samples every class
    # can keep while all cardinalities stay equal
    n = min(len(members) for members in locs.values())
    print('Undersampling to %d samples'%n)
    indices = []
    for members in locs.values():
        indices.extend(members[:n])
    # mix the classes so the result is not grouped by label
    shuffle(indices)
    return X[indices],y[indices]
def shuffle_members(cm):
    """
    Randomly reorder, in place, the member list of every class.

    The cardinality of each class is printed as it is processed.

    Parameters
    ----------
    cm : dictionary
        keys are classes, values are lists of indices of their members.

    Returns
    -------
    None
    """
    for label, members in cm.items():
        # members is the very list stored in cm, so this mutates cm
        shuffle(members)
        size = len(members)
        print("cardinality of class %d is %d"%(label,size))
def find_classes(x):
    """
    Find the indices of the members of each class.

    Parameters
    ----------
    x : iterable
        Class label of each sample; labels are used as dictionary keys
        and therefore must be hashable.

    Returns
    -------
    locs : dictionary
        Maps each class label to the list of positions in x where it
        occurs, in order of appearance.
    """
    locs = {}
    for i,c in enumerate(x):
        # setdefault creates the list the first time a label is seen
        locs.setdefault(c,[]).append(i)
    return locs
def encode(output,N=4): def encode(output,N=4):
""" """
One hot encoding of the input One hot encoding of the input
...@@ -66,6 +138,9 @@ def balance(X,y,method): ...@@ -66,6 +138,9 @@ def balance(X,y,method):
print('CLASS WEIGHTS') print('CLASS WEIGHTS')
weights = class_weight.compute_class_weight('balanced',unique(y),y) weights = class_weight.compute_class_weight('balanced',unique(y),y)
return (X,y),weights return (X,y),weights
elif method == 'undersampling':
print('UNDERSAMPLING')
return undesample(X,y),None
class CNN_Antifrag: class CNN_Antifrag:
""" """
......
...@@ -14,11 +14,11 @@ o belongs to {0,1,2} and represents the balancing method ...@@ -14,11 +14,11 @@ o belongs to {0,1,2} and represents the balancing method
""" """
c_ = [0,1,2,3] c_ = [0,1,2,3]
b_ = [1] b_ = [0,1]
e_ = [0] e_ = [0,1]
o_ = [0,1,2] o_ = [0,1,2,3]
n_experiences = 100 n_experiences = 10000
combinations = itertools.product(c_,b_,e_,o_) combinations = itertools.product(c_,b_,e_,o_)
...@@ -55,4 +55,3 @@ f = open("out/acc_auc.txt", 'w+') ...@@ -55,4 +55,3 @@ f = open("out/acc_auc.txt", 'w+')
f.write("Average accuracy: " + str(acc)+"\n") f.write("Average accuracy: " + str(acc)+"\n")
f.write("Average area under the curve: " + str(auc)) f.write("Average area under the curve: " + str(auc))
f.close() f.close()
\ No newline at end of file
...@@ -81,3 +81,5 @@ def choose_balancing_method(o): ...@@ -81,3 +81,5 @@ def choose_balancing_method(o):
return 'adasyn' return 'adasyn'
elif o == 2: elif o == 2:
return 'class_weight' return 'class_weight'
elif o == 3:
return 'undersampling'
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment