Commit b472c653 by Pamela Osuna

only one layer possibility for the moment

# cnn.py
import models as m
import roc_auc as ra
import numpy as np
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from tensorflow.keras.utils import to_categorical

## global variables
N_SPLITS = 5   # number of folds for the k-fold cross-validation
N_CLASSES = 4
def run_nn(input_, output_, n_experiences, params):
    """
    k-fold cross-validation: X stands for the input and y for the output.
    """
    c, b, e = params
    # define the splitting method: n_splits, shuffle on/off, random_state
    kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=1)
    X_train_kfold = []
    X_test_kfold = []
    y_train_kfold = []
    y_test_kfold = []
    # split the input data into k sets
    for train_index, test_index in kfold.split(input_):
        X_train_kfold.append(input_[train_index])
        X_test_kfold.append(input_[test_index])
        y_train_kfold.append(output_[train_index])
        y_test_kfold.append(output_[test_index])
    # balance the classes in each training fold with SMOTE oversampling
    sm = SMOTE(random_state=2)
    for i in range(len(X_train_kfold)):
        # fit_resample replaces the deprecated fit_sample
        X_train_kfold[i], y_train_kfold[i] = sm.fit_resample(X_train_kfold[i], y_train_kfold[i].ravel())
    # print(len(X_train_kfold[0])/(len(X_train_kfold[0])+len(X_test_kfold[0])))  # gives 0.8, OK
    # build N_SPLITS-1 sub-sub-sets out of each of the k subsets
    # (the validation slice rotates through the training fold)
    X_validation = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    X_train = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    y_validation = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    y_train = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    for i in range(N_SPLITS):
        # each fold may have a different size after SMOTE, so compute the slice length per fold
        len_validation = len(X_train_kfold[i]) // (N_SPLITS - 1)
        idx = 0
        for j in range(N_SPLITS-1):
            X_validation[i][j] = X_train_kfold[i][idx:idx+len_validation]
            X_train[i][j] = list(X_train_kfold[i][0:idx]) + list(X_train_kfold[i][idx+len_validation:])
            y_validation[i][j] = y_train_kfold[i][idx:idx+len_validation]
            y_train[i][j] = list(y_train_kfold[i][0:idx]) + list(y_train_kfold[i][idx+len_validation:])
            idx += len_validation
    # print(len(X_validation[0][0]), len(X_train[0][0]))  # X_validation[0][0] should be 1/3 of X_train[0][0]'s length
    validation_Y_one_hot = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    train_Y_one_hot = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    for i in range(N_SPLITS):
        for j in range(N_SPLITS-1):
            # change the labels from categorical to one-hot encoding
            train_Y_one_hot[i][j] = to_categorical(y_train[i][j], num_classes=N_CLASSES)
            validation_Y_one_hot[i][j] = to_categorical(y_validation[i][j], num_classes=N_CLASSES)
            # convert the input to np.array
            X_train[i][j] = np.array(X_train[i][j])
            X_validation[i][j] = np.array(X_validation[i][j])
            # reshape each sample of the train and validation sets into a 30x1 matrix
            X_train[i][j] = X_train[i][j].reshape(-1, 30, 1)
            X_validation[i][j] = X_validation[i][j].reshape(-1, 30, 1)
            # convert the data from an int format to float32
            X_train[i][j] = X_train[i][j].astype('float32')
            X_validation[i][j] = X_validation[i][j].astype('float32')
    # i: index of the test set (i belongs to {0, ..., k-1})
    # j: index of the validation set (j belongs to {0, ..., k-2})
    total_acc = 0
    total_auc = 0
    bs, ep = m.choose_batch_epochs(b, e)
    for i in range(N_SPLITS):
        for j in range(N_SPLITS-1):
            # define and compile a fresh keras model for every split, so that
            # weights trained on one split do not leak into the next one
            model = m.model_architecture(c)
            model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
            # train the model
            model.fit(X_train[i][j], train_Y_one_hot[i][j], batch_size=bs, epochs=ep, verbose=1,
                      validation_data=(X_validation[i][j], validation_Y_one_hot[i][j]))
            # calculate accuracy on the validation set
            _, accuracy = model.evaluate(X_validation[i][j], validation_Y_one_hot[i][j], verbose=0)
            total_acc += accuracy
            print("t_set = " + str(i) + " v_set = " + str(j))
            print('Validation accuracy:', accuracy)
            # calculate the area under the ROC curve
            y_pred = model.predict(X_validation[i][j], batch_size=bs)
            fpr, tpr, auc = ra.roc_auc(N_CLASSES, validation_Y_one_hot[i][j], y_pred)
            total_auc += auc
            print("Area under the curve:", auc)
    total_acc = total_acc / (N_SPLITS * (N_SPLITS - 1))
    total_auc = total_auc / (N_SPLITS * (N_SPLITS - 1))
    print("Average accuracy: ", total_acc)
    print("Average area under the curve: ", total_auc)
    return total_acc, total_auc
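

# A minimal smoke test for run_nn on synthetic data shaped like the parser
# output. The values below are hypothetical and for illustration only; a real
# run uses parser.parse_data, and this still trains 20 models, so it is slow.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = rng.random((200, 30))                      # 200 samples, 30 points each
    y_demo = rng.integers(0, N_CLASSES, size=(200, 1))  # labels in {0, 1, 2, 3}
    run_nn(X_demo, y_demo, n_experiences=1001, params=(0, 0, 0))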
# plotting helper: confusion matrix
import matplotlib.pyplot as plt
import numpy as np
import itertools
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Given a sklearn confusion matrix (cm), make a nice plot.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
    target_names: given classification classes such as [0, 1, 2],
                  the class names, for example: ['high', 'medium', 'low']
    title:        the text to display at the top of the matrix
    cmap:         the gradient of the values displayed, from matplotlib.pyplot.cm;
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
    normalize:    if False, plot the raw numbers; if True, plot the proportions

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    #if target_names is not None:
    #    tick_marks = np.arange(len(target_names))
    #    plt.xticks(tick_marks, target_names, rotation=90)
    #    plt.yticks(tick_marks, target_names)

    """fig, ax = plt.subplots()
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=target_names, yticklabels=target_names,
           title=title,
           #ylabel='True label',
           #xlabel='Predicted label'
           )
    """

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
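

# A minimal usage sketch with hypothetical labels: build the matrix with
# sklearn.metrics.confusion_matrix, then render it with the helper above.
if __name__ == "__main__":
    from sklearn.metrics import confusion_matrix
    y_true = [0, 1, 2, 3, 0, 1, 2, 3]
    y_hat = [0, 1, 2, 2, 0, 1, 3, 3]
    plot_confusion_matrix(cm=confusion_matrix(y_true, y_hat),
                          target_names=['0', '1', '2', '3'],
                          normalize=False)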
# main script: runs the experiment for every (c, b, e) combination
import cnn
import parser
import itertools

"""
(c,b,e) will be read from the command line or a script.
(c,b,e) corresponds to a combination of the specific hyperparameters used to build the model:
c belongs to {0,1,2,3} and represents the layer architecture
b belongs to {0,1} and represents the batch size
e belongs to {0,1} and represents the number of epochs
"""
# reading arguments from the command line
#c = int(sys.argv[1])
#b = int(sys.argv[2])
#e = int(sys.argv[3])

#c_ = [0,1,2,3]
c_ = [0]   ### to extend once all the architectures are defined
b_ = [0,1]
e_ = [0,1]
n_experiences = 1001
combinations = itertools.product(c_, b_, e_)
# parse the data
input_, output_ = parser.parse_data(n_experiences)
# run every combination of hyperparameters
for params in combinations:
    cnn.run_nn(input_, output_, n_experiences, params)
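
# To run a single combination from the command line instead of the full grid,
# the commented-out sys.argv lines above could be restored, e.g. (a sketch,
# assuming an invocation like `python main.py 0 1 0`):
#
#   import sys
#   c, b, e = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
#   cnn.run_nn(input_, output_, n_experiences, (c, b, e))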
# models.py
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D

num_classes = 4


def model_architecture(c):
    """
    (c,b,e) corresponds to a combination of the specific hyperparameters used to build the model:
    c belongs to {0,1,2,3} and represents the layer architecture
    b belongs to {0,1} and represents the batch size
    e belongs to {0,1} and represents the number of epochs
    """
    # defining the keras model
    model = Sequential()
    if c == 0:
        model.add(Conv1D(64, kernel_size=3, activation='linear', input_shape=(30, 1)))
        model.add(Conv1D(64, 3, activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Flatten())
        model.add(Dense(100, activation='relu'))
        model.add(Dense(num_classes, activation='softmax'))
    if c == 1:
        pass   # not defined yet: only one layer possibility for the moment
    if c == 2:
        pass
    if c == 3:
        pass
    return model
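

# Quick sanity check for the c = 0 architecture (a sketch):
#   model_architecture(0).summary()
# should show output lengths 28 -> 26 after the two Conv1D layers (kernel 3,
# no padding), 13 after MaxPooling1D, 13*64 = 832 after Flatten, then
# Dense(100) and Dense(4).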
def choose_batch_epochs(b, e):
    # b selects the batch size, e selects the number of epochs
    if b == 0 and e == 0:
        return 16, 128
    if b == 0 and e == 1:
        return 16, 512
    if b == 1 and e == 0:
        return 64, 128
    if b == 1 and e == 1:
        return 64, 512
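

# The same mapping written as a lookup table (an equivalent sketch):
#   BATCH_EPOCHS = {(0, 0): (16, 128), (0, 1): (16, 512),
#                   (1, 0): (64, 128), (1, 1): (64, 512)}
#   bs, ep = BATCH_EPOCHS[(b, e)]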
# output_convert.py
def output_convert(N, e, r):
    """
    output meaning:
    0 : not evolvable and not robust
    1 : evolvable and not robust
    2 : not evolvable and robust
    3 : evolvable and robust
    (only the first entry of the evolvability and robustness lists is used)
    """
    output = []
    if e[0]:
        if r[0]:
            output.append(3)
        else:
            output.append(1)
    elif r[0]:
        output.append(2)
    else:
        output.append(0)
    return output
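

# Tiny self-check of the label encoding (note N is unused by the current logic):
if __name__ == "__main__":
    assert output_convert(30, [1], [1]) == [3]   # evolvable and robust
    assert output_convert(30, [1], [0]) == [1]   # evolvable only
    assert output_convert(30, [0], [1]) == [2]   # robust only
    assert output_convert(30, [0], [0]) == [0]   # neither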
# parser.py
import numpy as np
import pandas as pd
import output_convert as oc


def parse_data(n_experiences):
    N_DATA = n_experiences   # from 2 to 1001
    # DATA PARSING
    input_ = []
    output_ = []
    N0 = 15   # number of interpolation points per curve (15 or 20)
    # table of the names of the bionets
    str_ = ["arabidopsis", "cardiac", "cd4", "mammalian", "metabolic", "anemia", "aurka", "b-cell", "body-drosophila", "bt474", "bt474-ErbB", "cycle-cdk", "fgf-drosophila", "gonadal", "hcc1954", "hcc1954-ErbB", "hh-drosophila", "l-arabinose-operon", "leukemia", "neurotransmitter", "oxidative-stress", "skbr-long", "skbr3-short", "spz-drosophila", "t-lgl-survival", "tol", "toll-drosophila", "trichostrongylus", "vegf-drosophila", "wg-drosophila", "yeast-cycle", "aspergillus-fumigatus", "budding-yeast", "gene-cardiac", "t-cell-differentiation", "lac-operon-bistability", "core-cell-cycle", "cortical"]
    for s in str_:
        dataXi_original = pd.read_csv("updated_data/" + s + "/" + s + "_metrics.csv", sep=",", header=0)
        N = int(dataXi_original.loc[1, 'N'])
        antifragility_original = list(np.array(dataXi_original.loc[:, 'Antifragility']).astype(float))
        # N0 points describing the relationship between the original values of
        # antifragility in the network (before perturbations) and X/N
        X_tmp = list(np.arange(1, N+1, 1))
        X_N = [X_tmp[k]/N for k in range(N)]
        original_points = [np.interp(k/N0, X_N, antifragility_original) for k in range(1, N0+1)]
        for i in range(1, N_DATA):
            # read the data for each experience
            n = format(i, '09')
            dataXi_tmp = pd.read_csv("updated_data/" + s + "/" + s + "_" + n + "_metrics.csv", sep=",", header=0)
            antifragility_tmp = list(np.array(dataXi_tmp.loc[:, 'Antifragility']).astype(float))
            input_tmp = original_points + [np.interp(k/N0, X_N, antifragility_tmp) for k in range(1, N0+1)]
            evolvable_tmp = list(np.array(dataXi_tmp.loc[:, 'Evolvability']).astype(int))
            robust_tmp = list(np.array(dataXi_tmp.loc[:, 'Robustness']).astype(int))
            output_tmp = oc.output_convert(N, evolvable_tmp, robust_tmp)
            input_.append(input_tmp)
            output_ += output_tmp
    input_ = np.array(input_)
    output_ = np.array(output_)
    return input_, output_
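

# Usage sketch (expects the updated_data/ directory described above; with
# N0 = 15, every row of input_ has 2 * N0 = 30 values, matching the
# reshape(-1, 30, 1) done in cnn.run_nn):
#   input_, output_ = parse_data(n_experiences=1001)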
# roc_auc.py
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


def roc_auc(n_classes, y_test, y_score):
    # compute the ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # compute the micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # note: only the curve and area for class index 2 are returned
    return fpr[2], tpr[2], roc_auc[2]
def plot_roc_auc(fpr, tpr, roc_auc):
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
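

# Usage sketch with hypothetical one-hot targets and predicted scores:
#   fpr, tpr, area = roc_auc(4, validation_Y_one_hot, y_pred)
#   plot_roc_auc(fpr, tpr, area)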