Commit b472c653 by Pamela Osuna

only one layer possibility for the moment

# cnn.py
import models as m
import roc_auc as ra
import numpy as np
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from tensorflow.keras.utils import to_categorical

## global variables
N_SPLITS = 5   # number of folds for the k-fold cross-validation
N_CLASSES = 4
def run_nn(input_, output_, n_experiences, params):
    """
    k-fold cross-validation: X stands for the input and y for the output.
    """
    c, b, e = params
    # define the splitting method: n_splits, shuffle on/off, random_state
    kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=1)
    X_train_kfold = []
    X_test_kfold = []
    y_train_kfold = []
    y_test_kfold = []
    # split the input data into k sets
    for train_index, test_index in kfold.split(input_):
        X_train_kfold.append(input_[train_index])
        X_test_kfold.append(input_[test_index])
        y_train_kfold.append(output_[train_index])
        y_test_kfold.append(output_[test_index])
    # balance the classes in each training fold with SMOTE oversampling
    sm = SMOTE(random_state=2)
    for i in range(len(X_train_kfold)):
        # fit_resample replaces the deprecated fit_sample
        X_train_kfold[i], y_train_kfold[i] = sm.fit_resample(X_train_kfold[i], y_train_kfold[i].ravel())
    # print(len(X_train_kfold[0])/(len(X_train_kfold[0])+len(X_test_kfold[0])))  # gives 0.8, OK
    # build N_SPLITS-1 sub-sub-sets out of each of the k subsets
    # (the validation slice rotates through the training fold)
    X_validation = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    X_train = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    y_validation = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    y_train = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    for i in range(N_SPLITS):
        # each fold may have a different size after SMOTE, so compute the slice length per fold
        len_validation = len(X_train_kfold[i]) // (N_SPLITS - 1)
        idx = 0
        for j in range(N_SPLITS-1):
            X_validation[i][j] = X_train_kfold[i][idx:idx+len_validation]
            X_train[i][j] = list(X_train_kfold[i][0:idx]) + list(X_train_kfold[i][idx+len_validation:])
            y_validation[i][j] = y_train_kfold[i][idx:idx+len_validation]
            y_train[i][j] = list(y_train_kfold[i][0:idx]) + list(y_train_kfold[i][idx+len_validation:])
            idx += len_validation
    # print(len(X_validation[0][0]), len(X_train[0][0]))  # X_validation[0][0] should be 1/3 of X_train[0][0]'s length
    validation_Y_one_hot = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    train_Y_one_hot = [[0]*(N_SPLITS-1) for i in range(N_SPLITS)]
    for i in range(N_SPLITS):
        for j in range(N_SPLITS-1):
            # change the labels from categorical to one-hot encoding
            train_Y_one_hot[i][j] = to_categorical(y_train[i][j], num_classes=N_CLASSES)
            validation_Y_one_hot[i][j] = to_categorical(y_validation[i][j], num_classes=N_CLASSES)
            # convert the input to np.array
            X_train[i][j] = np.array(X_train[i][j])
            X_validation[i][j] = np.array(X_validation[i][j])
            # reshape each sample of the train and validation sets into a 30x1 matrix
            X_train[i][j] = X_train[i][j].reshape(-1, 30, 1)
            X_validation[i][j] = X_validation[i][j].reshape(-1, 30, 1)
            # convert the data from an int format to float32
            X_train[i][j] = X_train[i][j].astype('float32')
            X_validation[i][j] = X_validation[i][j].astype('float32')
    # i: index of the test set (i belongs to {0, ..., k-1})
    # j: index of the validation set (j belongs to {0, ..., k-2})
    total_acc = 0
    total_auc = 0
    bs, ep = m.choose_batch_epochs(b, e)
    for i in range(N_SPLITS):
        for j in range(N_SPLITS-1):
            # define and compile a fresh keras model for every split, so that
            # weights trained on one split do not leak into the next one
            model = m.model_architecture(c)
            model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
            # train the model
            model.fit(X_train[i][j], train_Y_one_hot[i][j], batch_size=bs, epochs=ep, verbose=1,
                      validation_data=(X_validation[i][j], validation_Y_one_hot[i][j]))
            # calculate accuracy on the validation set
            _, accuracy = model.evaluate(X_validation[i][j], validation_Y_one_hot[i][j], verbose=0)
            total_acc += accuracy
            print("t_set = " + str(i) + " v_set = " + str(j))
            print('Validation accuracy:', accuracy)
            # calculate the area under the ROC curve
            y_pred = model.predict(X_validation[i][j], batch_size=bs)
            fpr, tpr, auc = ra.roc_auc(N_CLASSES, validation_Y_one_hot[i][j], y_pred)
            total_auc += auc
            print("Area under the curve:", auc)
    total_acc = total_acc / (N_SPLITS * (N_SPLITS - 1))
    total_auc = total_auc / (N_SPLITS * (N_SPLITS - 1))
    print("Average accuracy: ", total_acc)
    print("Average area under the curve: ", total_auc)
    return total_acc, total_auc
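

# A minimal smoke test for run_nn on synthetic data shaped like the parser
# output. The values below are hypothetical and for illustration only; a real
# run uses parser.parse_data, and this still trains 20 models, so it is slow.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = rng.random((200, 30))                      # 200 samples, 30 points each
    y_demo = rng.integers(0, N_CLASSES, size=(200, 1))  # labels in {0, 1, 2, 3}
    run_nn(X_demo, y_demo, n_experiences=1001, params=(0, 0, 0))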
# plotting helper: confusion matrix
import matplotlib.pyplot as plt
import numpy as np
import itertools
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Given a sklearn confusion matrix (cm), make a nice plot.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
    target_names: given classification classes such as [0, 1, 2],
                  the class names, for example: ['high', 'medium', 'low']
    title:        the text to display at the top of the matrix
    cmap:         the gradient of the values displayed, from matplotlib.pyplot.cm;
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
    normalize:    if False, plot the raw numbers; if True, plot the proportions

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    #if target_names is not None:
    #    tick_marks = np.arange(len(target_names))
    #    plt.xticks(tick_marks, target_names, rotation=90)
    #    plt.yticks(tick_marks, target_names)

    """fig, ax = plt.subplots()
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=target_names, yticklabels=target_names,
           title=title,
           #ylabel='True label',
           #xlabel='Predicted label'
           )
    """

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
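

# A minimal usage sketch with hypothetical labels: build the matrix with
# sklearn.metrics.confusion_matrix, then render it with the helper above.
if __name__ == "__main__":
    from sklearn.metrics import confusion_matrix
    y_true = [0, 1, 2, 3, 0, 1, 2, 3]
    y_hat = [0, 1, 2, 2, 0, 1, 3, 3]
    plot_confusion_matrix(cm=confusion_matrix(y_true, y_hat),
                          target_names=['0', '1', '2', '3'],
                          normalize=False)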
# main script: runs the experiment for every (c, b, e) combination
import cnn
import parser
import itertools

"""
(c,b,e) will be read from the command line or a script.
(c,b,e) corresponds to a combination of the specific hyperparameters used to build the model:
c belongs to {0,1,2,3} and represents the layer architecture
b belongs to {0,1} and represents the batch size
e belongs to {0,1} and represents the number of epochs
"""
# reading arguments from the command line
#c = int(sys.argv[1])
#b = int(sys.argv[2])
#e = int(sys.argv[3])

#c_ = [0,1,2,3]
c_ = [0]   ### to extend once all the architectures are defined
b_ = [0,1]
e_ = [0,1]
n_experiences = 1001
combinations = itertools.product(c_, b_, e_)
# parse the data
input_, output_ = parser.parse_data(n_experiences)
# run every combination of hyperparameters
for params in combinations:
    cnn.run_nn(input_, output_, n_experiences, params)
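
# To run a single combination from the command line instead of the full grid,
# the commented-out sys.argv lines above could be restored, e.g. (a sketch,
# assuming an invocation like `python main.py 0 1 0`):
#
#   import sys
#   c, b, e = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
#   cnn.run_nn(input_, output_, n_experiences, (c, b, e))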
# models.py
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D

num_classes = 4


def model_architecture(c):
    """
    (c,b,e) corresponds to a combination of the specific hyperparameters used to build the model:
    c belongs to {0,1,2,3} and represents the layer architecture
    b belongs to {0,1} and represents the batch size
    e belongs to {0,1} and represents the number of epochs
    """
    # defining the keras model
    model = Sequential()
    if c == 0:
        model.add(Conv1D(64, kernel_size=3, activation='linear', input_shape=(30, 1)))
        model.add(Conv1D(64, 3, activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Flatten())
        model.add(Dense(100, activation='relu'))
        model.add(Dense(num_classes, activation='softmax'))
    if c == 1:
        pass   # not defined yet: only one layer possibility for the moment
    if c == 2:
        pass
    if c == 3:
        pass
    return model
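

# Quick sanity check for the c = 0 architecture (a sketch):
#   model_architecture(0).summary()
# should show output lengths 28 -> 26 after the two Conv1D layers (kernel 3,
# no padding), 13 after MaxPooling1D, 13*64 = 832 after Flatten, then
# Dense(100) and Dense(4).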
def choose_batch_epochs(b, e):
    # b selects the batch size, e selects the number of epochs
    if b == 0 and e == 0:
        return 16, 128
    if b == 0 and e == 1:
        return 16, 512
    if b == 1 and e == 0:
        return 64, 128
    if b == 1 and e == 1:
        return 64, 512
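

# The same mapping written as a lookup table (an equivalent sketch):
#   BATCH_EPOCHS = {(0, 0): (16, 128), (0, 1): (16, 512),
#                   (1, 0): (64, 128), (1, 1): (64, 512)}
#   bs, ep = BATCH_EPOCHS[(b, e)]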
# output_convert.py
def output_convert(N, e, r):
    """
    output meaning:
    0 : not evolvable and not robust
    1 : evolvable and not robust
    2 : not evolvable and robust
    3 : evolvable and robust
    (only the first entry of the evolvability and robustness lists is used)
    """
    output = []
    if e[0]:
        if r[0]:
            output.append(3)
        else:
            output.append(1)
    elif r[0]:
        output.append(2)
    else:
        output.append(0)
    return output
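

# Tiny self-check of the label encoding (note N is unused by the current logic):
if __name__ == "__main__":
    assert output_convert(30, [1], [1]) == [3]   # evolvable and robust
    assert output_convert(30, [1], [0]) == [1]   # evolvable only
    assert output_convert(30, [0], [1]) == [2]   # robust only
    assert output_convert(30, [0], [0]) == [0]   # neither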
# parser.py
import numpy as np
import pandas as pd
import output_convert as oc


def parse_data(n_experiences):
    N_DATA = n_experiences   # from 2 to 1001
    # DATA PARSING
    input_ = []
    output_ = []
    N0 = 15   # number of interpolation points per curve (15 or 20)
    # table of the names of the bionets
    str_ = ["arabidopsis", "cardiac", "cd4", "mammalian", "metabolic", "anemia", "aurka", "b-cell", "body-drosophila", "bt474", "bt474-ErbB", "cycle-cdk", "fgf-drosophila", "gonadal", "hcc1954", "hcc1954-ErbB", "hh-drosophila", "l-arabinose-operon", "leukemia", "neurotransmitter", "oxidative-stress", "skbr-long", "skbr3-short", "spz-drosophila", "t-lgl-survival", "tol", "toll-drosophila", "trichostrongylus", "vegf-drosophila", "wg-drosophila", "yeast-cycle", "aspergillus-fumigatus", "budding-yeast", "gene-cardiac", "t-cell-differentiation", "lac-operon-bistability", "core-cell-cycle", "cortical"]
    for s in str_:
        dataXi_original = pd.read_csv("updated_data/" + s + "/" + s + "_metrics.csv", sep=",", header=0)
        N = int(dataXi_original.loc[1, 'N'])
        antifragility_original = list(np.array(dataXi_original.loc[:, 'Antifragility']).astype(float))
        # N0 points describing the relationship between the original values of
        # antifragility in the network (before perturbations) and X/N
        X_tmp = list(np.arange(1, N+1, 1))
        X_N = [X_tmp[k]/N for k in range(N)]
        original_points = [np.interp(k/N0, X_N, antifragility_original) for k in range(1, N0+1)]
        for i in range(1, N_DATA):
            # read the data for each experience
            n = format(i, '09')
            dataXi_tmp = pd.read_csv("updated_data/" + s + "/" + s + "_" + n + "_metrics.csv", sep=",", header=0)
            antifragility_tmp = list(np.array(dataXi_tmp.loc[:, 'Antifragility']).astype(float))
            input_tmp = original_points + [np.interp(k/N0, X_N, antifragility_tmp) for k in range(1, N0+1)]
            evolvable_tmp = list(np.array(dataXi_tmp.loc[:, 'Evolvability']).astype(int))
            robust_tmp = list(np.array(dataXi_tmp.loc[:, 'Robustness']).astype(int))
            output_tmp = oc.output_convert(N, evolvable_tmp, robust_tmp)
            input_.append(input_tmp)
            output_ += output_tmp
    input_ = np.array(input_)
    output_ = np.array(output_)
    return input_, output_
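

# Usage sketch (expects the updated_data/ directory described above; with
# N0 = 15, every row of input_ has 2 * N0 = 30 values, matching the
# reshape(-1, 30, 1) done in cnn.run_nn):
#   input_, output_ = parse_data(n_experiences=1001)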
# roc_auc.py
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc


def roc_auc(n_classes, y_test, y_score):
    # compute the ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # compute the micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # note: only the curve and area for class index 2 are returned
    return fpr[2], tpr[2], roc_auc[2]
def plot_roc_auc(fpr, tpr, roc_auc):
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
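

# Usage sketch with hypothetical one-hot targets and predicted scores:
#   fpr, tpr, area = roc_auc(4, validation_Y_one_hot, y_pred)
#   plot_roc_auc(fpr, tpr, area)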