Merge branch 'master' of https://git.c3.unam.mx/pamela.osuna/antifragility

66cff740 · Pamela Osuna · 08490059 · bb0354f0 · 66cff740 · 66cff740
Commit 66cff740 authored Jan 19, 2020 by Pamela Osuna
Expand all Show whitespace changes
Inline Side-by-side

Showing with 169 additions and 68 deletions

cnn.py
+0 -0

confusion_matrix.py
+8 -2

main.py
+30 -30

models.py
+12 -2

output_convert.py
+17 -10

parser.py
+100 -22

prec_recall.py
+2 -2

No files found.
--- a/cnn.py
+++ b/cnn.py
--- a/confusion_matrix.py
+++ b/confusion_matrix.py
@@ -49,7 +49,8 @@ def plot_confusion_matrix(cm,
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

-    plt.figure(figsize=(8, 6))
+    fig = plt.figure(figsize=(8, 6))
+    ax = fig.add_subplot()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
@@ -88,5 +89,10 @@ def plot_confusion_matrix(cm,
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    #plt.show()
-    plt.savefig("confusion_matrix")
+    # Added labels
+    labels = ['']*(2*len(target_names))
+    labels[::2]=target_names
+    ax.set_xticklabels([''] + labels)
+    ax.set_yticklabels([''] + labels)
+    plt.savefig("confusion_matrix.pdf")
    plt.close()
--- a/main.py
+++ b/main.py
-import cnn
+from cnn import CNN_Antifrag
 from parser import parse_data
 import itertools
 from confusion_matrix import plot_confusion_matrix

 """
-(c,b,e) will be read from the command line or a script
-(c,b,e) corresponds to the combinations of the specific hyperparameters to build the model
+(c,b,e,o) will be read from the command line or a script
+(c,b,e,o) corresponds to the combinations of the specific hyperparameters 
+to build the model
 c belongs to {0,1,2,3} and represents the layer architecture
 b belongs to {0,1} and represents the batch size
 e belongs to {0,1} and represents the number of epochs
+o belongs to {0,1,2} and represents the balancing method
 """
-#reading arguments from command line
-
-#c = int(sys.argv[1])
-#b = int(sys.argv[2])
-#e = int(sys.argv[3])

 c_ = [0,1,2,3]
-b_ = [0,1]
-e_ = [0,1]
-n_experiences = 10001
-combinations = itertools.product(c_,b_,e_)
+b_ = [1]
+e_ = [0]
+o_ = [0,1,2]

+n_experiences = 100
+combinations = itertools.product(c_,b_,e_,o_)

-#parse the data
-input_, output_ = parse_data(n_experiences)

-#run an specific combination
-max_params = (0,0,0)
+#parse the data
+input_, output_ = parse_data(n_experiences,kind='linear')
+#%%
 max_avg_auc = 0

 for params in combinations:
-    avg_acc, avg_auc, X_train_kfold, X_test_kfold, y_train_kfold, y_test_kfold = cnn.run_nn(input_, output_, n_experiences, params)
+    cnn = CNN_Antifrag(name="CNN_%d_%d_%d_%d"%params)
+    avg_acc, avg_auc  = cnn.run_nn(input_, output_, params)
    if avg_auc > max_avg_auc:
        max_avg_auc = avg_auc
        max_params = params
-        X_train_kfold_opt = X_train_kfold
-        X_test_kfold_opt = X_test_kfold
-        y_train_kfold_opt = y_train_kfold
-        y_test_kfold_opt = y_test_kfold
        
+#%%
+print("Best params:",max_params)
 # once we have chosen the optimal parameters we can do the normal kfold
-
-#note: the test data remains unbalanced
-acc, auc, cm, pr = cnn.run_kfold(X_train_kfold_opt, X_test_kfold_opt, y_train_kfold_opt, y_test_kfold_opt, max_params)
+cnn = CNN_Antifrag(name="CNN_%d_%d_%d_%d"%max_params)
+acc, auc, cm, pr = cnn.run_kfold(input_, output_, max_params)
 #to add: precision recall curve

-
-labels = ['~robust&~evolvable', 'evolvable&~robust', 'robust&~evolvable', 'robust&evolvable']
-plot_confusion_matrix(cm, labels) #this function saves the matrix image automatically
-
-f = open("acc_auc.txt", 'w+')
+#%%
+labels = [
+    '[~R & ~E]', 
+    '[~R &  E]', 
+    '[ R & ~E]', 
+    '[ R &  E]'
+    ]
+#this function saves the matrix image automatically
+plot_confusion_matrix(cm, labels) 
+
+f = open("out/acc_auc.txt", 'w+')
 f.write("Average accuracy: " + str(acc)+"\n")
 f.write("Average area under the curve: " + str(auc))
 f.close()

-## TO DO: code that allows to execute in parallel, make sure it's the same random shuffle ...
--- a/models.py
+++ b/models.py
@@ -61,14 +61,23 @@ def model_architecture(c):
        model.add(Dropout(0.5))
        model.add(Dense(num_classes, activation='softmax'))

+
    return model

 def choose_batch_epochs(b,e):
    if b == 0 and e == 0:
-        return 16, 128
+        return 16, 12
    if b == 0 and e == 1:
        return 16, 512
    if b == 1 and e == 0:
-        return 64, 128
+        return 64, 12
    if b == 1 and e == 1:
        return 64, 512
+
+def choose_balancing_method(o):
+    if o == 0:
+        return 'smote'
+    elif o == 1:
+        return 'adasyn'
+    elif o == 2:
+        return 'class_weight'
\ No newline at end of file
--- a/output_convert.py
+++ b/output_convert.py
-def output_convert(N, e, r):
+from itertools import product
+
+B = [0,1]
+converter = {(e,r):[i] for (e,r),i in zip(product(B,B),range(4))}
+
+def output_convert(e, r):
    """
+    Encodes outputs as integers
+    
+    Parameters
+    -----------
+    e : 1 evolvable, 0 not evolvable
+    r : 1 robust, 0 not robust
+    
+    Returns 
+    -----------
+    the encoded output
    output meaning:
    0 : not evolv. and not rob.
    1 : evol. and not rob.
    2 : not evol. and rob.
    3 : evol. and rob.
    """
-	output = []
-	if (e[0]):
-		if (r[0]):
-			output.append(3)
-		else: output.append(1)
-	elif (r[0]):
-		output.append(2)
-	else: output.append(0)
-	return output
+    return converter[r,e]
--- a/parser.py
+++ b/parser.py
 import numpy as np
 import pandas as pd
 import output_convert as oc
+import os
+from scipy.interpolate import interp1d as interp

-def parse_data(n_experiences):
-    N_DATA = n_experiences #from 2 to 1001
-    #DATA PARSING
-    X = []
+def interpolate(y,K,kind='linear'):
+    """
+    Interpolates vector y
+
+    Parameters
+    ----------
+    y : list of real numbers
+        dependent variable values.
+    K : integer
+        number of samples to interpolate
+        
+
+    Returns
+    -------
+    the interpolation values
+
+    """
+        
+    N = len(y)
+    x = np.arange(1/N,1+1/N,1/N)
+    f = interp(x,y,kind=kind,fill_value="extrapolate")
+    xintp = np.arange(1/K,1+1/K,1/K)
+    return f(xintp)
+    
+
+def parse_data(n_experiences,folder="data",samples=30,kind='linear'):
+    N_DATA = n_experiences #from 2 to 10001
    input_ = []
    output_ = []
-    N0 = 15 #15 or 20

    #table of the name of the bionets
-    str_ = ["arabidopsis", "cardiac", "cd4", "mammalian", "metabolic", "anemia", "aurka", "b-cell", "body-drosophila", "bt474", "bt474-ErbB", "cycle-cdk", "fgf-drosophila", "gonadal", "hcc1954", "hcc1954-ErbB", "hh-drosophila", "l-arabinose-operon", "leukemia", "neurotransmitter", "oxidative-stress", "skbr-long", "skbr3-short", "spz-drosophila", "t-lgl-survival", "tol", "toll-drosophila", "trichostrongylus", "vegf-drosophila", "wg-drosophila", "yeast-cycle", "aspergillus-fumigatus", "budding-yeast", "gene-cardiac", "t-cell-differentiation", "lac-operon-bistability", "core-cell-cycle", "cortical"]
+    str_ = [
+        'anemia',
+        'cd4',
+        'lac-operon',
+        'spz-drosophila',
+        'arabidopsis',
+        'core-cell-cycle',
+        'lac-operon-bistability',
+        't-cell-differentiation',
+        'aspergillus-fumigatus',
+        'cortical',
+        'l-arabinose-operon',
+        't-lgl-survival',
+        'aurka',
+        'cycle-cdk',
+        'leukemia',
+        'tol',
+        'b-cell',
+        'fgf-drosophila',
+        'mammalian',
+        'trichostrongylus',
+        'body-drosophila',
+        'gene-cardiac',
+        'metabolic',
+        'vegf-drosophila',
+        'bt474',
+        'gonadal',
+        'neurotransmitter',
+        'wg-drosophila',
+        'bt474-ErbB',
+        'hcc1954',
+        'oxidative-stress',
+        'yeast-cycle',
+        'budding-yeast',
+        'hcc1954-ErbB',
+        'skbr3-long',
+        'cardiac',
+        'hh-drosophila',
+        'skbr3-short'
+        ]
    
    for s in str_:
-    	dataXi_original = pd.read_csv("updated_data/"+ s + "/" + s + "_metrics.csv", sep=",", header=0)
-    	N = int(dataXi_original.loc[1, 'N'])
-    	antifragility_original = list(np.array(dataXi_original.loc[:, 'Antifragility']).astype(float))
-    	# 15 or 20 points describing the relationship between the original values of antifragility in the network before perturbations and X/N
-    	X_tmp = list(np.arange(1,N+1, 1))
-    	X_N = [X_tmp[i]/N for i in range(N)]
-    	original_points = [np.interp(i/N0, X_N, antifragility_original) for i in range(1, N0+1)]
-
-    	for i in range(1,N_DATA):
+        data = pd.read_csv(
+            os.path.join(folder,s,s + "_metrics.csv"), sep=",", header=0)
+
+        # `samples` points (30 by default) describing the relationship between
+        # the original antifragility values of the network before perturbations and X/N
+        before =interpolate(
+            np.array(data.loc[:, 'Antifragility']).astype(float),
+            samples,
+            kind=kind)
+        
+        for i in range(N_DATA):
            #read the data for each experience
-    		n = format(i, '09')
-    		dataXi_tmp = pd.read_csv("updated_data/"+ s + "/" + s + "_" + n + "_metrics.csv", sep=",", header=0)
+            n = format(i+1, '09')
+            data = pd.read_csv(
+                os.path.join(folder,s,s + "_" + n + "_metrics.csv"), 
+                sep=",", header=0)
+
+            #antifragility of the mutant
+            after = interpolate(
+                np.array(data.loc[:, 'Antifragility']).astype(float),
+                samples,
+                kind=kind)
+            
+            # computes the difference of the antifragility curves
+            input_tmp = after-before

-    		antifragility_tmp = list(np.array(dataXi_tmp.loc[:, 'Antifragility']).astype(float))
+            evolvable_tmp = list(
+                np.array(data.loc[:, 'Evolvability']).astype(int))
            
-    		input_tmp = original_points + [np.interp(i/N0, X_N, antifragility_tmp) for i in range(1, N0+1)]
+            robust_tmp = list(
+                np.array(data.loc[:, 'Robustness']).astype(int))

-    		evolvable_tmp = list(np.array(dataXi_tmp.loc[:, 'Evolvability']).astype(int))
-    		robust_tmp = list(np.array(dataXi_tmp.loc[:, 'Robustness']).astype(int))
-    		output_tmp = oc.output_convert(N, evolvable_tmp, robust_tmp)
+            # evolvable_tmp and robust_tmp are vectors, but all of their
+            # entries are the same, so we only take the first one
+            output_tmp = oc.output_convert(evolvable_tmp[0], robust_tmp[0])

            input_.append(input_tmp)
            output_+= output_tmp

--- a/prec_recall.py
+++ b/prec_recall.py
@@ -31,7 +31,7 @@ def plot_pr(recall, precision, average_precision):
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Average precision score, micro-averaged over all classes: AP={0:0.2f}'.format(average_precision["micro"]))
-    plt.savefig("precision_recall_curve")
+    plt.savefig("out/precision_recall_curve.pdf")
    #plt.show()
    plt.close()

@@ -67,4 +67,4 @@ def avg_pr(n_splits, num_classes, recs_k, precs_k, avgs_k):
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('Average precision score, over class {0}: AP={1:0.2f}'.format(i, avg_prec[i]))
-        plt.savefig("pr_curve_class_" +str(i))
+        plt.savefig("out/pr_curve_class_" +str(i)+".pdf")