25
25
# Vocabulary lookup tables: every symbol gets a 1-based integer id, in the
# order it appears in elements_smiles / elements_fasta.
int_smiles = {symbol: index for index, symbol in enumerate(elements_smiles, start=1)}
int_fasta = {symbol: index for index, symbol in enumerate(elements_fasta, start=1)}
27
27
28
def _encode_sequence(sequence, vocabulary, length):
    """Map each symbol of *sequence* to its integer id and fix the row length."""
    # Truncate first so that an over-long sequence can never produce a row
    # wider than the model input; symbols missing from the vocabulary are
    # encoded as 0, the same value used for padding.
    codes = [vocabulary.get(symbol, 0) for symbol in sequence[:length]]
    codes.extend([0] * (length - len(codes)))
    return codes


def convert(file_path=file_path):
    """Encode SMILES and FASTA strings as fixed-length integer sequences.

    Every SMILES string is encoded with ``int_smiles`` and brought to exactly
    ``max_smiles`` entries; every FASTA sequence is encoded with ``int_fasta``
    and brought to exactly ``max_fasta`` entries. Short sequences are
    right-padded with 0, long ones are truncated, and unknown symbols become 0.

    Parameters:
        file_path: Despite the name this is not a path. It is the data table
            itself (the callers in this file pass row slices of it) and must
            expose ``smiles``, ``sequence`` and ``IC50`` columns. The default
            is the module-level ``file_path`` object, bound when this function
            is defined.

    Returns:
        tuple: ``(smiles_w_numbers, fasta_w_numbers, ic50_numeros)`` where the
        first two are lists of equal-length integer lists and the third is the
        ``IC50`` column as a plain list.
    """
    smiles_w_numbers = [
        _encode_sequence(smiles, int_smiles, max_smiles)
        for smiles in file_path.smiles
    ]
    fasta_w_numbers = [
        _encode_sequence(fasta, int_fasta, max_fasta)
        for fasta in file_path.sequence
    ]
    ic50_numeros = list(file_path.IC50)

    return smiles_w_numbers, fasta_w_numbers, ic50_numeros
59
59
60
60
61
# Encode the hold-out rows (row 350000 onward) once, at module load;
# model_cnn passes these to model.fit as its validation data.
X_test_smile, X_test_fasta, T_test_IC50 = convert(file_path[350000:])
63
63
64
def model_cnn(file_path=file_path):
    """Build, train and plot a two-branch 1D-CNN regressor for IC50.

    One convolutional branch reads the integer-encoded SMILES, the other the
    integer-encoded FASTA sequence; their max-pooled features are concatenated
    and passed through dense layers to a single output. The model is trained
    chunk by chunk on the training rows, validated on the module-level
    ``X_test_smile`` / ``X_test_fasta`` / ``T_test_IC50`` arrays, checkpointed
    to ``models/cnn_model.hdf5`` whenever ``val_loss`` improves, and the
    per-epoch loss curves are shown with matplotlib at the end.

    Parameters:
        file_path: Despite the name this is the data table itself, not a path.
            It must support row slicing and expose the ``smiles``,
            ``sequence`` and ``IC50`` columns consumed by ``convert``. The
            default is the module-level ``file_path`` object, bound when this
            function is defined.

    Returns:
        None

    Raises:
        ValueError: If the training slice contains no rows.
    """
    regulator = l2(0.001)

    # SMILES branch
    smiles_input = Input(shape=(max_smiles,), dtype='int32', name='smiles_input')
    embed_smiles = Embedding(input_dim=len(elements_smiles) + 1,
                             input_length=max_smiles,
                             output_dim=128)(smiles_input)
    x = Conv1D(filters=32, kernel_size=3, padding="SAME",
               kernel_regularizer=regulator)(embed_smiles)
    x = PReLU()(x)

    x = Conv1D(filters=64, kernel_size=3, padding="SAME")(x)
    x = BatchNormalization()(x)
    x = PReLU()(x)
    x = Conv1D(filters=128, kernel_size=3, padding="SAME")(x)
    x = BatchNormalization()(x)
    x = PReLU()(x)
    pool_smiles = GlobalMaxPooling1D()(x)

    # FASTA branch
    fasta_input = Input(shape=(max_fasta,), name='fasta_input')
    embed_fasta = Embedding(input_dim=len(elements_fasta) + 1,
                            input_length=max_fasta,
                            output_dim=256)(fasta_input)
    x2 = Conv1D(filters=32, kernel_size=3, padding="SAME")(embed_fasta)
    x2 = PReLU()(x2)

    x2 = Conv1D(filters=64, kernel_size=3, padding="SAME")(x2)
    x2 = BatchNormalization()(x2)
    x2 = PReLU()(x2)
    x2 = Conv1D(filters=128, kernel_size=3, padding="SAME")(x2)
    x2 = BatchNormalization()(x2)
    x2 = PReLU()(x2)
    pool_fasta = GlobalMaxPooling1D()(x2)

    # Merge both branches and regress through the dense head
    combined = concatenate([pool_smiles, pool_fasta])
    dense = Dense(units=1024, activation="relu")(combined)
    dense = Dropout(0.3)(dense)
    dense = Dense(units=1024, activation="relu")(dense)
    dense = Dropout(0.3)(dense)
    dense = Dense(units=512, activation="relu")(dense)

    # The layer name must be exactly "output": the loss, metrics and target
    # dictionaries below are all keyed on it. ReLU keeps predictions
    # non-negative.
    output = Dense(1, activation="relu", name="output")(dense)

    model = tf.keras.models.Model(inputs=[smiles_input, fasta_input],
                                  outputs=[output])

    def r2_score(y_true, y_pred):
        """Coefficient of determination over the batch, used as the metric."""
        SS_res = K.sum(K.square(y_true - y_pred))
        SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
        # epsilon sits in the denominator so constant targets cannot divide by zero
        return 1 - SS_res / (SS_tot + K.epsilon())

    model.compile(optimizer="adam",
                  loss={'output': "mean_squared_logarithmic_error"},
                  metrics={'output': r2_score})

    save_model_path = "models/cnn_model.hdf5"
    checkpoint = ModelCheckpoint(save_model_path, monitor='val_loss',
                                 verbose=1, save_best_only=True)

    size_per_epoch = 50700  # rows converted and fitted per chunk
    # Rows from 350000 onward are the validation set built at module level
    # (convert(file_path[350000:])); training must stop before them, otherwise
    # val_loss and the checkpoint selection are measured on rows already seen.
    train_rows = 350000
    train = file_path[:train_rows]
    if len(train) == 0:
        raise ValueError("model_cnn needs at least one training row")

    # The validation arrays never change, so build them once instead of on
    # every fit call.
    validation_data = ({'smiles_input': np.array(X_test_smile),
                        'fasta_input': np.array(X_test_fasta)},
                       {'output': np.array(T_test_IC50)})

    loss = []
    loss_validades = []
    epochs = 50

    for epoch in range(epochs):
        print(f"Comenzando el epoch {epoch + 1} ")

        chunk_loss = []
        chunk_val_loss = []
        # Walk over the training rows in chunks, including the final partial
        # chunk, so only one chunk is encoded in memory at a time.
        for start in range(0, len(train), size_per_epoch):
            end = start + size_per_epoch
            X_smiles, X_fasta, y_train = convert(train[start:end])

            r = model.fit({'smiles_input': np.array(X_smiles),
                           'fasta_input': np.array(X_fasta)},
                          {'output': np.array(y_train)},
                          validation_data=validation_data,
                          callbacks=[checkpoint], epochs=1, batch_size=64,
                          shuffle=True)

            chunk_loss.extend(r.history["loss"])
            chunk_val_loss.extend(r.history["val_loss"])

        # Exactly one value per epoch, so the curves line up with
        # range(epochs) in the plot below.
        loss.append(np.mean(chunk_loss))
        loss_validades.append(np.mean(chunk_val_loss))

    plt.plot(range(epochs), loss, label="loss")
    plt.plot(range(epochs), loss_validades, label="val_loss")
    plt.legend()
    plt.show()
179
161
180
162
181
# Example usage: build and train the CNN on the module-level data table.
model_cnn(file_path=file_path)