GEDLIB  1.0
ml_based_method.ipp
/***************************************************************************
 *                                                                         *
 *   Copyright (C) 2018 by David B. Blumenthal                             *
 *                                                                         *
 *   This file is part of GEDLIB.                                          *
 *                                                                         *
 *   GEDLIB is free software: you can redistribute it and/or modify it    *
 *   under the terms of the GNU Lesser General Public License as published *
 *   by the Free Software Foundation, either version 3 of the License, or  *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   GEDLIB is distributed in the hope that it will be useful,             *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
 *   GNU Lesser General Public License for more details.                   *
 *                                                                         *
 *   You should have received a copy of the GNU Lesser General Public     *
 *   License along with GEDLIB. If not, see <http://www.gnu.org/licenses/>.*
 *                                                                         *
 ***************************************************************************/

/*!
 * @file  ml_based_method.ipp
 * @brief ged::MLBasedMethod class definition.
 */

#ifndef SRC_METHODS_ML_BASED_METHOD_IPP_
#define SRC_METHODS_ML_BASED_METHOD_IPP_
namespace ged {

// === Definitions of destructor and constructor. ===
template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
~MLBasedMethod() {
    delete ground_truth_method_;
}

template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
MLBasedMethod(const GEDData<UserNodeLabel, UserEdgeLabel> & ged_data) :
LSAPEBasedMethod<UserNodeLabel, UserEdgeLabel>(ged_data),
num_features_{undefined()},
prediction_initialized_(GEDGraph::dummy_node(), GEDGraph::dummy_node()),
assignments_(),
ml_method_{DNN},
ground_truth_method_{new IPFP<UserNodeLabel, UserEdgeLabel>(this->ged_data_)},
ground_truth_options_(""),
dnn_params_(),
dnn_(),
dnn_training_data_(),
dnn_feature_vectors_(),
dnn_types_(),
svm_params_(),
svm_(),
svm_training_data_(),
svm_feature_vectors_(),
svm_types_(),
one_class_svm_use_likelihood_{true},
one_class_svm_(),
infile_(""),
outfile_(""),
logfile_(""),
training_infile_(""),
training_outfile_(""),
ground_truth_infile_(""),
ground_truth_outfile_("") {
    this->compute_lower_bound_ = false;
}
template<class UserNodeLabel, class UserEdgeLabel>
double
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
predict(const GEDGraph & g, const GEDGraph & h, const NodeMap::Assignment & assignment) {

    if ((not this->initialized_) and (not initialized_for_prediction_(g, h))) {
        ml_init_graph_(g);
        ml_init_graph_(h);
    }
    if (not initialized_for_prediction_(g, h)) {
        ml_init_feature_variables_(g, h, this->num_threads_);
        prediction_initialized_.first = g.id();
        prediction_initialized_.second = h.id();
    }

    GEDGraph::NodeID i{assignment.first};
    GEDGraph::NodeID k{assignment.second};
    std::vector<double> feature_vector;
    if ((i != GEDGraph::dummy_node()) and (k != GEDGraph::dummy_node())) {
        ml_populate_substitution_feature_vector_(g, h, i, k, feature_vector);
    }
    else if (i != GEDGraph::dummy_node()) {
        ml_populate_deletion_feature_vector_(g, i, feature_vector);
    }
    else if (k != GEDGraph::dummy_node()) {
        ml_populate_insertion_feature_vector_(h, k, feature_vector);
    }
    else {
        return 0.0;
    }
    Assignment_ assignment_(0, 0, false, feature_vector);
    return decision_value_(assignment_);
}
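
// The following helper is an illustrative sketch, not part of GEDLIB: it shows
// how a client of a trained MLBasedMethod might use predict() to classify a
// single node assignment. Only MLBasedMethod, GEDGraph, NodeMap::Assignment,
// and predict() are taken from the code above; the helper itself is hypothetical.
template<class UserNodeLabel, class UserEdgeLabel>
bool
example_is_predicted_good(MLBasedMethod<UserNodeLabel, UserEdgeLabel> & method,
        const GEDGraph & g, const GEDGraph & h, GEDGraph::NodeID i, GEDGraph::NodeID k) {
    NodeMap::Assignment assignment(i, k);
    // predict() returns a value in [0, 1]; small values indicate that the trained
    // model considers the assignment likely to be contained in an optimal node map.
    return method.predict(g, h, assignment) < 0.5;
}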

// === Definitions of member functions inherited from LSAPEBasedMethod. ===

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
lsape_init_() {

    ml_init_();

    // Initialize the ground truth method.
    if (compute_or_load_ground_truth_()) {
        if (ground_truth_infile_ == "") {
            ground_truth_method_->init();
            if (ground_truth_outfile_ != "") {
                std::ofstream ofs(ground_truth_outfile_);
                ofs.close();
            }
        }
    }

    // Return if the method is initialized from a configuration file.
    if (load_config_file_()) {
        return;
    }

    // Ask the derived class about the size of the feature vectors.
    num_features_ = ml_get_num_features_();

    // Initialize the training data.
    load_or_generate_training_data_();
    if (training_outfile_ != "") {
        save_training_data_();
    }

    // Run the machine learning method.
    train_();
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
lsape_pre_graph_init_(bool called_at_runtime) {
    if (load_config_file_()) {
        if (ml_method_ == DNN) {
            num_features_ = dnn_.load(infile_);
        }
        else if (ml_method_ == SVM) {
            num_features_ = svm_.load(infile_);
        }
        else {
            num_features_ = one_class_svm_.load(infile_, one_class_svm_use_likelihood_);
        }
        ml_init_for_num_features_();
    }
    else if (called_at_runtime) {
        throw Error("You are trying to run a machine learning based method without training it. Call init_method() before calling run_method() or provide an initialization file via the option \"--load <filename>\".");
    }
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
lsape_populate_instance_(const GEDGraph & g, const GEDGraph & h, DMatrix & master_problem) {
    assignments_.clear();
    generate_assignments_(g, h);
    double correctly_predicted{0.0};
    double correctly_predicted_as_bad{0.0};
    double correctly_predicted_as_good{0.0};
    double ground_truth_bad{0.0};
    double ground_truth_good{0.0};
    IMatrix assignment_matrix(master_problem.num_rows(), master_problem.num_cols());
#ifdef _OPENMP
    omp_set_num_threads(this->num_threads_ - 1);
#pragma omp parallel for if(this->num_threads_ > 1)
#endif
    for (std::size_t pos = 0; pos < assignments_.size(); pos++) {
        Assignment_ & assignment = assignments_.at(pos);
        master_problem(assignment.row_in_master(), assignment.col_in_master()) = decision_value_(assignment);
        if (std::isnan(master_problem(assignment.row_in_master(), assignment.col_in_master()))) {
            std::string error_msg("master_problem(" + std::to_string(assignment.row_in_master()) + "," + std::to_string(assignment.col_in_master()) + ")=NaN\n");
            error_msg += "assignment=" + assignment.to_string() + "\n";
            throw Error(error_msg);
        }
        if (log_prediction_ratios_()) {
#ifdef _OPENMP
#pragma omp critical
#endif
            {
                bool predicted_as_bad{master_problem(assignment.row_in_master(), assignment.col_in_master()) > 0.5};
                if (assignment.is_good_assignment()) {
                    assignment_matrix(assignment.row_in_master(), assignment.col_in_master()) = 1;
                    ground_truth_good += 1.0;
                    if (not predicted_as_bad) {
                        correctly_predicted += 1.0;
                        correctly_predicted_as_good += 1.0;
                    }
                }
                else {
                    assignment_matrix(assignment.row_in_master(), assignment.col_in_master()) = 0;
                    ground_truth_bad += 1.0;
                    if (predicted_as_bad) {
                        correctly_predicted += 1.0;
                        correctly_predicted_as_bad += 1.0;
                    }
                }
            }
        }
    }
    if (log_prediction_ratios_()) {
        double num_assignments{static_cast<double>(assignments_.size())};
        std::ofstream logfile;
        logfile.open(logfile_, std::ios::app);
        logfile << "=====\ng_id=" << g.id() << "\nh_id=" << h.id() << "\nassignment_matrix=\n";
        logfile << assignment_matrix << "\nmaster_problem=\n";
        logfile << master_problem << "\nbaseline_accuracy=";
        logfile << ((ground_truth_bad > ground_truth_good) ? (ground_truth_bad / num_assignments) : (ground_truth_good / num_assignments)) << "\naccuracy=";
        logfile << (correctly_predicted / num_assignments) << "\nprecision=";
        logfile << (correctly_predicted_as_bad / ground_truth_bad) << "\nrecall=";
        logfile << (correctly_predicted_as_good / ground_truth_good) << "\n";
        logfile.close();
    }
}
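
// Note on the statistics logged above: accuracy is the fraction of assignments
// whose predicted type matches the ground truth, baseline_accuracy is the
// accuracy obtained by always predicting the majority class, and the values
// logged as precision and recall are the per-class hit rates for bad and good
// assignments, respectively, i.e., the fraction of correctly predicted bad
// assignments among all bad ones and likewise for the good ones.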

template<class UserNodeLabel, class UserEdgeLabel>
std::string
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
lsape_valid_options_string_() const {
    std::string general_options("[--ml-method <arg>] [--ground-truth-method <arg>] [--ground-truth-options <arg>] [--load <arg>] [--save <arg>] [--load-train <arg>] [--save-train <arg>] [--load-ground-truth <arg>] [--save-ground-truth <arg>] [--log <arg>]");
    std::string dnn_options("[--dnn-activation <arg>] [--dnn-hidden-layers-range <arg>] [--dnn-neurons-per-layer-range <arg>]");
    std::string svm_options("[--svm-gamma-exp-range <arg>] [--svm-c-exp-range <arg>] [--one-class-svm-likelihood <arg>]");
    if (ml_valid_options_string_() == "") {
        return (general_options + " " + dnn_options + " " + svm_options);
    }
    return (ml_valid_options_string_() + " " + general_options + " " + dnn_options + " " + svm_options);
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
lsape_set_default_options_() {
    prediction_initialized_.first = GEDGraph::dummy_node();
    prediction_initialized_.second = GEDGraph::dummy_node();
    ml_method_ = DNN;
    delete ground_truth_method_;
    ground_truth_options_ = "";
    ground_truth_method_ = new IPFP<UserNodeLabel, UserEdgeLabel>(this->ged_data_);
    ground_truth_method_->set_options(std::string("--initial-solutions 80 --ratio-runs-from-initial-solutions 0.5 --lower-bound-method BRANCH_TIGHT --threads ") + std::to_string(this->num_threads_));
    dnn_params_.activation_candidates = {FANN::activation_function_enum::RELU, FANN::activation_function_enum::SIGMOID};
    dnn_params_.min_num_hidden_layers = 1;
    dnn_params_.max_num_hidden_layers = 10;
    dnn_params_.min_num_neurons_per_layer = 1;
    dnn_params_.max_num_neurons_per_layer = 20;
    svm_params_.min_gamma_exp = -3;
    svm_params_.max_gamma_exp = 4;
    svm_params_.min_c_exp = -3;
    svm_params_.max_c_exp = 4;
    one_class_svm_use_likelihood_ = true;
    infile_ = std::string("");
    outfile_ = std::string("");
    logfile_ = std::string("");
    training_infile_ = std::string("");
    training_outfile_ = std::string("");
    ground_truth_infile_ = std::string("");
    ground_truth_outfile_ = std::string("");
    ml_set_default_options_();
}

template<class UserNodeLabel, class UserEdgeLabel>
bool
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
lsape_parse_option_(const std::string & option, const std::string & arg) {
    bool is_valid_option{false};
    if (option == "ml-method") {
        if (arg == "SVM") {
            ml_method_ = SVM;
        }
        else if (arg == "ONE_CLASS_SVM") {
            ml_method_ = ONE_CLASS_SVM;
        }
        else if (arg != "DNN") {
            throw ged::Error(std::string("Invalid argument ") + arg + " for option ml-method. Usage: options = \"[--ml-method DNN|SVM|ONE_CLASS_SVM] [...]\"");
        }
        is_valid_option = true;
    }
    else if (option == "ground-truth-method") {
        if (arg == "ANCHOR_AWARE_GED") {
            delete ground_truth_method_;
            ground_truth_method_ = new AnchorAwareGED<UserNodeLabel, UserEdgeLabel>(this->ged_data_);
        }
#ifdef GUROBI
        else if (arg == "F1") {
            delete ground_truth_method_;
            ground_truth_method_ = new F1<UserNodeLabel, UserEdgeLabel>(this->ged_data_);
        }
        else if (arg == "F2") {
            delete ground_truth_method_;
            ground_truth_method_ = new F2<UserNodeLabel, UserEdgeLabel>(this->ged_data_);
        }
        else if (arg == "COMPACT_MIP") {
            delete ground_truth_method_;
            ground_truth_method_ = new CompactMIP<UserNodeLabel, UserEdgeLabel>(this->ged_data_);
        }
        else if (arg != "IPFP") {
            throw ged::Error(std::string("Invalid argument ") + arg + " for option ground-truth-method. Usage: options = \"[--ground-truth-method ANCHOR_AWARE_GED|F1|F2|COMPACT_MIP|IPFP] [...]\"");
        }
#else
        else if (arg != "IPFP") {
            throw ged::Error(std::string("Invalid argument ") + arg + " for option ground-truth-method. Usage: options = \"[--ground-truth-method ANCHOR_AWARE_GED|IPFP] [...]\"");
        }
#endif
        is_valid_option = true;
    }
    else if (option == "ground-truth-options") {
        ground_truth_options_ = arg;
        std::size_t bad_option_start{ground_truth_options_.find("--threads")};
        std::size_t next_option_start;
        if (bad_option_start != std::string::npos) {
            next_option_start = ground_truth_options_.find("--", bad_option_start + 1);
            if (next_option_start != std::string::npos) {
                ground_truth_options_ = ground_truth_options_.substr(0, bad_option_start) + ground_truth_options_.substr(next_option_start);
            }
            else {
                ground_truth_options_ = ground_truth_options_.substr(0, bad_option_start);
            }
        }
        is_valid_option = true;
    }
    else if (option == "load") {
        infile_ = arg;
        is_valid_option = true;
    }
    else if (option == "save") {
        outfile_ = arg;
        is_valid_option = true;
    }
    else if (option == "load-train") {
        training_infile_ = arg;
        is_valid_option = true;
    }
    else if (option == "save-train") {
        training_outfile_ = arg;
        is_valid_option = true;
    }
    else if (option == "load-ground-truth") {
        ground_truth_infile_ = arg;
        is_valid_option = true;
    }
    else if (option == "save-ground-truth") {
        ground_truth_outfile_ = arg;
        is_valid_option = true;
    }
    else if (option == "log") {
        logfile_ = arg;
        std::ofstream logfile;
        logfile.open(logfile_);
        logfile.close();
        is_valid_option = true;
    }
    else if (option == "dnn-activation") {
        dnn_params_.activation_candidates.clear();
        std::stringstream activation_functions(arg);
        std::string activation;
        while (std::getline(activation_functions, activation, ',')) {
            if (activation == "SIGMOID") {
                dnn_params_.activation_candidates.push_back(FANN::activation_function_enum::SIGMOID);
            }
            else if (activation == "RELU") {
                dnn_params_.activation_candidates.push_back(FANN::activation_function_enum::RELU);
            }
            else {
                throw Error(std::string("Invalid argument ") + arg + " for option dnn-activation. Usage: options = \"[--dnn-activation SIGMOID|RELU[,SIGMOID|RELU]] [...]\"");
            }
        }
        if (dnn_params_.activation_candidates.empty()) {
            throw Error(std::string("Invalid argument ") + arg + " for option dnn-activation. Usage: options = \"[--dnn-activation SIGMOID|RELU[,SIGMOID|RELU]] [...]\"");
        }
        is_valid_option = true;
    }
    else if (option == "dnn-hidden-layers-range") {
        std::stringstream hidden_layers_range(arg);
        std::string min_num_hidden_layers, max_num_hidden_layers;
        if (std::getline(hidden_layers_range, min_num_hidden_layers, ',') and std::getline(hidden_layers_range, max_num_hidden_layers, ',')) {
            try {
                dnn_params_.min_num_hidden_layers = std::stoi(min_num_hidden_layers);
            }
            catch (...) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option dnn-hidden-layers-range. Usage: options = \"[--dnn-hidden-layers-range <smaller convertible to int greater than 0>,<larger convertible to int greater than 0>] [...]\"");
            }
            try {
                dnn_params_.max_num_hidden_layers = std::stoi(max_num_hidden_layers);
            }
            catch (...) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option dnn-hidden-layers-range. Usage: options = \"[--dnn-hidden-layers-range <smaller convertible to int greater than 0>,<larger convertible to int greater than 0>] [...]\"");
            }
            if ((dnn_params_.min_num_hidden_layers > dnn_params_.max_num_hidden_layers) or (dnn_params_.min_num_hidden_layers < 0)) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option dnn-hidden-layers-range. Usage: options = \"[--dnn-hidden-layers-range <smaller convertible to int greater than 0>,<larger convertible to int greater than 0>] [...]\"");
            }
        }
        else {
            throw Error(std::string("Invalid argument \"") + arg + "\" for option dnn-hidden-layers-range. Usage: options = \"[--dnn-hidden-layers-range <smaller convertible to int greater than 0>,<larger convertible to int greater than 0>] [...]\"");
        }
        is_valid_option = true;
    }
    else if (option == "dnn-neurons-per-layer-range") {
        std::stringstream neurons_per_layer_range(arg);
        std::string min_neurons_per_layer, max_neurons_per_layer;
        if (std::getline(neurons_per_layer_range, min_neurons_per_layer, ',') and std::getline(neurons_per_layer_range, max_neurons_per_layer, ',')) {
            try {
                dnn_params_.min_num_neurons_per_layer = std::stoi(min_neurons_per_layer);
            }
            catch (...) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option dnn-neurons-per-layer-range. Usage: options = \"[--dnn-neurons-per-layer-range <smaller convertible to int greater than 0>,<larger convertible to int greater than 0>] [...]\"");
            }
            try {
                dnn_params_.max_num_neurons_per_layer = std::stoi(max_neurons_per_layer);
            }
            catch (...) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option dnn-neurons-per-layer-range. Usage: options = \"[--dnn-neurons-per-layer-range <smaller convertible to int greater than 0>,<larger convertible to int greater than 0>] [...]\"");
            }
            if ((dnn_params_.min_num_neurons_per_layer > dnn_params_.max_num_neurons_per_layer) or (dnn_params_.min_num_neurons_per_layer <= 0)) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option dnn-neurons-per-layer-range. Usage: options = \"[--dnn-neurons-per-layer-range <smaller convertible to int greater than 0>,<larger convertible to int greater than 0>] [...]\"");
            }
        }
        else {
            throw Error(std::string("Invalid argument \"") + arg + "\" for option dnn-neurons-per-layer-range. Usage: options = \"[--dnn-neurons-per-layer-range <smaller convertible to int greater than 0>,<larger convertible to int greater than 0>] [...]\"");
        }
        is_valid_option = true;
    }
    else if (option == "svm-gamma-exp-range") {
        std::stringstream gamma_exp_range(arg);
        std::string min_gamma_exp, max_gamma_exp;
        if (std::getline(gamma_exp_range, min_gamma_exp, ',') and std::getline(gamma_exp_range, max_gamma_exp, ',')) {
            try {
                svm_params_.min_gamma_exp = std::stoi(min_gamma_exp);
            }
            catch (...) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option svm-gamma-exp-range. Usage: options = \"[--svm-gamma-exp-range <smaller convertible to int>,<larger convertible to int>] [...]\"");
            }
            try {
                svm_params_.max_gamma_exp = std::stoi(max_gamma_exp);
            }
            catch (...) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option svm-gamma-exp-range. Usage: options = \"[--svm-gamma-exp-range <smaller convertible to int>,<larger convertible to int>] [...]\"");
            }
            if (svm_params_.min_gamma_exp > svm_params_.max_gamma_exp) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option svm-gamma-exp-range. Usage: options = \"[--svm-gamma-exp-range <smaller convertible to int>,<larger convertible to int>] [...]\"");
            }
        }
        else {
            throw Error(std::string("Invalid argument \"") + arg + "\" for option svm-gamma-exp-range. Usage: options = \"[--svm-gamma-exp-range <smaller convertible to int>,<larger convertible to int>] [...]\"");
        }
        is_valid_option = true;
    }
    else if (option == "svm-c-exp-range") {
        std::stringstream c_exp_range(arg);
        std::string min_c_exp, max_c_exp;
        if (std::getline(c_exp_range, min_c_exp, ',') and std::getline(c_exp_range, max_c_exp, ',')) {
            try {
                svm_params_.min_c_exp = std::stoi(min_c_exp);
            }
            catch (...) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option svm-c-exp-range. Usage: options = \"[--svm-c-exp-range <smaller convertible to int>,<larger convertible to int>] [...]\"");
            }
            try {
                svm_params_.max_c_exp = std::stoi(max_c_exp);
            }
            catch (...) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option svm-c-exp-range. Usage: options = \"[--svm-c-exp-range <smaller convertible to int>,<larger convertible to int>] [...]\"");
            }
            if (svm_params_.min_c_exp > svm_params_.max_c_exp) {
                throw Error(std::string("Invalid argument \"") + arg + "\" for option svm-c-exp-range. Usage: options = \"[--svm-c-exp-range <smaller convertible to int>,<larger convertible to int>] [...]\"");
            }
        }
        else {
            throw Error(std::string("Invalid argument \"") + arg + "\" for option svm-c-exp-range. Usage: options = \"[--svm-c-exp-range <smaller convertible to int>,<larger convertible to int>] [...]\"");
        }
        is_valid_option = true;
    }
    else if (option == "one-class-svm-likelihood") {
        if (arg == "TRUE") {
            one_class_svm_use_likelihood_ = true;
        }
        else if (arg == "FALSE") {
            one_class_svm_use_likelihood_ = false;
        }
        else {
            throw Error(std::string("Invalid argument ") + arg + " for option one-class-svm-likelihood. Usage: options = \"[--one-class-svm-likelihood TRUE|FALSE] [...]\"");
        }
        is_valid_option = true;
    }
    if (dynamic_cast<IPFP<UserNodeLabel, UserEdgeLabel> *>(ground_truth_method_)) {
        if (ground_truth_options_ == "") {
            ground_truth_method_->set_options(std::string("--initial-solutions 80 --ratio-runs-from-initial-solutions 0.5 --lower-bound-method BRANCH_TIGHT --threads ") + std::to_string(this->num_threads_));
        }
        else {
            ground_truth_method_->set_options(ground_truth_options_ + " --threads " + std::to_string(this->num_threads_));
        }
    }
    else {
        if (ground_truth_options_ == "") {
            ground_truth_method_->set_options(std::string("--threads ") + std::to_string(this->num_threads_));
        }
        else {
            ground_truth_method_->set_options(ground_truth_options_ + " --threads " + std::to_string(this->num_threads_));
        }
    }
    is_valid_option = is_valid_option or ml_parse_option_(option, arg);
    return is_valid_option;
}
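
// An illustrative sketch (not part of GEDLIB): option strings as consumed by
// lsape_parse_option_() above. The concrete method object is hypothetical;
// set_options() and init() are the public entry points inherited from
// ged::GEDMethod.
//
//     method.set_options("--ml-method SVM --svm-gamma-exp-range -2,2 "
//             "--svm-c-exp-range -1,3 --save trained_model.svm");
//     method.init(); // trains the SVM and serializes it to trained_model.svm
//
// A later run can then skip training by passing "--load trained_model.svm".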

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
lsape_init_graph_(const GEDGraph & graph) {
    ml_init_graph_(graph);
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
lsape_default_post_graph_init_() {}

// === Definition of private helper functions. ===
template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
train_() {
    if (ml_method_ == DNN) {
        dnn_.train(dnn_training_data_, dnn_params_, outfile_, this->num_threads_);
    }
    else if (ml_method_ == SVM) {
        svm_.train(&svm_training_data_, svm_params_, num_features_, outfile_, this->num_threads_);
    }
    else {
        one_class_svm_.train(&svm_training_data_, one_class_svm_use_likelihood_, num_features_, outfile_);
    }
}

template<class UserNodeLabel, class UserEdgeLabel>
bool
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
initialized_for_prediction_(const GEDGraph & g, const GEDGraph & h) const {
    return ((prediction_initialized_.first == g.id()) and (prediction_initialized_.second == h.id()));
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
generate_assignments_(const GEDGraph & g, const GEDGraph & h) {

    // Compute ground truth.
    NodeMap ground_truth(g.num_nodes(), h.num_nodes());
    if (compute_or_load_ground_truth_()) {
        if (ground_truth_infile_ != "") {
            this->ged_data_.load_node_map(ground_truth_infile_, g.id(), h.id(), ground_truth);
        }
        else {
            Result result;
            ground_truth_method_->run_as_util(g, h, result);
            ground_truth = result.node_map(0);
            if (ground_truth_outfile_ != "") {
                this->ged_data_.save_node_map(ground_truth_outfile_, g.id(), h.id(), ground_truth);
            }
        }
    }

    // Initialize variables used for computing the feature vectors.
    ml_init_feature_variables_(g, h, this->num_threads_);
    std::vector<std::pair<std::size_t, std::size_t>> bad_assignments;
    std::size_t num_good_assignments{0};

    // Compute feature vectors. Skip bad assignments if called at initialization.
#ifdef _OPENMP
    omp_set_num_threads(this->num_threads_ - 1);
#pragma omp parallel for if(this->num_threads_ > 1)
#endif
    for (std::size_t row_in_master = 0; row_in_master <= g.num_nodes(); row_in_master++) {
        for (std::size_t col_in_master = 0; col_in_master <= h.num_nodes(); col_in_master++) {
            if ((row_in_master == g.num_nodes()) and (col_in_master == h.num_nodes())) {
                continue;
            }
            std::vector<double> feature_vector;
            bool good_assignment{false};
            if ((row_in_master < g.num_nodes()) and (col_in_master < h.num_nodes())) {
                if (compute_or_load_ground_truth_()) {
                    good_assignment = (ground_truth.image(row_in_master) == col_in_master);
                }
                if ((not this->initialized_) and (not good_assignment)) {
#ifdef _OPENMP
#pragma omp critical
#endif
                    {
                        bad_assignments.emplace_back(row_in_master, col_in_master);
                    }
                    continue;
                }
                ml_populate_substitution_feature_vector_(g, h, row_in_master, col_in_master, feature_vector);
            }
            else if (row_in_master < g.num_nodes()) {
                if (compute_or_load_ground_truth_()) {
                    good_assignment = (ground_truth.image(row_in_master) == GEDGraph::dummy_node());
                }
                if ((not this->initialized_) and (not good_assignment)) {
#ifdef _OPENMP
#pragma omp critical
#endif
                    {
                        bad_assignments.emplace_back(row_in_master, col_in_master);
                    }
                    continue;
                }
                ml_populate_deletion_feature_vector_(g, row_in_master, feature_vector);
            }
            else {
                if (compute_or_load_ground_truth_()) {
                    good_assignment = (ground_truth.pre_image(col_in_master) == GEDGraph::dummy_node());
                }
                if ((not this->initialized_) and (not good_assignment)) {
#ifdef _OPENMP
#pragma omp critical
#endif
                    {
                        bad_assignments.emplace_back(row_in_master, col_in_master);
                    }
                    continue;
                }
                ml_populate_insertion_feature_vector_(h, col_in_master, feature_vector);
            }
#ifdef _OPENMP
#pragma omp critical
#endif
            {
                assignments_.emplace_back(row_in_master, col_in_master, good_assignment, feature_vector);
                if (not this->initialized_) {
                    num_good_assignments++;
                }
            }
        }
    }

    // Sample bad assignments if called at initialization and a two-class SVM or a DNN is used.
    if ((not this->initialized_) and (ml_method_ != ONE_CLASS_SVM)) {
        std::random_device rng;
        std::mt19937 urng(rng());
        std::shuffle(bad_assignments.begin(), bad_assignments.end(), urng);
        std::size_t num_bad_assignments{bad_assignments.size()};
#ifdef _OPENMP
        omp_set_num_threads(this->num_threads_ - 1);
#pragma omp parallel for if(this->num_threads_ > 1)
#endif
        for (std::size_t i = 0; i < std::min(num_good_assignments, num_bad_assignments); i++) {
            std::size_t row_in_master{bad_assignments.at(i).first};
            std::size_t col_in_master{bad_assignments.at(i).second};
            std::vector<double> feature_vector;
            if ((row_in_master < g.num_nodes()) and (col_in_master < h.num_nodes())) {
                ml_populate_substitution_feature_vector_(g, h, row_in_master, col_in_master, feature_vector);
            }
            else if (row_in_master < g.num_nodes()) {
                ml_populate_deletion_feature_vector_(g, row_in_master, feature_vector);
            }
            else {
                ml_populate_insertion_feature_vector_(h, col_in_master, feature_vector);
            }
#ifdef _OPENMP
#pragma omp critical
#endif
            {
                assignments_.emplace_back(row_in_master, col_in_master, false, feature_vector);
            }
        }
    }
}
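
// Note on class balancing: when generate_assignments_() is called during
// initialization, i.e., while collecting training data, bad assignments are
// first gathered separately and then randomly subsampled such that at most as
// many bad as good assignments enter the training data. If a one class SVM is
// used, the training data consists of good assignments only.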

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
save_training_data_() {
    std::ofstream training_data_file(training_outfile_);
    for (const Assignment_ & assignment : assignments_) {
        training_data_file << assignment.to_string() << "\n";
    }
    training_data_file.close();
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
load_or_generate_training_data_() {

    // Clear variables.
    assignments_.clear();
    dnn_feature_vectors_.clear();
    dnn_types_.clear();
    svm_feature_vectors_.clear();
    svm_types_.clear();

    // Load or generate training data.
    if (training_infile_ != "") {
        std::cout << "Loading training data from file " << training_infile_ << " ... " << std::flush;
        std::ifstream training_data_file(training_infile_);
        if (not training_data_file.good()) {
            throw Error("Error loading training data from file " + training_infile_ + ". File cannot be opened.");
        }
        std::string line;
        while (std::getline(training_data_file, line)) {
            assignments_.emplace_back(line, num_features_);
        }
        training_data_file.close();
        std::cout << "Done. Training data has size " << assignments_.size() << ".\n";
    }
    else {
        // Generate the assignments.
        ProgressBar progress_bar(this->ged_data_.num_graphs() * this->ged_data_.num_graphs());
        std::cout << "\rGenerating training data: " << progress_bar << std::flush;
        for (auto g = this->ged_data_.begin(); g != this->ged_data_.end(); g++) {
            if (this->ged_data_.is_shuffled_graph_copy(g->id())) {
                continue;
            }
            for (auto h = this->ged_data_.begin(); h != this->ged_data_.end(); h++) {
                if (this->ged_data_.is_shuffled_graph_copy(h->id())) {
                    continue;
                }
                if (g->num_nodes() == 0 or h->num_nodes() == 0) {
                    progress_bar.increment();
                    std::cout << "\rGenerating training data: " << progress_bar << std::flush;
                    continue;
                }
                // Generate assignments between g and h.
                if (this->ged_data_.shuffled_graph_copies_available() and (g->id() == h->id())) {
                    generate_assignments_(*g, this->ged_data_.graph(this->ged_data_.id_shuffled_graph_copy(h->id())));
                }
                else {
                    generate_assignments_(*g, *h);
                }
                progress_bar.increment();
                std::cout << "\rGenerating training data: " << progress_bar << std::flush;
            }
        }
        std::cout << "\n";
        std::mt19937 urng(std::random_device{}());
        std::shuffle(assignments_.begin(), assignments_.end(), urng);
    }

    // Initialize the specialized training data for DNN_ or SVM_.
    if (ml_method_ == DNN) {
        for (std::size_t pos{0}; pos < assignments_.size(); pos++) {
            dnn_feature_vectors_.push_back(assignments_[pos].dnn_feature_vector());
            dnn_types_.push_back(assignments_[pos].type());
        }
        dnn_training_data_.set_train_data(static_cast<unsigned int>(dnn_feature_vectors_.size()), static_cast<unsigned int>(num_features_), dnn_feature_vectors_.data(), 1, dnn_types_.data());
    }
    else {
        for (std::size_t pos{0}; pos < assignments_.size(); pos++) {
            svm_feature_vectors_.push_back(assignments_[pos].svm_feature_vector());
            if (ml_method_ == SVM) {
                svm_types_.push_back((*assignments_[pos].type()) + 1);
            }
            else {
                svm_types_.push_back(1);
            }
        }
        svm_training_data_.l = static_cast<int>(svm_feature_vectors_.size());
        svm_training_data_.y = svm_types_.data();
        svm_training_data_.x = svm_feature_vectors_.data();
    }
}

template<class UserNodeLabel, class UserEdgeLabel>
double
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
decision_value_(Assignment_ & assignment) {
    if (ml_method_ == DNN) {
        return dnn_.decision_value(assignment.dnn_feature_vector());
    }
    else if (ml_method_ == SVM) {
        return svm_.decision_value(assignment.svm_feature_vector());
    }
    return one_class_svm_.decision_value(assignment.svm_feature_vector());
}

template<class UserNodeLabel, class UserEdgeLabel>
bool
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
load_config_file_() const {
    return (infile_ != "");
}

template<class UserNodeLabel, class UserEdgeLabel>
bool
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
log_prediction_ratios_() const {
    return (logfile_ != "");
}

template<class UserNodeLabel, class UserEdgeLabel>
bool
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
compute_or_load_ground_truth_() const {
    return (((not load_config_file_()) and (not this->initialized_)) or log_prediction_ratios_());
}

// === Definition of private class Assignment_. ===
template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
Assignment_(std::size_t row_in_master, std::size_t col_in_master, bool good_assignment, const std::vector<double> & feature_vector) :
row_in_master_{row_in_master},
col_in_master_{col_in_master},
type_{good_assignment ? 0.0 : 1.0},
dnn_feature_vector_(feature_vector),
svm_feature_vector_() {
    svm_node node;
    for (std::size_t index{0}; index < feature_vector.size(); index++) {
        if (feature_vector.at(index) != 0.0) {
            node.index = index + 1;
            node.value = feature_vector.at(index);
            svm_feature_vector_.push_back(node);
        }
    }
    node.index = -1;
    node.value = -1;
    svm_feature_vector_.push_back(node);
}
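
// Note: svm_feature_vector_ holds the feature vector in LIBSVM's sparse format,
// i.e., as a sequence of (index, value) pairs with 1-based indices in which
// zero-valued features are omitted and which is terminated by a sentinel node
// with index -1.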

template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
Assignment_(const Assignment_ & assignment) :
row_in_master_{assignment.row_in_master_},
col_in_master_{assignment.col_in_master_},
type_{assignment.type_},
dnn_feature_vector_(assignment.dnn_feature_vector_),
svm_feature_vector_(assignment.svm_feature_vector_) {}

template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
Assignment_(const std::string & line, std::size_t num_features) :
row_in_master_{undefined()},
col_in_master_{undefined()},
type_{1.0},
dnn_feature_vector_(),
svm_feature_vector_() {
    std::istringstream line_stream(line);
    std::string type_str;
    if (not std::getline(line_stream, type_str, ' ')) {
        throw Error("Reading training data failed.\nExpected format of lines: \"0|1 <value 1> ... <value num_features>\"\nLine: \"" + line + "\"");
    }
    try {
        type_ = std::stod(type_str);
    }
    catch (...) {
        throw Error("Reading training data failed.\nExpected format of lines: \"0|1 <value 1> ... <value num_features>\"\nLine: \"" + line + "\"");
    }
    if ((type_ != 0) and (type_ != 1)) {
        throw Error("Reading training data failed.\nExpected format of lines: \"0|1 <value 1> ... <value num_features>\"\nLine: \"" + line + "\"");
    }
    std::string value_str;
    double value;
    int svm_index{0};
    svm_node node;
    while (std::getline(line_stream, value_str, ' ')) {
        svm_index++;
        try {
            value = std::stod(value_str);
        }
        catch (...) {
            throw Error("Reading training data failed.\nExpected format of lines: \"0|1 <value 1> ... <value num_features>\"\nLine: \"" + line + "\"");
        }
        dnn_feature_vector_.push_back(value);
        if (value != 0) {
            node.index = svm_index;
            node.value = value;
            svm_feature_vector_.push_back(node);
        }
    }
    node.index = -1;
    svm_feature_vector_.push_back(node);
    if (dnn_feature_vector_.size() != num_features) {
        throw Error("Reading training data failed.\nExpected format of lines: \"0|1 <value 1> ... <value num_features>\"\nLine: \"" + line + "\"");
    }
}
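
// Example of a well-formed training data line for num_features == 3 (the
// values are illustrative): "1 0.500000 0.000000 0.250000". The leading 0 or 1
// encodes a good or bad assignment, respectively, followed by the num_features
// feature values separated by single spaces.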

template<class UserNodeLabel, class UserEdgeLabel>
std::size_t
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
row_in_master() const {
    return row_in_master_;
}

template<class UserNodeLabel, class UserEdgeLabel>
std::size_t
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
col_in_master() const {
    return col_in_master_;
}

template<class UserNodeLabel, class UserEdgeLabel>
std::string
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
to_string() const {
    std::string return_str(std::to_string(type_));
    for (double feature : dnn_feature_vector_) {
        return_str += " " + std::to_string(feature);
    }
    return return_str;
}

template<class UserNodeLabel, class UserEdgeLabel>
double *
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
dnn_feature_vector() {
    return dnn_feature_vector_.data();
}

template<class UserNodeLabel, class UserEdgeLabel>
struct svm_node *
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
svm_feature_vector() {
    return svm_feature_vector_.data();
}

template<class UserNodeLabel, class UserEdgeLabel>
double *
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
type() {
    return &type_;
}

template<class UserNodeLabel, class UserEdgeLabel>
bool
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
is_good_assignment() const {
    return (type_ == 0);
}

template<class UserNodeLabel, class UserEdgeLabel>
std::size_t
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
Assignment_ ::
num_features() const {
    return dnn_feature_vector_.size();
}

// === Definition of private struct DNNParams_. ===
template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
DNNParams_ ::
DNNParams_() :
activation_candidates{FANN::activation_function_enum::RELU, FANN::activation_function_enum::SIGMOID},
min_num_hidden_layers{1},
max_num_hidden_layers{10},
min_num_neurons_per_layer{1},
max_num_neurons_per_layer{20} {}

// === Definition of private class DNN_. ===
template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
DNN_ ::
DNN_() :
neural_net_() {}

template<class UserNodeLabel, class UserEdgeLabel>
std::size_t
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
DNN_ ::
load(const std::string & filename) {
    neural_net_.create_from_file(filename);
    return static_cast<std::size_t>(neural_net_.get_num_input());
}

template<class UserNodeLabel, class UserEdgeLabel>
float
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
DNN_ ::
cross_validate_(FANN::training_data & training_data, const MLBasedMethod::DNNParams_ & params, unsigned int num_hidden_layers, unsigned int num_neurons_per_layer, FANN::activation_function_enum hidden_activation) {

    std::size_t num_folds{5};
    std::size_t max_num_epochs{100};

    // Set up the neural network.
    std::vector<unsigned int> structure_dnn{training_data.num_input_train_data()};
    for (unsigned int hidden_layer{0}; hidden_layer < num_hidden_layers; hidden_layer++) {
        structure_dnn.push_back(num_neurons_per_layer);
    }
    structure_dnn.push_back(training_data.num_output_train_data());
    FANN::neural_net neural_net;
    neural_net.create_standard_array(structure_dnn.size(), structure_dnn.data());
    neural_net.set_activation_function_hidden(hidden_activation);
    neural_net.set_activation_steepness_hidden(1.0);
    neural_net.set_activation_function_output(FANN::activation_function_enum::SIGMOID);
    neural_net.set_activation_steepness_output(1.0);
    neural_net.set_train_error_function(FANN::error_function_enum::ERRORFUNC_LINEAR);
    neural_net.set_training_algorithm(FANN::training_algorithm_enum::TRAIN_INCREMENTAL);

    // Divide the training data into folds.
    unsigned int size_training_data{training_data.length_train_data()};
    unsigned int size_folds{size_training_data / static_cast<unsigned int>(num_folds)};
    double ** input_train_data{training_data.get_input()};
    double ** output_train_data{training_data.get_output()};
    std::vector<std::vector<double *>> folds_input(num_folds);
    std::vector<std::vector<double *>> folds_output(num_folds);
    std::size_t pos{0};
    for (unsigned int fold{0}; fold < num_folds - 1; fold++) {
        for (unsigned int counter{0}; counter < size_folds; counter++) {
            folds_input[fold].push_back(input_train_data[pos]);
            folds_output[fold].push_back(output_train_data[pos]);
            pos++;
        }
    }
    for (; pos < size_training_data; pos++) {
        folds_input[num_folds - 1].push_back(input_train_data[pos]);
        folds_output[num_folds - 1].push_back(output_train_data[pos]);
    }

    // Train and validate on the folds.
    float validation_error{0.0};
    for (unsigned int validation_fold{0}; validation_fold < num_folds; validation_fold++) {
        std::vector<double *> training_folds_input;
        std::vector<double *> training_folds_output;
        for (unsigned int fold{0}; fold < num_folds; fold++) {
            if (fold != validation_fold) {
                for (auto ptr : folds_input[fold]) {
                    training_folds_input.push_back(ptr);
                }
                for (auto ptr : folds_output[fold]) {
                    training_folds_output.push_back(ptr);
                }
            }
        }
        FANN::training_data training_folds_data;
        training_folds_data.set_train_data(training_folds_input.size(), training_data.num_input_train_data(), training_folds_input.data(), training_data.num_output_train_data(), training_folds_output.data());
        FANN::training_data validation_fold_data;
        validation_fold_data.set_train_data(folds_input[validation_fold].size(), training_data.num_input_train_data(), folds_input[validation_fold].data(), training_data.num_output_train_data(), folds_output[validation_fold].data());
        validation_error += train_and_validate_(neural_net, training_folds_data, validation_fold_data, max_num_epochs);
    }
    return validation_error;
}

template<class UserNodeLabel, class UserEdgeLabel>
float
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
DNN_ ::
train_and_validate_(FANN::neural_net & neural_net, FANN::training_data & training_data, FANN::training_data & validation_data, std::size_t max_num_epochs) {
    float min_validation_error{std::numeric_limits<float>::infinity()};
    float current_validation_error{std::numeric_limits<float>::infinity()};
    for (std::size_t epoch{0}; epoch < max_num_epochs; epoch++) {
        neural_net.train_epoch(training_data);
        current_validation_error = neural_net.test_data(validation_data);
        min_validation_error = std::min(min_validation_error, current_validation_error);
        if ((current_validation_error < 0.01) or (current_validation_error / min_validation_error > 1.05)) {
            break;
        }
    }
    return current_validation_error;
}
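
// Note on the stopping rule above: training runs for at most max_num_epochs
// epochs but terminates early as soon as the validation error drops below 0.01
// or exceeds the best validation error seen so far by more than 5 percent.
// Training the network with this form of early stopping guards against
// overfitting during model selection.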

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
DNN_ ::
train(FANN::training_data & training_data, const MLBasedMethod::DNNParams_ & params, const std::string & filename, std::size_t num_threads) {

    training_data.scale_input_train_data(0.0, 1.0);

    // Carry out cross-validation to determine the network structure.
    unsigned int optimal_num_hidden_layers{params.min_num_hidden_layers};
    unsigned int optimal_num_neurons_per_layer{params.min_num_neurons_per_layer};
    FANN::activation_function_enum optimal_hidden_activation{params.activation_candidates.at(0)};
    std::size_t num_activation_candidates{params.activation_candidates.size()};
    std::size_t num_hidden_layers_candidates{1 + params.max_num_hidden_layers - params.min_num_hidden_layers};
    std::size_t num_neurons_per_layer_candidates{1 + params.max_num_neurons_per_layer - params.min_num_neurons_per_layer};
    std::size_t progress_bar_size{1};
    if ((num_activation_candidates * num_hidden_layers_candidates * num_neurons_per_layer_candidates) > 1) {
        progress_bar_size = 1 + (num_activation_candidates * num_hidden_layers_candidates * num_neurons_per_layer_candidates);
    }

    ProgressBar progress_bar(progress_bar_size);
    std::cout << "\rTraining DNN: " << progress_bar << std::flush;
    if ((num_activation_candidates * num_hidden_layers_candidates * num_neurons_per_layer_candidates) > 1) {
        float optimal_validation_error{std::numeric_limits<float>::infinity()};
#ifdef _OPENMP
        omp_set_num_threads(num_threads - 1);
#pragma omp parallel for schedule(dynamic) if(num_threads > 1)
#endif
        for (unsigned int num_hidden_layers = params.min_num_hidden_layers; num_hidden_layers <= params.max_num_hidden_layers; num_hidden_layers++) {
            for (unsigned int num_neurons_per_layer = params.min_num_neurons_per_layer; num_neurons_per_layer <= params.max_num_neurons_per_layer; num_neurons_per_layer++) {
                for (FANN::activation_function_enum hidden_activation : params.activation_candidates) {
                    float validation_error{cross_validate_(training_data, params, num_hidden_layers, num_neurons_per_layer, hidden_activation)};
#ifdef _OPENMP
#pragma omp critical
#endif
                    {
                        if (validation_error < optimal_validation_error) {
                            optimal_num_hidden_layers = num_hidden_layers;
                            optimal_num_neurons_per_layer = num_neurons_per_layer;
                            optimal_validation_error = validation_error;
                            optimal_hidden_activation = hidden_activation;
                        }
                        progress_bar.increment();
                        std::cout << "\rTraining DNN: " << progress_bar << std::flush;
                    }
                }
            }
        }
    }

    // Set up the neural network.
    std::vector<unsigned int> structure_dnn{training_data.num_input_train_data()};
    for (unsigned int hidden_layer{0}; hidden_layer < optimal_num_hidden_layers; hidden_layer++) {
        structure_dnn.push_back(optimal_num_neurons_per_layer);
    }
    structure_dnn.push_back(training_data.num_output_train_data());
    neural_net_.create_standard_array(structure_dnn.size(), structure_dnn.data());
    neural_net_.set_activation_function_hidden(optimal_hidden_activation);
    neural_net_.set_activation_steepness_hidden(1.0);
    neural_net_.set_activation_function_output(FANN::activation_function_enum::SIGMOID);
    neural_net_.set_activation_steepness_output(1.0);
    neural_net_.set_train_error_function(FANN::error_function_enum::ERRORFUNC_LINEAR);
    neural_net_.set_training_algorithm(FANN::training_algorithm_enum::TRAIN_INCREMENTAL);

    // Divide the training data into training and validation data.
    double ** input_data{training_data.get_input()};
    double ** output_data{training_data.get_output()};
    unsigned int size_data{training_data.length_train_data()};
    std::vector<double *> train_input;
    std::vector<double *> train_output;
    std::vector<double *> valid_input;
    std::vector<double *> valid_output;
    for (unsigned int pos{0}; pos < size_data; pos++) {
        if (pos <= size_data / 5) {
            valid_input.push_back(input_data[pos]);
            valid_output.push_back(output_data[pos]);
        }
        else {
            train_input.push_back(input_data[pos]);
            train_output.push_back(output_data[pos]);
        }
    }

    // Train the neural network.
    std::size_t max_num_epochs_training{5000};
    FANN::training_data train_data;
    train_data.set_train_data(train_input.size(), training_data.num_input_train_data(), train_input.data(), training_data.num_output_train_data(), train_output.data());
    FANN::training_data valid_data;
    valid_data.set_train_data(valid_input.size(), training_data.num_input_train_data(), valid_input.data(), training_data.num_output_train_data(), valid_output.data());
    float validation_error{train_and_validate_(neural_net_, train_data, valid_data, max_num_epochs_training)};
    progress_bar.increment();
    std::cout << "\rTraining DNN: " << progress_bar << std::flush;
    std::cout << "\nNetwork structure: " << training_data.num_input_train_data() << " x ";
    for (unsigned int layer{0}; layer < optimal_num_hidden_layers; layer++) {
        std::cout << optimal_num_neurons_per_layer << " x ";
    }
    std::cout << training_data.num_output_train_data() << ". Hidden layer activation: ";
    if (optimal_hidden_activation == FANN::activation_function_enum::SIGMOID) {
        std::cout << "Sigmoid";
    }
    else {
        std::cout << "ReLU";
    }
    std::cout << ". Validation error: " << validation_error << "\n";

    // Save the trained neural network.
    if (filename != "") {
        neural_net_.save(filename);
    }
}

template<class UserNodeLabel, class UserEdgeLabel>
double
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
DNN_ ::
decision_value(double * feature_vector) {
    double * output = neural_net_.run(feature_vector);
    return (*output);
}

// === Definition of private struct SVMParams_. ===
template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
SVMParams_ ::
SVMParams_() :
min_gamma_exp{-3},
max_gamma_exp{3},
min_c_exp{-3},
max_c_exp{3},
min_nu{0.5},
max_nu{0.5} {}

// === Definition of private class SVM_. ===
template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
SVM_ ::
SVM_() :
svm_model_{nullptr} {}

template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
SVM_ ::
~SVM_() {
    svm_free_and_destroy_model(&svm_model_);
}

template<class UserNodeLabel, class UserEdgeLabel>
std::size_t
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
SVM_ ::
load(const std::string & filename) {
    svm_model_ = svm_load_model(filename.c_str());
    if (svm_check_probability_model(svm_model_) == 0) {
        throw Error("SVM model does not support probability estimates.");
    }
    std::map<std::string, std::string> options;
    util::parse_config_file(filename + ".nf", options);
    return std::stoul(options.at("num_features"));
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
SVM_ ::
train(struct svm_problem * training_data, const MLBasedMethod::SVMParams_ & params, std::size_t num_features, const std::string & filename, std::size_t num_threads) {

    // Set the meta-parameters.
    struct svm_parameter svm_params;
    svm_params.gamma = std::pow(10, params.min_gamma_exp);
    svm_params.C = std::pow(10, params.min_c_exp);
    svm_params.svm_type = C_SVC;
    svm_params.kernel_type = RBF;
    svm_params.coef0 = 0;
    svm_params.degree = 0;
    svm_params.eps = 0.001;
    svm_params.cache_size = 100;
    svm_params.shrinking = 0;
    svm_params.probability = 1;
    svm_params.nr_weight = 0;
    svm_params.weight_label = nullptr;
    svm_params.weight = nullptr;
    svm_params.p = 0;
    svm_params.nu = 0;
    const char * error_msg;
    std::string error_msg_string;

    std::size_t progress_bar_size{1};
    if ((params.min_gamma_exp < params.max_gamma_exp) or (params.min_c_exp < params.max_c_exp)) {
        progress_bar_size += (1 + params.max_gamma_exp - params.min_gamma_exp) * (1 + params.max_c_exp - params.min_c_exp);
    }
    ProgressBar progress_bar(progress_bar_size);
    std::cout << "\rTraining SVM: " << progress_bar << std::flush;
    // Cross-validate to find the best values for gamma and C.
    if ((params.min_gamma_exp < params.max_gamma_exp) or (params.min_c_exp < params.max_c_exp)) {
        std::vector<double> predicted_y(training_data->l);
        std::size_t max_correct{0};
        int best_gamma_exp{params.min_gamma_exp};
        int best_c_exp{params.min_c_exp};
        std::size_t num_folds{5};
#ifdef _OPENMP
        omp_set_num_threads(num_threads - 1);
#pragma omp parallel for if(num_threads > 1)
#endif
        for (int gamma_exp = params.min_gamma_exp; gamma_exp <= params.max_gamma_exp; gamma_exp++) {
            struct svm_parameter local_svm_params = svm_params;
            local_svm_params.probability = 0;
            local_svm_params.gamma = std::pow(10, static_cast<double>(gamma_exp));
            for (int c_exp{params.min_c_exp}; c_exp <= params.max_c_exp; c_exp++) {
                local_svm_params.C = std::pow(10, static_cast<double>(c_exp));
                error_msg = svm_check_parameter(training_data, &local_svm_params);
                if (error_msg) {
                    error_msg_string = std::string(error_msg);
                    throw Error(error_msg_string);
                }
                svm_cross_validation(training_data, &local_svm_params, num_folds, predicted_y.data());
                std::size_t correct{0};
                for (int i{0}; i < training_data->l; i++) {
                    if (training_data->y[i] == predicted_y.at(i)) {
                        correct++;
                    }
                }
#ifdef _OPENMP
#pragma omp critical
#endif
                {
                    if (correct > max_correct) {
                        max_correct = correct;
                        best_gamma_exp = gamma_exp;
                        best_c_exp = c_exp;
                    }
                    progress_bar.increment();
                    std::cout << "\rTraining SVM: " << progress_bar << std::flush;
                }
            }
        }
        svm_params.gamma = std::pow(10, static_cast<double>(best_gamma_exp));
        svm_params.C = std::pow(10, static_cast<double>(best_c_exp));
    }

    // Train the SVM with the found parameters.
    error_msg = svm_check_parameter(training_data, &svm_params);
    if (error_msg) {
        error_msg_string = std::string(error_msg);
        throw Error(error_msg_string);
    }
    svm_model_ = svm_train(training_data, &svm_params);
    progress_bar.increment();
    std::cout << "\rTraining SVM: " << progress_bar << std::flush << "\n";
    if (svm_check_probability_model(svm_model_) == 0) {
        throw Error("SVM model does not support probability estimates.");
    }

    // Save the trained SVM.
    if (filename != "") {
        if (svm_save_model(filename.c_str(), svm_model_)) {
            throw Error("Cannot save SVM model to " + filename + ".");
        }
        std::map<std::string, std::string> options;
        options["num_features"] = std::to_string(num_features);
        util::save_as_config_file(filename + ".nf", options);
    }
}

template<class UserNodeLabel, class UserEdgeLabel>
double
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
SVM_ ::
decision_value(struct svm_node * feature_vector) const {
    std::vector<double> probability_estimates(2);
    double predicted_type{svm_predict_probability(svm_model_, feature_vector, probability_estimates.data())};
    double higher_prob = std::max(probability_estimates.at(0), probability_estimates.at(1));
    double lower_prob = std::min(probability_estimates.at(0), probability_estimates.at(1));
    if (predicted_type == 2) {
        return higher_prob;
    }
    else if (predicted_type != 1) {
        throw Error("Unexpected predicted type " + std::to_string(predicted_type) + ".");
    }
    return lower_prob;
}
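
// Note: since good and bad assignments are encoded as the LIBSVM labels 1 and
// 2, respectively, the value returned above approximates the estimated
// probability that the assignment is bad. This is the value that
// lsape_populate_instance_() writes into the LSAPE master problem, so
// expensive node assignments are those that the model predicts to be bad.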

// === Definition of private class OneClassSVM_. ===
template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
OneClassSVM_ ::
OneClassSVM_() :
svm_model_{nullptr},
rho_{0},
sum_alpha_{0},
scale_factor_{1},
use_likelihood_{false} {}

template<class UserNodeLabel, class UserEdgeLabel>
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
OneClassSVM_ ::
~OneClassSVM_() {
    svm_free_and_destroy_model(&svm_model_);
}

template<class UserNodeLabel, class UserEdgeLabel>
std::size_t
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
OneClassSVM_ ::
load(const std::string & filename, bool use_likelihood) {
    use_likelihood_ = use_likelihood;
    svm_model_ = svm_load_model(filename.c_str());
    std::map<std::string, std::string> options;
    util::parse_config_file(filename + ".nf", options);
    std::size_t num_features{std::stoul(options.at("num_features"))};
    compute_rho_and_scale_factor_(num_features);
    return num_features;
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
OneClassSVM_ ::
train(struct svm_problem * training_data, bool use_likelihood, std::size_t num_features, const std::string & filename) {

    // Set the meta-parameters.
    use_likelihood_ = use_likelihood;
    struct svm_parameter svm_params;
    svm_params.gamma = 1.0 / static_cast<double>(num_features);
    svm_params.C = 0;
    svm_params.svm_type = ONE_CLASS;
    svm_params.kernel_type = RBF;
    svm_params.coef0 = 0;
    svm_params.degree = 0;
    svm_params.eps = 0.001;
    svm_params.cache_size = 100;
    svm_params.shrinking = 0;
    svm_params.probability = 0;
    svm_params.nr_weight = 0;
    svm_params.weight_label = nullptr;
    svm_params.weight = nullptr;
    svm_params.p = 0;
    svm_params.nu = 0.5;
    const char * error_msg;
    std::string error_msg_string;

    std::size_t progress_bar_size{1};
    ProgressBar progress_bar(progress_bar_size);
    std::cout << "\rTraining one class SVM: " << progress_bar << std::flush;

    // Train the SVM.
    error_msg = svm_check_parameter(training_data, &svm_params);
    if (error_msg) {
        error_msg_string = std::string(error_msg);
        throw Error(error_msg_string);
    }
    svm_model_ = svm_train(training_data, &svm_params);
    progress_bar.increment();
    std::cout << "\rTraining one class SVM: " << progress_bar << std::flush << "\n";
    compute_rho_and_scale_factor_(num_features);

    // Save the trained SVM.
    if (filename != "") {
        if (svm_save_model(filename.c_str(), svm_model_)) {
            throw Error("Cannot save SVM model to " + filename + ".");
        }
        std::map<std::string, std::string> options;
        options["num_features"] = std::to_string(num_features);
        util::save_as_config_file(filename + ".nf", options);
    }
}

template<class UserNodeLabel, class UserEdgeLabel>
double
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
OneClassSVM_ ::
decision_value(struct svm_node * feature_vector) const {
    std::vector<double> dec_values(1);
    double predicted_type{svm_predict_values(svm_model_, feature_vector, dec_values.data())};
    if (predicted_type != 1 and predicted_type != -1) {
        throw Error("Unexpected predicted type " + std::to_string(predicted_type) + ".");
    }
    if (use_likelihood_) {
        return (1 - ((dec_values.at(0) - rho_) * scale_factor_));
    }
    else if (dec_values.at(0) >= 0.0) {
        return (0.5 - 0.5 * (dec_values.at(0) / (sum_alpha_ - rho_)));
    }
    return (0.5 - 0.5 * (dec_values.at(0) / rho_));
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
OneClassSVM_ ::
compute_rho_and_scale_factor_(std::size_t num_features) {
    rho_ = svm_model_->rho[0];
    double * alpha{svm_model_->sv_coef[0]};
    sum_alpha_ = 0;
    for (int i{0}; i < svm_model_->l; i++) {
        sum_alpha_ += alpha[i];
    }
    scale_factor_ = std::pow(svm_model_->param.gamma / pi(), static_cast<double>(num_features) / 2.0) / sum_alpha_;
}
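
// Note on the scaling computed above: with the RBF kernel
// k(x, y) = exp(-gamma * ||x - y||^2), which integrates to (pi / gamma)^(d / 2)
// over the d-dimensional feature space, dividing a kernel expansion
// sum_i alpha_i * k(x_i, x) by sum_alpha_ and multiplying by
// (gamma / pi)^(d / 2) normalizes the expansion to a density estimate.
// decision_value() uses this scale factor when interpreting the one class
// SVM's raw decision value as a likelihood.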

// === Default definitions of virtual member functions to be overridden by derived classes. ===
template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_init_feature_variables_(const GEDGraph & g, const GEDGraph & h, std::size_t num_threads) {}

template<class UserNodeLabel, class UserEdgeLabel>
std::string
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_valid_options_string_() const {
    return "";
}

template<class UserNodeLabel, class UserEdgeLabel>
bool
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_parse_option_(const std::string & option, const std::string & arg) {
    return false;
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_init_() {}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_set_default_options_() {}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_init_graph_(const GEDGraph & graph) {}

template<class UserNodeLabel, class UserEdgeLabel>
std::size_t
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_get_num_features_() {
    return undefined();
}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_init_for_num_features_() {}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_populate_substitution_feature_vector_(const GEDGraph & g, const GEDGraph & h, GEDGraph::NodeID i, GEDGraph::NodeID k, std::vector<double> & feature_vector) {}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_populate_deletion_feature_vector_(const GEDGraph & g, GEDGraph::NodeID i, std::vector<double> & feature_vector) {}

template<class UserNodeLabel, class UserEdgeLabel>
void
MLBasedMethod<UserNodeLabel, UserEdgeLabel>::
ml_populate_insertion_feature_vector_(const GEDGraph & h, GEDGraph::NodeID k, std::vector<double> & feature_vector) {}

}

#endif /* SRC_METHODS_ML_BASED_METHOD_IPP_ */