GEDLIB  1.0
util.hpp
Go to the documentation of this file.
1 /***************************************************************************
2  * *
3  * Copyright (C) 2018 by David B. Blumenthal *
4  * *
5  * This file is part of GEDLIB. *
6  * *
7  * GEDLIB is free software: you can redistribute it and/or modify it *
8  * under the terms of the GNU Lesser General Public License as published *
9  * by the Free Software Foundation, either version 3 of the License, or *
10  * (at your option) any later version. *
11  * *
12  * GEDLIB is distributed in the hope that it will be useful, *
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15  * GNU Lesser General Public License for more details. *
16  * *
17  * You should have received a copy of the GNU Lesser General Public *
18  * License along with GEDLIB. If not, see <http://www.gnu.org/licenses/>. *
19  * *
20  ***************************************************************************/
21 
31 #ifndef SRC_TESTS_PR2018_UTIL_HPP_
32 #define SRC_TESTS_PR2018_UTIL_HPP_
33 
34 #define GXL_GEDLIB_SHARED
35 #include "../../../src/env/ged_env.hpp"
36 
37 namespace util {
38 
/*!
 * @brief Checks whether a dataset name denotes one of the synthetic molecule datasets.
 * @param[in] dataset Name of the dataset.
 * @return True if and only if @p dataset equals "S-MOL_NL01", "S-MOL_NL04", "S-MOL_NL07", or "S-MOL_NL10".
 * @note Declared inline because it is defined in a header; a non-inline definition
 * would be emitted by every translation unit that includes this file.
 */
inline bool is_synth_mol_dataset(const std::string & dataset) {
    return (dataset == "S-MOL_NL01")
        || (dataset == "S-MOL_NL04")
        || (dataset == "S-MOL_NL07")
        || (dataset == "S-MOL_NL10");
}
42 
43 bool is_chemical_dataset(const std::string & dataset) {
44  return (is_synth_mol_dataset(dataset) or (dataset == "AIDS") or (dataset == "Mutagenicity") or (dataset == "acyclic") or (dataset == "alkane") or (dataset == "mao") or (dataset == "pah") );
45 }
46 
/*!
 * @brief Checks whether a dataset name denotes one of the Letter datasets.
 * @param[in] dataset Name of the dataset.
 * @return True if and only if @p dataset equals "Letter_HIGH", "Letter_LOW", or "Letter_MED".
 * @note Declared inline because it is defined in a header; a non-inline definition
 * would be emitted by every translation unit that includes this file.
 */
inline bool is_letter_dataset(const std::string & dataset) {
    return (dataset == "Letter_HIGH")
        || (dataset == "Letter_LOW")
        || (dataset == "Letter_MED");
}
50 
51 void check_dataset(const std::string & dataset) {
52  if (not (is_chemical_dataset(dataset) or is_letter_dataset(dataset) or (dataset == "CMU-GED") or (dataset == "Fingerprint") or (dataset == "GREC") or (dataset == "Protein"))) {
53  throw ged::Error(std::string("Dataset \"") + dataset + "\" does not exists.");
54  }
55 }
56 
57 std::string graph_dir(const std::string & dataset) {
58  std::string root_dir("../../../data/datasets/");
59  if ((dataset == "AIDS") or (dataset == "Fingerprint") or (dataset == "GREC") or (dataset == "Protein") or (dataset == "Mutagenicity")) {
60  return (root_dir + dataset + "/data/");
61  }
62  else if ((dataset == "Letter_HIGH")) {
63  return (root_dir + "Letter/HIGH/");
64  }
65  else if ((dataset == "Letter_LOW")) {
66  return (root_dir + "Letter/LOW/");
67  }
68  else if ((dataset == "Letter_MED")) {
69  return (root_dir + "Letter/MED/");
70  }
71  else if (dataset == "S-MOL_NL01") {
72  return (root_dir + "S-MOL/NL01/");
73  }
74  else if (dataset == "S-MOL_NL04") {
75  return (root_dir + "S-MOL/NL04/");
76  }
77  else if (dataset == "S-MOL_NL07") {
78  return (root_dir + "S-MOL/NL07/");
79  }
80  else if (dataset == "S-MOL_NL10") {
81  return (root_dir + "S-MOL/NL10/");
82  }
83  else if (dataset == "CMU-GED") {
84  return (root_dir + dataset + "/CMU/");
85  }
86  else if ((dataset == "acyclic") or (dataset == "alkane") or (dataset == "mao") or (dataset == "pah")) {
87  return (root_dir + dataset + "/");
88  }
89  else {
90  throw ged::Error(std::string("Dataset \"") + dataset + "\" does not exists.");
91  }
92  return "";
93 }
94 
95 std::string train_collection(const std::string & dataset) {
96  std::string root_dir("../collections/");
97  check_dataset(dataset);
98  if (is_letter_dataset(dataset)) {
99  return (root_dir + "Letter_50.xml");
100  }
101  if (is_synth_mol_dataset(dataset)) {
102  return (root_dir + "S-MOL_50.xml");
103  }
104  return root_dir + dataset + "_50.xml";
105 }
106 
107 std::string test_collection(const std::string & dataset) {
108  std::string root_dir("../collections/");
109  check_dataset(dataset);
110  if (is_letter_dataset(dataset)) {
111  return (root_dir + "Letter_100.xml");
112  }
113  if (is_synth_mol_dataset(dataset)) {
114  return (root_dir + "S-MOL_100.xml");
115  }
116  return root_dir + dataset + "_100.xml";
117 }
118 
119 std::string config_prefix(const std::string & dataset) {
120  check_dataset(dataset);
121  return std::string("../output/" + dataset + "_");
122 }
123 
124 std::string init_options(const std::string & dataset, const std::string & config_suffix, const std::string & data_suffix = "", bool save_train = false, bool load_train = false, std::size_t threads = 8) {
125  check_dataset(dataset);
126  std::string options("--threads ");
127  options += std::to_string(threads) + " --save ../output/";
128  options += dataset + "_" + config_suffix + ".ini";
129  if (save_train) {
130  if (load_train) {
131  throw ged::Error("Training data cannot be both saved and loaded.");
132  }
133  options += " --save-train ../output/" + dataset + "_" + data_suffix + ".data";
134  }
135  if (load_train) {
136  options += " --load-train ../output/" + dataset + "_" + data_suffix + ".data";
137  }
138  return options;
139 }
140 
141 std::string ground_truth_option(const std::string & dataset) {
142  check_dataset(dataset);
143  //if (is_letter_dataset(dataset)) {
144  // return std::string(" --ground-truth-method EXACT");
145  //}
146  return std::string(" --ground-truth-method IPFP");
147 }
148 
// Selects the ged::Options::EditCosts value for a dataset. Dispatches on chemical
// datasets, Letter datasets, "CMU-GED", "Fingerprint", "GREC" and "Protein";
// any other name throws ged::Error.
// NOTE(review): the return statement of every branch and the trailing return are
// absent from this listing (the embedded numbering skips 151, 154, 157, 160, 163,
// 166 and 171). Restore them from the upstream header; do not guess the enumerators.
149 ged::Options::EditCosts edit_costs(const std::string & dataset) {
150  if (is_chemical_dataset(dataset)) {
152  }
153  else if (is_letter_dataset(dataset)) {
155  }
156  else if (dataset == "CMU-GED") {
158  }
159  else if (dataset == "Fingerprint") {
161  }
162  else if (dataset == "GREC") {
164  }
165  else if (dataset == "Protein") {
167  }
168  else {
169  throw ged::Error(std::string("Dataset \"") + dataset + "\" does not exists.");
170  }
172 }
173 
// Selects the ged::Options::GXLNodeEdgeType used for the nodes of a dataset.
// Validates the name via check_dataset(), then treats "Fingerprint" and "CMU-GED"
// differently from all other datasets.
// NOTE(review): both return statements are absent from this listing (the embedded
// numbering skips 177 and 179). Restore them from the upstream header.
174 ged::Options::GXLNodeEdgeType node_type(const std::string & dataset) {
175  check_dataset(dataset);
176  if ((dataset == "Fingerprint") or (dataset == "CMU-GED")) {
178  }
180 }
181 
// Selects the ged::Options::GXLNodeEdgeType used for the edges of a dataset.
// Validates the name via check_dataset(), then treats the Letter datasets
// differently from all other datasets.
// NOTE(review): both return statements are absent from this listing (the embedded
// numbering skips 185 and 187). Restore them from the upstream header.
182 ged::Options::GXLNodeEdgeType edge_type(const std::string & dataset) {
183  check_dataset(dataset);
184  if (is_letter_dataset(dataset)) {
186  }
188 }
189 
190 std::unordered_set<std::string> irrelevant_node_attributes(const std::string & dataset) {
191  check_dataset(dataset);
192  std::unordered_set<std::string> irrelevant_attributes;
193  if ((dataset == "AIDS")) {
194  irrelevant_attributes.insert({"x", "y", "symbol"});
195  }
196  else if (dataset == "Protein") {
197  irrelevant_attributes.insert("aaLength");
198  }
199  return irrelevant_attributes;
200 }
201 
202 std::unordered_set<std::string> irrelevant_edge_attributes(const std::string & dataset) {
203  check_dataset(dataset);
204  std::unordered_set<std::string> irrelevant_attributes;
205  if ((dataset == "GREC")) {
206  irrelevant_attributes.insert({"angle0", "angle1"});
207  }
208  else if (dataset == "Protein") {
209  irrelevant_attributes.insert({"distance0", "distance1"});
210  }
211  else if (dataset == "Fingerprint") {
212  irrelevant_attributes.insert("angle");
213  }
214  return irrelevant_attributes;
215 }
216 
// Selects the ged::Options::InitType used to initialize the environment for a dataset.
// Chemical datasets and "Protein" take the branch below; all other names fall through.
// Unlike its siblings, this function does not call check_dataset().
// NOTE(review): both return statements are absent from this listing (the embedded
// numbering skips 219 and 221). Restore them from the upstream header.
217 ged::Options::InitType init_type(const std::string & dataset) {
218  if (is_chemical_dataset(dataset) or (dataset == "Protein")) {
220  }
222 }
223 
224 std::vector<ged::GEDGraph::GraphID> setup_environment(const std::string & dataset, bool train, ged::GEDEnv<ged::GXLNodeID, ged::GXLLabel, ged::GXLLabel> & env) {
225  std::vector<ged::GEDGraph::GraphID> graph_ids(env.load_gxl_graphs(graph_dir(dataset), (train ? train_collection(dataset) : test_collection(dataset)), node_type(dataset), edge_type(dataset), irrelevant_node_attributes(dataset), irrelevant_edge_attributes(dataset)));
226  env.set_edit_costs(edit_costs(dataset));
227  env.init(init_type(dataset));
228  return graph_ids;
229 }
230 
/*!
 * @brief Fills a vector with the names of the datasets to be processed.
 * @param[out] datasets Overwritten with "Letter_HIGH", "pah", "AIDS", "Protein", "GREC", "Fingerprint" (in this order); previous contents are discarded.
 * @note Declared inline because it is defined in a header; a non-inline definition
 * would be emitted by every translation unit that includes this file.
 */
inline void setup_datasets(std::vector<std::string> & datasets) {
    datasets.assign({"Letter_HIGH", "pah", "AIDS", "Protein", "GREC", "Fingerprint"});
}
234 
235 }
236 
237 #endif /* SRC_TESTS_PR2018_UTIL_HPP_ */
Definition: util.hpp:37
Selects ged::Fingerprint.
void init(Options::InitType init_type=Options::InitType::EAGER_WITHOUT_SHUFFLED_COPIES)
Initializes the environment.
Definition: ged_env.ipp:655
std::vector< GEDGraph::GraphID > load_gxl_graphs(const std::string &graph_dir, const std::string &collection_file, Options::GXLNodeEdgeType node_type=Options::GXLNodeEdgeType::LABELED, Options::GXLNodeEdgeType edge_type=Options::GXLNodeEdgeType::LABELED, const std::unordered_set< std::string > &irrelevant_node_attributes=std::unordered_set< std::string >(), const std::unordered_set< std::string > &irrelevant_edge_attributes=std::unordered_set< std::string >())
Loads graphs given in the GXL file format.
void set_edit_costs(Options::EditCosts edit_costs, std::initializer_list< double > edit_cost_constants={})
Sets the edit costs to one of the predefined edit costs.
Definition: ged_env.ipp:55
Selects ged::Protein.
GXLNodeEdgeType
Selects whether nodes or edges of graphs given in GXL file format are labeled or unlabeled.
Selects ged::Constant.
Selects ged::Letter.
Runtime error class.
Definition: error.hpp:37
InitType
Selects the initialization type of the environment.
Eager initialization, no shuffled graph copies are constructed.
EditCosts
Selects the edit costs.
Lazy initialization, no shuffled graph copies are constructed.
Provides the API of GEDLIB.
Definition: ged_data.hpp:48