GEDLIB  1.0
util.hpp
Go to the documentation of this file.
1 /***************************************************************************
2  * *
3  * Copyright (C) 2018 by David B. Blumenthal *
4  * *
5  * This file is part of GEDLIB. *
6  * *
7  * GEDLIB is free software: you can redistribute it and/or modify it *
8  * under the terms of the GNU Lesser General Public License as published *
9  * by the Free Software Foundation, either version 3 of the License, or *
10  * (at your option) any later version. *
11  * *
12  * GEDLIB is distributed in the hope that it will be useful, *
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15  * GNU Lesser General Public License for more details. *
16  * *
17  * You should have received a copy of the GNU Lesser General Public *
18  * License along with GEDLIB. If not, see <http://www.gnu.org/licenses/>. *
19  * *
20  ***************************************************************************/
21 
31 #ifndef SRC_TESTS_VLDBJ2018_UTIL_HPP_
32 #define SRC_TESTS_VLDBJ2018_UTIL_HPP_
33 
34 #define GXL_GEDLIB_SHARED
35 #include "../../../src/env/ged_env.hpp"
36 
37 namespace util {
38 
/*!
 * @brief Tells whether a dataset name is one of the chemical datasets.
 *
 * Declared @c inline because the definition lives in a header: without it,
 * including this header from more than one translation unit produces
 * duplicate-definition link errors.
 *
 * @param[in] dataset Dataset name; the comparison is case-sensitive.
 * @return @c true if and only if @p dataset is "AIDS", "Mutagenicity",
 * "acyclic", "alkane", "mao", or "pah".
 */
inline bool is_chemical_dataset(const std::string & dataset) {
    for (const char * chemical_name : {"AIDS", "Mutagenicity", "acyclic", "alkane", "mao", "pah"}) {
        if (dataset == chemical_name) {
            return true;
        }
    }
    return false;
}
42 
/*!
 * @brief Tells whether a dataset name is one of the Letter datasets.
 *
 * Declared @c inline because the definition lives in a header: without it,
 * including this header from more than one translation unit produces
 * duplicate-definition link errors.
 *
 * @param[in] dataset Dataset name; the comparison is case-sensitive.
 * @return @c true if and only if @p dataset is "Letter_HIGH", "Letter_LOW",
 * or "Letter_MED".
 */
inline bool is_letter_dataset(const std::string & dataset) {
    for (const char * letter_name : {"Letter_HIGH", "Letter_LOW", "Letter_MED"}) {
        if (dataset == letter_name) {
            return true;
        }
    }
    return false;
}
46 
47 void check_dataset(const std::string & dataset) {
48  if (not (is_chemical_dataset(dataset) or is_letter_dataset(dataset) or (dataset == "CMU-GED") or (dataset == "Fingerprint") or (dataset == "GREC") or (dataset == "Protein"))) {
49  throw ged::Error(std::string("Dataset \"") + dataset + "\" does not exists.");
50  }
51 }
52 
53 std::string graph_dir(const std::string & dataset) {
54  std::string root_dir("../../../data/datasets/");
55  if ((dataset == "AIDS") or (dataset == "Fingerprint") or (dataset == "GREC") or (dataset == "Protein") or (dataset == "Mutagenicity")) {
56  return (root_dir + dataset + "/data/");
57  }
58  else if ((dataset == "Letter_HIGH")) {
59  return (root_dir + "Letter/HIGH/");
60  }
61  else if ((dataset == "Letter_LOW")) {
62  return (root_dir + "Letter/LOW/");
63  }
64  else if ((dataset == "Letter_MED")) {
65  return (root_dir + "Letter/MED/");
66  }
67  else if (dataset == "CMU-GED") {
68  return (root_dir + dataset + "/CMU/");
69  }
70  else if ((dataset == "acyclic") or (dataset == "alkane") or (dataset == "mao") or (dataset == "pah")) {
71  return (root_dir + dataset + "/");
72  }
73  else {
74  throw ged::Error(std::string("Dataset \"") + dataset + "\" does not exists.");
75  }
76  return "";
77 }
78 
79 std::string train_collection(const std::string & dataset) {
80  std::string root_dir("../collections/");
81  check_dataset(dataset);
82  if (is_letter_dataset(dataset)) {
83  return (root_dir + "Letter_train.xml");
84  }
85  return root_dir + dataset + "_train.xml";
86 }
87 
88 std::string test_collection(const std::string & dataset) {
89  std::string root_dir("../collections/");
90  check_dataset(dataset);
91  if (is_letter_dataset(dataset)) {
92  return (root_dir + "Letter_test.xml");
93  }
94  return root_dir + dataset + "_test.xml";
95 }
96 
97 std::string size_constrained_collection(const std::string & dataset, std::size_t max_size_div_10) {
98  std::string suffix(std::to_string((max_size_div_10 * 10) - 9) + "-" + std::to_string(max_size_div_10 * 10));
99  std::string root_dir("../collections/");
100  check_dataset(dataset);
101  return root_dir + dataset + suffix + ".xml";
102 }
103 
104 std::string config_prefix(const std::string & dataset) {
105  check_dataset(dataset);
106  return std::string("../ini/" + dataset + "_");
107 }
108 
109 std::string init_options(const std::string & dataset, const std::string & config_suffix, const std::string & data_suffix = "", bool save_train = false, bool load_train = false, std::size_t threads = 8) {
110  check_dataset(dataset);
111  std::string options("--threads ");
112  options += std::to_string(threads) + " --save ../ini/";
113  options += dataset + "_" + config_suffix + ".ini";
114  if (save_train) {
115  if (load_train) {
116  throw ged::Error("Training data cannot be both saved and loaded.");
117  }
118  options += " --save-train ../ini/" + dataset + "_" + data_suffix + ".data";
119  }
120  if (load_train) {
121  options += " --load-train ../ini/" + dataset + "_" + data_suffix + ".data";
122  }
123  return options;
124 }
125 
126 std::string ground_truth_option(const std::string & dataset) {
127  check_dataset(dataset);
128  //if (is_letter_dataset(dataset)) {
129  // return std::string(" --ground-truth-method EXACT");
130  //}
131  return std::string(" --ground-truth-method IPFP");
132 }
133 
// Maps a dataset name to a ged::Options::EditCosts value, with one branch for
// the chemical datasets, one for the Letter datasets, and one each for
// "CMU-GED", "Fingerprint", "GREC" and "Protein". Throws ged::Error for any
// other name.
// NOTE(review): the listing numbers below skip 136, 139, 142, 145, 148, 151
// and 156, so the statements on those lines (presumably the per-branch return
// statements) are missing from this listing. As shown, every recognised branch
// is empty and control reaches the end of a non-void function. Restore the
// missing lines from the original header before using this code.
134 ged::Options::EditCosts edit_costs(const std::string & dataset) {
135  if (is_chemical_dataset(dataset)) {
137  }
138  else if (is_letter_dataset(dataset)) {
140  }
141  else if (dataset == "CMU-GED") {
143  }
144  else if (dataset == "Fingerprint") {
146  }
147  else if (dataset == "GREC") {
149  }
150  else if (dataset == "Protein") {
152  }
153  else {
154  throw ged::Error(std::string("Dataset \"") + dataset + "\" does not exists.");
155  }
157 }
158 
// Maps a dataset name to a ged::Options::GXLNodeEdgeType value for the nodes,
// after validating the name with check_dataset(). "Fingerprint" and "CMU-GED"
// are treated differently from all other datasets.
// NOTE(review): the listing numbers below skip 162 and 164, so the statements
// on those lines (presumably the two return statements) are missing from this
// listing. Restore them from the original header before using this code.
159 ged::Options::GXLNodeEdgeType node_type(const std::string & dataset) {
160  check_dataset(dataset);
161  if ((dataset == "Fingerprint") or (dataset == "CMU-GED")) {
163  }
165 }
166 
// Maps a dataset name to a ged::Options::GXLNodeEdgeType value for the edges,
// after validating the name with check_dataset(). The Letter datasets are
// treated differently from all other datasets.
// NOTE(review): the listing numbers below skip 170 and 172, so the statements
// on those lines (presumably the two return statements) are missing from this
// listing. Restore them from the original header before using this code.
167 ged::Options::GXLNodeEdgeType edge_type(const std::string & dataset) {
168  check_dataset(dataset);
169  if (is_letter_dataset(dataset)) {
171  }
173 }
174 
175 std::unordered_set<std::string> irrelevant_node_attributes(const std::string & dataset) {
176  check_dataset(dataset);
177  std::unordered_set<std::string> irrelevant_attributes;
178  if ((dataset == "AIDS")) {
179  irrelevant_attributes.insert({"x", "y", "symbol"});
180  }
181  else if (dataset == "Protein") {
182  irrelevant_attributes.insert("aaLength");
183  }
184  return irrelevant_attributes;
185 }
186 
187 std::unordered_set<std::string> irrelevant_edge_attributes(const std::string & dataset) {
188  check_dataset(dataset);
189  std::unordered_set<std::string> irrelevant_attributes;
190  if ((dataset == "GREC")) {
191  irrelevant_attributes.insert({"angle0", "angle1"});
192  }
193  else if (dataset == "Protein") {
194  irrelevant_attributes.insert({"distance0", "distance1"});
195  }
196  else if (dataset == "Fingerprint") {
197  irrelevant_attributes.insert("angle");
198  }
199  return irrelevant_attributes;
200 }
201 
// Maps a dataset name to a ged::Options::InitType value. The chemical datasets
// and "Protein" are treated differently from all other datasets. Unlike its
// sibling helpers, this function does not call check_dataset().
// NOTE(review): the listing numbers below skip 204 and 206, so the statements
// on those lines (presumably the two return statements) are missing from this
// listing. Restore them from the original header before using this code.
202 ged::Options::InitType init_type(const std::string & dataset) {
203  if (is_chemical_dataset(dataset) or (dataset == "Protein")) {
205  }
207 }
208 
209 std::vector<ged::GEDGraph::GraphID> setup_environment(const std::string & dataset, bool train, ged::GEDEnv<ged::GXLNodeID, ged::GXLLabel, ged::GXLLabel> & env) {
210  std::vector<ged::GEDGraph::GraphID> graph_ids(env.load_gxl_graphs(graph_dir(dataset), (train ? train_collection(dataset) : test_collection(dataset)), node_type(dataset), edge_type(dataset), irrelevant_node_attributes(dataset), irrelevant_edge_attributes(dataset)));
211  env.set_edit_costs(edit_costs(dataset));
212  env.init(init_type(dataset));
213  return graph_ids;
214 }
215 
216 std::vector<ged::GEDGraph::GraphID> setup_environment(const std::string & dataset, std::size_t max_size_div_10, ged::GEDEnv<ged::GXLNodeID, ged::GXLLabel, ged::GXLLabel> & env) {
217  std::vector<ged::GEDGraph::GraphID> graph_ids(env.load_gxl_graphs(graph_dir(dataset), size_constrained_collection(dataset, max_size_div_10), node_type(dataset), edge_type(dataset), irrelevant_node_attributes(dataset), irrelevant_edge_attributes(dataset)));
218  env.set_edit_costs(edit_costs(dataset));
219  env.init(init_type(dataset));
220  return graph_ids;
221 }
222 
/*!
 * @brief Overwrites a vector with the list of dataset names used by the tests.
 *
 * Declared @c inline because the definition lives in a header: without it,
 * including this header from more than one translation unit produces
 * duplicate-definition link errors.
 *
 * @param[out] datasets Receives, in this order, "Letter_HIGH", "Mutagenicity",
 * "AIDS", "Protein", "GREC", "Fingerprint"; any previous content is discarded.
 */
inline void setup_datasets(std::vector<std::string> & datasets) {
    datasets.assign({"Letter_HIGH", "Mutagenicity", "AIDS", "Protein", "GREC", "Fingerprint"});
}
226 
/*!
 * @brief Overwrites a vector with the list of dataset names used by the size tests.
 *
 * Declared @c inline because the definition lives in a header: without it,
 * including this header from more than one translation unit produces
 * duplicate-definition link errors.
 *
 * @param[out] datasets Receives, in this order, "Mutagenicity", "AIDS",
 * "Protein"; any previous content is discarded.
 */
inline void setup_size_test_datasets(std::vector<std::string> & datasets) {
    datasets.assign({"Mutagenicity", "AIDS", "Protein"});
}
230 
231 }
232 
233 #endif /* SRC_TESTS_VLDBJ2018_UTIL_HPP_ */
Definition: util.hpp:37
Selects ged::Fingerprint.
void init(Options::InitType init_type=Options::InitType::EAGER_WITHOUT_SHUFFLED_COPIES)
Initializes the environment.
Definition: ged_env.ipp:655
std::vector< GEDGraph::GraphID > load_gxl_graphs(const std::string &graph_dir, const std::string &collection_file, Options::GXLNodeEdgeType node_type=Options::GXLNodeEdgeType::LABELED, Options::GXLNodeEdgeType edge_type=Options::GXLNodeEdgeType::LABELED, const std::unordered_set< std::string > &irrelevant_node_attributes=std::unordered_set< std::string >(), const std::unordered_set< std::string > &irrelevant_edge_attributes=std::unordered_set< std::string >())
Loads graphs given in the GXL file format.
void set_edit_costs(Options::EditCosts edit_costs, std::initializer_list< double > edit_cost_constants={})
Sets the edit costs to one of the predefined edit costs.
Definition: ged_env.ipp:55
Selects ged::Protein.
GXLNodeEdgeType
Selects whether nodes or edges of graphs given in GXL file format are labeled or unlabeled.
Selects ged::Constant.
Selects ged::Letter.
Runtime error class.
Definition: error.hpp:37
InitType
Selects the initialization type of the environment.
Eager initialization, no shuffled graph copies are constructed.
EditCosts
Selects the edit costs.
Lazy initialization, no shuffled graph copies are constructed.
Provides the API of GEDLIB.
Definition: ged_data.hpp:48