GEDLIB  1.0
generate_molecules.py
Go to the documentation of this file.
1 #//////////////////////////////////////////////////////////////////////////#
2 # #
3 # Copyright (C) 2018 by David B. Blumenthal #
4 # #
5 # This file is part of GEDLIB. #
6 # #
7 # GEDLIB is free software: you can redistribute it and/or modify it #
8 # under the terms of the GNU Lesser General Public License as published #
9 # by the Free Software Foundation, either version 3 of the License, or #
10 # (at your option) any later version. #
11 # #
12 # GEDLIB is distributed in the hope that it will be useful, #
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
15 # GNU Lesser General Public License for more details. #
16 # #
17 # You should have received a copy of the GNU Lesser General Public #
18 # License along with GEDLIB. If not, see <http://www.gnu.org/licenses/>. #
19 # #
20 #//////////////////////////////////////////////////////////////////////////#
21 
22 
36 '''
37 Python script for generating synthetic molecules.
38 '''
39 
40 from random import randint
41 from random import shuffle
42 from os.path import join
43 
44 class Tree:
45 
46  def __init__(self, num_nodes, edge_list):
47  self.num_nodes = num_nodes
48  self.node_labels = [0 for node in range(num_nodes)]
49  self.edge_list = edge_list
50 
51  def __repr__(self):
52  string = "num_nodes = " + str(self.num_nodes) + "\n"
53  string = string + "node_labels = " + str(self.node_labels) + "\n"
54  string = string + "edge_list = " + str(self.edge_list)
55  return string
56 
57  def generate_node_labels(self, num_labels):
58  for node in range(self.num_nodes):
59  self.node_labels[node] = randint(0, num_labels - 1)
60 
61  def write_to_gxl(self, directory, file_name):
62  gxl_file_name = join(directory, file_name)
63  gxl_file = open(gxl_file_name, "w")
64  gxl_file.write("<?xml version=\"1.0\"?>\n")
65  gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
66  gxl_file.write("<gxl>\n")
67  gxl_file.write("<graph id=\"" + file_name + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
68  for node in range(self.num_nodes):
69  gxl_file.write("<node id=\"_" + str(node) + "\">\n")
70  gxl_file.write("<attr name=\"chem\"><int>" + str(self.node_labels[node]) + "</int></attr>\n")
71  gxl_file.write("</node>\n")
72  for edge in self.edge_list:
73  gxl_file.write("<edge from=\"_" + str(edge[0]) + "\" to=\"_" + str(edge[1]) + "\">\n")
74  gxl_file.write("<attr name=\"valence\"><int>1</int></attr>\n")
75  gxl_file.write("</edge>\n")
76  gxl_file.write("</graph>\n")
77  gxl_file.write("</gxl>\n")
78  gxl_file.close()
79 
80 
81 def generate_canonical_pruefer_seq(num_nodes):
82  # generate Pruefer sequence
83  pruefer_sec = []
84  for i in range(num_nodes - 2):
85  pruefer_sec.append(randint(0, num_nodes - 1))
86  # compute canonical form
87  old_to_new_id = {}
88  new_id = 0
89  for i in range(num_nodes - 2):
90  old_id = pruefer_sec[i]
91  if not old_id in old_to_new_id:
92  old_to_new_id[old_id] = new_id
93  new_id = new_id + 1
94  pruefer_sec[i] = old_to_new_id[old_id]
95  # return canonical form
96  return pruefer_sec
97 
98 def pruefer_seq_to_tree(pruefer_seq):
99  # collect the degrees
100  num_nodes = len(pruefer_seq) + 2
101  deg = [1 for node in range(num_nodes)]
102  for node in pruefer_seq:
103  deg[node] = deg[node] + 1
104  # collect all edges except for the last
105  edge_list = []
106  for tail in pruefer_seq:
107  for head in range(num_nodes):
108  if deg[head] == 1:
109  edge_list.append((tail, head))
110  deg[tail] = deg[tail] - 1
111  deg[head] = deg[head] - 1
112  break
113  # collect last edge
114  u = num_nodes
115  v = num_nodes
116  for node in range(num_nodes):
117  if deg[node] == 1:
118  if u == num_nodes:
119  u = node
120  else:
121  v = node
122  break
123  edge_list.append((u, v))
124  # return tree
125  return Tree(num_nodes, edge_list)
126 
127 def generate_molecules(num_molecules, min_num_nodes, max_num_nodes, max_num_trials = 10):
128  # create non-isomorphic Pruefer sequences
129  seqs = []
130  for i in range(num_molecules):
131  found_new_pruefer_seq = False
132  num_trials = 0
133  while (not found_new_pruefer_seq) and (num_trials < max_num_trials):
134  num_nodes = randint(min_num_nodes, max_num_nodes)
135  new_pruefer_seq = generate_canonical_pruefer_seq(num_nodes)
136  found_new_pruefer_seq = True
137  for old_pruefer_seq in seqs:
138  if old_pruefer_seq == new_pruefer_seq:
139  found_new_pruefer_seq = False
140  break;
141  if found_new_pruefer_seq:
142  seqs.append(new_pruefer_seq)
143  else:
144  num_trials = num_trials + 1
145  if num_trials == max_num_trials:
146  raise Exception("Cannot generate new Pruefer sequence. Maximum number of trials reached.")
147  # shuffle the Pruefer sequences
148  for pruefer_seq in seqs:
149  num_nodes = len(pruefer_seq) + 2
150  permutation = range(num_nodes)
151  shuffle(permutation)
152  for i in range(num_nodes - 2):
153  pruefer_seq[i] = permutation[pruefer_seq[i]]
154  # return trees obtained from Pruefer sequences
155  return [pruefer_seq_to_tree(pruefer_seq) for pruefer_seq in seqs]
156 
157 molecules = generate_molecules(150, 8, 12)
158 file_names = ["molecule_" + str(i) + ".gxl" for i in range(150)]
159 num_labels = [1, 4, 7, 10]
160 dirs = ["NL01", "NL04", "NL07", "NL10"]
161 for dir_id in range(4):
162  for mol_id in range(150):
163  molecules[mol_id].generate_node_labels(num_labels[dir_id])
164  molecules[mol_id].write_to_gxl(dirs[dir_id], file_names[mol_id])
165 collection = open("../../collections/S-MOL.xml", "w")
166 collection.write("<?xml version=\"1.0\"?>\n")
167 collection.write("<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">\n")
168 collection.write("<GraphCollection>\n")
169 for file_name in file_names:
170  collection.write("<graph file=\"" + file_name + "\" class=\"a\"/>\n")
171 collection.write("</GraphCollection>\n")
172 collection.close()