37 Python script for generating synthetic molecules. 40 from random
import randint
41 from random
import shuffle
42 from os.path
import join
class Tree:
    """Labelled, undirected tree that can be serialised to a GXL file.

    Nodes are identified by the integers ``0 .. num_nodes - 1``.

    Public attributes:
        num_nodes   -- number of nodes.
        node_labels -- one integer label per node; all 0 after construction.
        edge_list   -- the edges exactly as passed to the constructor; each
                       edge is read as ``edge[0]`` / ``edge[1]``.
    """

    def __init__(self, num_nodes, edge_list):
        """Store the node count and the edges; every node label starts at 0."""
        self.num_nodes = num_nodes
        self.node_labels = [0] * num_nodes
        self.edge_list = edge_list

    # NOTE(review): the header of this method was lost in the mangled source;
    # `__repr__` is assumed because it also serves `str()` and `print()`.
    def __repr__(self):
        """Return a three-line summary: node count, node labels, edge list."""
        return (
            "num_nodes = " + str(self.num_nodes) + "\n"
            + "node_labels = " + str(self.node_labels) + "\n"
            + "edge_list = " + str(self.edge_list)
        )

    def generate_node_labels(self, num_labels):
        """Overwrite every node label with a random integer in [0, num_labels - 1]."""
        for node in range(self.num_nodes):
            self.node_labels[node] = randint(0, num_labels - 1)

    def write_to_gxl(self, directory, file_name):
        """Write the tree as a GXL document to ``join(directory, file_name)``.

        Each node is emitted with id ``_<index>`` and its label as the integer
        attribute ``chem``; each edge is emitted with the constant integer
        attribute ``valence`` = 1. ``file_name`` is also used as the graph id.
        An existing file is overwritten.
        """
        lines = [
            "<?xml version=\"1.0\"?>\n",
            "<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n",
            "<gxl>\n",
            "<graph id=\"" + file_name + "\" edgeids=\"false\" edgemode=\"undirected\">\n",
        ]
        for node in range(self.num_nodes):
            lines.append("<node id=\"_" + str(node) + "\">\n")
            lines.append("<attr name=\"chem\"><int>" + str(self.node_labels[node]) + "</int></attr>\n")
            lines.append("</node>\n")
        for edge in self.edge_list:
            lines.append("<edge from=\"_" + str(edge[0]) + "\" to=\"_" + str(edge[1]) + "\">\n")
            lines.append("<attr name=\"valence\"><int>1</int></attr>\n")
            lines.append("</edge>\n")
        lines.append("</graph>\n")
        lines.append("</gxl>\n")
        # The context manager guarantees the file is flushed and closed even
        # if a write fails.
        with open(join(directory, file_name), "w") as gxl_file:
            gxl_file.writelines(lines)
def generate_canonical_pruefer_seq(num_nodes):
    """Return a random Pruefer sequence for ``num_nodes`` nodes in canonical form.

    The sequence has ``num_nodes - 2`` entries, each first drawn uniformly
    from ``0 .. num_nodes - 1``. It is then relabelled so that distinct
    values are numbered 0, 1, 2, ... in order of first appearance. Two
    sequences that differ only by a renaming of node ids therefore map to
    the same list, which lets callers detect duplicates with ``==``.

    For ``num_nodes <= 2`` the result is an empty list.
    """
    pruefer_seq = [randint(0, num_nodes - 1) for _ in range(num_nodes - 2)]

    # Relabel by order of first occurrence: the next unused id is always the
    # number of distinct values seen so far.
    old_to_new_id = {}
    for position, old_id in enumerate(pruefer_seq):
        if old_id not in old_to_new_id:
            old_to_new_id[old_id] = len(old_to_new_id)
        pruefer_seq[position] = old_to_new_id[old_id]
    return pruefer_seq
def pruefer_seq_to_tree(pruefer_seq):
    """Decode a Pruefer sequence into a ``Tree`` with ``len(pruefer_seq) + 2`` nodes.

    ``pruefer_seq`` must contain node ids in ``0 .. len(pruefer_seq) + 1``.
    The returned tree has ``len(pruefer_seq) + 1`` edges, stored as
    ``(tail, head)`` tuples.
    """
    num_nodes = len(pruefer_seq) + 2

    # Every node is a leaf (degree 1) plus one for each time it occurs in
    # the sequence.
    deg = [1] * num_nodes
    for node in pruefer_seq:
        deg[node] += 1

    edge_list = []
    for tail in pruefer_seq:
        # Attach the lowest-numbered current leaf to `tail`. The leaf's
        # degree drops to 0, so it is never chosen again.
        head = deg.index(1)
        edge_list.append((tail, head))
        deg[tail] -= 1
        deg[head] -= 1

    # NOTE(review): this final step is reconstructed; the corresponding lines
    # are missing from the mangled source. For a valid sequence exactly two
    # nodes of degree 1 remain, and they are joined by the last edge.
    u, v = [node for node in range(num_nodes) if deg[node] == 1]
    edge_list.append((u, v))

    return Tree(num_nodes, edge_list)
def generate_molecules(num_molecules, min_num_nodes, max_num_nodes, max_num_trials = 10):
    """Generate ``num_molecules`` random trees with pairwise distinct shapes.

    For each molecule a node count is drawn uniformly from
    ``[min_num_nodes, max_num_nodes]`` and a canonical Pruefer sequence is
    generated for it. A candidate that equals an already accepted sequence is
    rejected and redrawn. Once all sequences are chosen, the node ids of each
    one are randomly permuted before it is decoded into a tree.

    Returns a list of ``num_molecules`` objects as produced by
    ``pruefer_seq_to_tree``.

    Raises:
        Exception: if ``max_num_trials`` candidates in a row for one molecule
            are duplicates of accepted sequences (also when
            ``max_num_trials`` is not positive).
    """
    seqs = []
    seen = set()  # accepted sequences as tuples, for O(1) duplicate checks
    for _ in range(num_molecules):
        for _ in range(max_num_trials):
            num_nodes = randint(min_num_nodes, max_num_nodes)
            new_pruefer_seq = generate_canonical_pruefer_seq(num_nodes)
            key = tuple(new_pruefer_seq)
            if key not in seen:
                seen.add(key)
                seqs.append(new_pruefer_seq)
                break
        else:
            raise Exception("Cannot generate new Pruefer sequence. Maximum number of trials reached.")

    # Randomly rename the nodes so the output is not biased towards the
    # canonical numbering used for duplicate detection.
    for pruefer_seq in seqs:
        # shuffle() needs a mutable sequence; a bare range() object is
        # immutable in Python 3, hence the list() copy.
        permutation = list(range(len(pruefer_seq) + 2))
        shuffle(permutation)
        pruefer_seq[:] = [permutation[node] for node in pruefer_seq]

    return [pruefer_seq_to_tree(pruefer_seq) for pruefer_seq in seqs]
# Driver: write every molecule once per label-alphabet size, then write the
# collection file that lists all graph files.
# NOTE(review): `molecules` is not defined in the visible part of the script;
# it must hold at least len(file_names) tree objects before this point
# (presumably the result of generate_molecules) -- confirm.
file_names = ["molecule_" + str(i) + ".gxl" for i in range(150)]
num_labels = [1, 4, 7, 10]
dirs = ["NL01", "NL04", "NL07", "NL10"]

# dirs[k] receives the molecules labelled with num_labels[k] distinct labels.
# The labels are redrawn for every directory; the tree shapes stay the same.
for dir_id in range(len(dirs)):
    for mol_id in range(len(file_names)):
        molecules[mol_id].generate_node_labels(num_labels[dir_id])
        molecules[mol_id].write_to_gxl(dirs[dir_id], file_names[mol_id])

# The context manager makes sure the collection file is flushed and closed.
with open("../../collections/S-MOL.xml", "w") as collection:
    collection.write("<?xml version=\"1.0\"?>\n")
    collection.write("<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">\n")
    collection.write("<GraphCollection>\n")
    for file_name in file_names:
        collection.write("<graph file=\"" + file_name + "\" class=\"a\"/>\n")
    collection.write("</GraphCollection>\n")