46 Python script that computes statistics of a given dataset. 49 import xml.etree.ElementTree
as ET
53 from scipy.sparse
import csr_matrix
54 from scipy.sparse.csgraph
import connected_components
57 def is_planar(edge_list):
58 return planarity.is_planar(edge_list)
60 def num_connected_components(adj_matrix):
61 graph = csr_matrix(adj_matrix)
62 num_components = connected_components(csgraph=graph, directed=
False, return_labels=
False)
65 def is_cyclic_util(adj_list, node_id, visited, parent_id):
66 visited[node_id] =
True 67 for neighbor_id
in adj_list[node_id]:
68 if not visited[neighbor_id]:
69 if is_cyclic_util(adj_list, neighbor_id, visited, node_id):
71 elif neighbor_id != parent_id:
75 def is_cyclic(adj_list, num_nodes):
76 visited = [
False for node_id
in range(num_nodes)]
77 for node_id
in range(num_nodes):
78 if not visited[node_id]:
79 if is_cyclic_util(adj_list, node_id, visited, -1):
83 def as_2d_hist(min_num_nodes, max_num_nodes, min_num_edges, max_num_edges, nums_nodes, nums_edges):
84 hist = {num_nodes : {num_edges : 0
for num_edges
in range(min_num_edges, max_num_edges + 1)}
for num_nodes
in range(min_num_nodes, max_num_nodes + 1)}
85 num_graphs = len(nums_nodes)
86 for i
in range(num_graphs):
87 hist[nums_nodes[i]][nums_edges[i]] = hist[nums_nodes[i]][nums_edges[i]] + 1
90 def parse_graph(dir, gxl_file):
92 graph = ET.parse(os.path.join(dir, gxl_file)).getroot()
94 for node
in graph.findall(
"graph/node"):
95 str_to_id.update({node.attrib[
"id"] : num_nodes})
96 num_nodes = num_nodes + 1
98 adj_list = {node_id : []
for node_id
in range(num_nodes)}
99 adj_matrix = [[0
for col
in range(num_nodes)]
for row
in range(num_nodes)]
100 for edge
in graph.findall(
"graph/edge"):
101 tail = str_to_id[edge.attrib[
"from"]]
102 head = str_to_id[edge.attrib[
"to"]]
103 if adj_matrix[tail][head] == 1:
105 adj_matrix[tail][head] = 1
106 adj_matrix[head][tail] = 1
107 edge_list.append((tail,head))
108 adj_list[tail].append(head)
109 adj_list[head].append(tail)
110 return edge_list, adj_list, adj_matrix, num_nodes, len(edge_list)
113 parser = argparse.ArgumentParser(description=
"Computes dataset statistics.")
114 parser.add_argument(
"dataset", help=
"path to existing dataset file")
115 parser.add_argument(
"dir", help=
"path to directory containing GXL files")
116 parser.add_argument(
"--max_size", help=
"maximal size", type=int)
117 parser.add_argument(
"--topology", help=
"compute topology avgerage number components and ratio of acyclic and planar graphs", action=
"store_true")
118 parser.add_argument(
"--distr", help=
"file to save distibution of number of nodes and edges")
119 args = parser.parse_args()
120 if not os.path.isdir(args.dir):
121 raise Exception(
"Invalid argument \"" + dir +
"\": not a directory. Usage: python analyze_dataset.py <dataset> <dir> [--max-size <arg>]")
124 dataset = ET.parse(args.dataset).getroot()
125 graphs = [graph.attrib[
"file"]
for graph
in dataset]
132 nums_nodes = np.array([], dtype=int)
133 nums_edges = np.array([], dtype=int)
134 nums_components = np.array([], dtype=int)
135 for gxl_file
in graphs:
136 total_num_graphs = total_num_graphs + 1
137 edge_list, adj_list, adj_matrix, num_nodes, num_edges = parse_graph(args.dir, gxl_file)
138 if args.max_size
and num_nodes > args.max_size:
140 num_graphs = num_graphs + 1
141 nums_nodes = np.append(nums_nodes, [num_nodes])
142 nums_edges = np.append(nums_edges, [num_edges])
144 num_components = num_nodes
146 num_components = num_connected_components(adj_matrix)
147 nums_components = np.append(nums_components, [components])
148 if num_edges == 0
or not is_cyclic(adj_list, num_nodes):
149 ratio_acyclic = ratio_acyclic + 1.0
150 if num_edges == 0
or is_planar(edge_list):
151 ratio_planar = ratio_planar + 1.0
152 ratio_acyclic = ratio_acyclic / float(num_graphs)
153 ratio_planar = ratio_planar / float(num_graphs)
154 print(
"graphs (total / not filtered): " + str(total_num_graphs) +
" / " + str(num_graphs))
155 print(
"nodes (min / max / mean / std / median): " + str(np.min(nums_nodes)) +
" & " + str(np.max(nums_nodes)) +
" & " + str(np.mean(nums_nodes)) +
" & " + str(np.std(nums_nodes, ddof=1)) +
" & " + str(np.median(nums_nodes)))
156 print(
"edges (min / max / mean / std / median): " + str(np.min(nums_edges)) +
" & " + str(np.max(nums_edges)) +
" & " + str(np.mean(nums_edges)) +
" & " + str(np.std(nums_edges, ddof=1)) +
" & " + str(np.median(nums_edges)))
158 print(
"components (min / max / mean / std / median): " + str(np.min(nums_components)) +
" & " + str(np.max(nums_components)) +
" & " + str(np.mean(nums_components)) +
" & " + str(np.std(nums_components, ddof=1)) +
" & " + str(np.median(nums_components)))
159 print(
"ratio planar graphs: " + str(ratio_planar))
160 print(
"ratio acylcic graphs: " + str(ratio_acyclic))
163 hist = as_2d_hist(min_num_nodes, max_num_nodes, min_num_edges, max_num_edges, nums_nodes, nums_edges)
164 file = open(args.distr +
".dat",
"w")
165 for num_nodes
in range(min_num_nodes, max_num_nodes + 1):
166 for num_edges
in range(min_num_edges, max_num_edges + 1):
167 file.write(str(num_nodes) +
" " + str(num_edges) +
" " + str(hist[num_nodes][num_edges]) +
"\n")