GEDLIB  1.0
analyze_dataset.py
Go to the documentation of this file.
1 #//////////////////////////////////////////////////////////////////////////#
2 # #
3 # Copyright (C) 2018 by David B. Blumenthal #
4 # #
5 # This file is part of GEDLIB. #
6 # #
7 # GEDLIB is free software: you can redistribute it and/or modify it #
8 # under the terms of the GNU Lesser General Public License as published #
9 # by the Free Software Foundation, either version 3 of the License, or #
10 # (at your option) any later version. #
11 # #
12 # GEDLIB is distributed in the hope that it will be useful, #
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
15 # GNU Lesser General Public License for more details. #
16 # #
17 # You should have received a copy of the GNU Lesser General Public #
18 # License along with GEDLIB. If not, see <http://www.gnu.org/licenses/>. #
19 # #
20 #//////////////////////////////////////////////////////////////////////////#
21 
22 
45 '''
46 Python script that computes statistics of a given dataset.
47 '''
48 
49 import xml.etree.ElementTree as ET
50 import argparse
51 import os.path
52 import planarity
53 from scipy.sparse import csr_matrix
54 from scipy.sparse.csgraph import connected_components
55 import numpy as np
56 
57 def is_planar(edge_list):
58  return planarity.is_planar(edge_list)
59 
60 def num_connected_components(adj_matrix):
61  graph = csr_matrix(adj_matrix)
62  num_components = connected_components(csgraph=graph, directed=False, return_labels=False)
63  return num_components
64 
65 def is_cyclic_util(adj_list, node_id, visited, parent_id):
66  visited[node_id] = True
67  for neighbor_id in adj_list[node_id]:
68  if not visited[neighbor_id]:
69  if is_cyclic_util(adj_list, neighbor_id, visited, node_id):
70  return True
71  elif neighbor_id != parent_id:
72  return True
73  return False
74 
75 def is_cyclic(adj_list, num_nodes):
76  visited = [False for node_id in range(num_nodes)]
77  for node_id in range(num_nodes):
78  if not visited[node_id]:
79  if is_cyclic_util(adj_list, node_id, visited, -1):
80  return True
81  return False
82 
83 def as_2d_hist(min_num_nodes, max_num_nodes, min_num_edges, max_num_edges, nums_nodes, nums_edges):
84  hist = {num_nodes : {num_edges : 0 for num_edges in range(min_num_edges, max_num_edges + 1)} for num_nodes in range(min_num_nodes, max_num_nodes + 1)}
85  num_graphs = len(nums_nodes)
86  for i in range(num_graphs):
87  hist[nums_nodes[i]][nums_edges[i]] = hist[nums_nodes[i]][nums_edges[i]] + 1
88  return hist
89 
90 def parse_graph(dir, gxl_file):
91  num_nodes = 0
92  graph = ET.parse(os.path.join(dir, gxl_file)).getroot()
93  str_to_id = {}
94  for node in graph.findall("graph/node"):
95  str_to_id.update({node.attrib["id"] : num_nodes})
96  num_nodes = num_nodes + 1
97  edge_list = []
98  adj_list = {node_id : [] for node_id in range(num_nodes)}
99  adj_matrix = [[0 for col in range(num_nodes)] for row in range(num_nodes)]
100  for edge in graph.findall("graph/edge"):
101  tail = str_to_id[edge.attrib["from"]]
102  head = str_to_id[edge.attrib["to"]]
103  if adj_matrix[tail][head] == 1:
104  continue
105  adj_matrix[tail][head] = 1
106  adj_matrix[head][tail] = 1
107  edge_list.append((tail,head))
108  adj_list[tail].append(head)
109  adj_list[head].append(tail)
110  return edge_list, adj_list, adj_matrix, num_nodes, len(edge_list)
111 
112 # Parse the command line arguments.
113 parser = argparse.ArgumentParser(description="Computes dataset statistics.")
114 parser.add_argument("dataset", help="path to existing dataset file")
115 parser.add_argument("dir", help="path to directory containing GXL files")
116 parser.add_argument("--max_size", help="maximal size", type=int)
117 parser.add_argument("--topology", help="compute topology avgerage number components and ratio of acyclic and planar graphs", action="store_true")
118 parser.add_argument("--distr", help="file to save distibution of number of nodes and edges")
119 args = parser.parse_args()
120 if not os.path.isdir(args.dir):
121  raise Exception("Invalid argument \"" + dir + "\": not a directory. Usage: python analyze_dataset.py <dataset> <dir> [--max-size <arg>]")
122 
123 # Parse the dataset file.
124 dataset = ET.parse(args.dataset).getroot()
125 graphs = [graph.attrib["file"] for graph in dataset]
126 
127 # Analyze the dataset.
128 num_graphs = 0
129 total_num_graphs = 0
130 ratio_acyclic = 0.0
131 ratio_planar = 0.0
132 nums_nodes = np.array([], dtype=int)
133 nums_edges = np.array([], dtype=int)
134 nums_components = np.array([], dtype=int)
135 for gxl_file in graphs:
136  total_num_graphs = total_num_graphs + 1
137  edge_list, adj_list, adj_matrix, num_nodes, num_edges = parse_graph(args.dir, gxl_file)
138  if args.max_size and num_nodes > args.max_size:
139  continue
140  num_graphs = num_graphs + 1
141  nums_nodes = np.append(nums_nodes, [num_nodes])
142  nums_edges = np.append(nums_edges, [num_edges])
143  if args.topology:
144  num_components = num_nodes
145  if num_edges > 0:
146  num_components = num_connected_components(adj_matrix)
147  nums_components = np.append(nums_components, [components])
148  if num_edges == 0 or not is_cyclic(adj_list, num_nodes):
149  ratio_acyclic = ratio_acyclic + 1.0
150  if num_edges == 0 or is_planar(edge_list):
151  ratio_planar = ratio_planar + 1.0
152 ratio_acyclic = ratio_acyclic / float(num_graphs)
153 ratio_planar = ratio_planar / float(num_graphs)
154 print("graphs (total / not filtered): " + str(total_num_graphs) + " / " + str(num_graphs))
155 print("nodes (min / max / mean / std / median): " + str(np.min(nums_nodes)) + " & " + str(np.max(nums_nodes)) + " & " + str(np.mean(nums_nodes)) + " & " + str(np.std(nums_nodes, ddof=1)) + " & " + str(np.median(nums_nodes)))
156 print("edges (min / max / mean / std / median): " + str(np.min(nums_edges)) + " & " + str(np.max(nums_edges)) + " & " + str(np.mean(nums_edges)) + " & " + str(np.std(nums_edges, ddof=1)) + " & " + str(np.median(nums_edges)))
157 if args.topology:
158  print("components (min / max / mean / std / median): " + str(np.min(nums_components)) + " & " + str(np.max(nums_components)) + " & " + str(np.mean(nums_components)) + " & " + str(np.std(nums_components, ddof=1)) + " & " + str(np.median(nums_components)))
159  print("ratio planar graphs: " + str(ratio_planar))
160  print("ratio acylcic graphs: " + str(ratio_acyclic))
161 
162 if args.distr:
163  hist = as_2d_hist(min_num_nodes, max_num_nodes, min_num_edges, max_num_edges, nums_nodes, nums_edges)
164  file = open(args.distr + ".dat", "w")
165  for num_nodes in range(min_num_nodes, max_num_nodes + 1):
166  for num_edges in range(min_num_edges, max_num_edges + 1):
167  file.write(str(num_nodes) + " " + str(num_edges) + " " + str(hist[num_nodes][num_edges]) + "\n")
168  file.write("\n")
169  file.close()
170