"""Generate a random sample of given size from a given dataset.

The dataset, the optional exclusion list and the generated sample are all
``GraphCollection`` XML files: a root element whose children each carry a
``file`` attribute (the graph's file name) and a ``class`` attribute.

NOTE(review): this file was recovered from a listing in which several
control-flow headers and the ``argparse``/``random`` imports were not
visible. The branch structure below is reconstructed from the visible
statements and their indentation -- confirm against the upstream script.
"""

import argparse
import random
import xml.etree.ElementTree as ET
from xml.sax.saxutils import quoteattr

_DOCTYPE = (
    "<!DOCTYPE GraphCollection SYSTEM "
    "\"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">"
)


def build_parser():
    """Return the command-line parser of the script."""
    parser = argparse.ArgumentParser(
        description="Generates a random sample of given size from a given dataset.")
    parser.add_argument("dataset", help="path to existing dataset file")
    parser.add_argument("sample", help="path to sample file to be generated by the script")
    parser.add_argument(
        "--exclude",
        help="path to existing file that lists the graphs contained in the dataset "
             "which should not appear in the sample")
    parser.add_argument(
        "--balanced",
        help="generate sample with equal number of graphs per class",
        action="store_true")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--size",
        help="size of sample; must be greater than 0; if larger than the number of "
             "non-excluded graphs in the dataset, it is reduced to that number",
        type=int)
    group.add_argument(
        "--size_ratio",
        help="size of sample divided by size of dataset; must be between 0 and 1",
        type=float)
    return parser


def read_excluded_graphs(path):
    """Return the set of ``file`` attributes listed in the collection at *path*.

    An empty set is returned when *path* is empty or ``None`` (no exclusion
    file was given).
    """
    if not path:
        return set()
    return {graph.attrib["file"] for graph in ET.parse(path).getroot()}


def group_candidates(dataset, excluded_graphs):
    """Split the graphs of *dataset* by class.

    Args:
        dataset: iterable of elements with ``file`` and ``class`` attributes.
        excluded_graphs: container of file names that must not be sampled.

    Returns:
        A pair ``(graph_classes, candidate_graphs)``. ``graph_classes`` maps
        every file name in the dataset to its class. ``candidate_graphs`` maps
        every class in the dataset to the list of its non-excluded file names;
        a class whose graphs are all excluded keeps an empty list, so it still
        counts as a class when balancing.
    """
    graph_classes = {}
    candidate_graphs = {}
    for graph in dataset:
        name = graph.attrib["file"]
        cl = graph.attrib["class"]
        graph_classes[name] = cl
        bucket = candidate_graphs.setdefault(cl, [])
        if name not in excluded_graphs:
            bucket.append(name)
    return graph_classes, candidate_graphs


def draw_sample(candidate_graphs, size=None, size_ratio=None, balanced=False):
    """Randomly draw graphs from *candidate_graphs*.

    Args:
        candidate_graphs: dict mapping each class to its list of candidates.
        size: absolute sample size; used when *size_ratio* is ``None``.
        size_ratio: sample size relative to the number of candidates.
        balanced: draw the same number of graphs from every class.

    Returns:
        List of sampled file names (empty when there are no candidates).

    Raises:
        ValueError: if *size_ratio* is not within [0, 1], or if *size* is
            missing or not greater than 0.
    """
    candidate_sizes = {cl: len(graphs) for cl, graphs in candidate_graphs.items()}
    total_candidate_size = sum(candidate_sizes.values())
    num_classes = len(candidate_sizes)
    # default=0 keeps an empty dataset from raising inside min().
    min_candidate_size = min(candidate_sizes.values(), default=0)

    if size_ratio is not None:
        # Written as a chained comparison so that NaN is rejected as well.
        if not 0 <= size_ratio <= 1:
            raise ValueError("SIZE_RATIO must be between 0 and 1")
        if balanced:
            per_class = 0
            if num_classes:
                per_class = min(
                    min_candidate_size,
                    int((total_candidate_size * size_ratio) / num_classes))
            sample_sizes = {cl: per_class for cl in candidate_sizes}
        else:
            sample_sizes = {cl: int(n * size_ratio) for cl, n in candidate_sizes.items()}
    else:
        if size is None or size <= 0:
            raise ValueError("SIZE must be greater than 0")
        # A request larger than the candidate pool yields all candidates.
        size = min(size, total_candidate_size)
        if not balanced:
            pool = [graph for graphs in candidate_graphs.values() for graph in graphs]
            return random.sample(pool, size)
        per_class = min(min_candidate_size, size // num_classes) if num_classes else 0
        sample_sizes = {cl: per_class for cl in candidate_sizes}

    # NOTE(review): the ratio-derived per-class sizes are assumed to drive a
    # per-class draw; the visible original only showed draws based on SIZE.
    return [
        graph
        for cl, graphs in candidate_graphs.items()
        for graph in random.sample(graphs, sample_sizes[cl])
    ]


def write_sample(path, sampled_graphs, graph_classes):
    """Write *sampled_graphs* to *path* as a ``GraphCollection`` XML file.

    Attribute values are XML-escaped so that file or class names containing
    ``&``, ``<`` or quotes still produce a well-formed document.
    """
    lines = ["<?xml version=\"1.0\"?>", _DOCTYPE, "<GraphCollection>"]
    lines.extend(
        "\t<graph file=" + quoteattr(graph)
        + " class=" + quoteattr(graph_classes[graph]) + "/>"
        for graph in sampled_graphs
    )
    lines.append("</GraphCollection>")
    # No XML encoding declaration is written, so the content must be UTF-8.
    with open(path, "w", encoding="utf-8") as out:
        out.write("\n".join(lines))


def main(argv=None):
    """Run the script; *argv* defaults to the process command line."""
    args = build_parser().parse_args(argv)
    if args.dataset == args.sample:
        raise ValueError("dataset file equals sample file")

    excluded_graphs = read_excluded_graphs(args.exclude)
    dataset = ET.parse(args.dataset).getroot()
    graph_classes, candidate_graphs = group_candidates(dataset, excluded_graphs)
    sampled_graphs = draw_sample(
        candidate_graphs,
        size=args.size,
        size_ratio=args.size_ratio,
        balanced=args.balanced)
    write_sample(args.sample, sampled_graphs, graph_classes)


if __name__ == "__main__":
    main()