diff --git a/config/link_properties.txt b/config/link_properties.txt new file mode 100644 index 0000000..63eecf5 --- /dev/null +++ b/config/link_properties.txt @@ -0,0 +1,11 @@ +# Only properties listed as keys are evaluated for the linksets. The values are used to build the LinkSet name +http://www.w3.org/2004/02/skos/core#broadMatch , skos-broadMatch +http://www.w3.org/2004/02/skos/core#narrowMatch , skos-narrowMatch +http://www.w3.org/2004/02/skos/core#closeMatch , skos-closeMatch +http://www.w3.org/2004/02/skos/core#exactMatch , skos-exactMatch +http://www.w3.org/2004/02/skos/core#relatedMatch , skos-relatedMatch +http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy1 , gnd-ddc1 +http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy2 , gnd-ddc2 +http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy3 , gnd-ddc3 +http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy4 , gnd-ddc4 +http://www.w3.org/2002/07/owl#sameAs , owl-sameAs \ No newline at end of file diff --git a/config/targets.txt b/config/targets.txt new file mode 100644 index 0000000..7b3574c --- /dev/null +++ b/config/targets.txt @@ -0,0 +1,12 @@ +# Maps from a prefix to a dataset name (used to build the LinkSet name) and a dataset identifier, which is a URI +# The dataset identifier will be used to identify the void:subjectsTarget and the void:objectsTarget of each void:LinkSet +# This could possibly be generated from other void files... +http://d-nb.info/gnd/, GND, http://d-nb.info/dataset/authorities +http://lod.gesis.org/thesoz/concept/, TheSoz, http://lod.gesis.org/thesoz/void#thesoz +http://id.loc.gov/authorities/subjects/, LCSH, http://id.loc.gov/authorities/subjects/ +http://zbw.eu/stw/descriptor/,STW, http://zbw.eu/stw +http://dewey.info/class/, DDC, http://dewey.info/class/ +http://dbpedia.org/resource/, dbpedia, http://dbpedia.org/void/Dataset +http://viaf.org/viaf/, viaf, http://viaf.org/viaf/data +http://sws.geonames.org/, geonames, http://sws.geonames.org/ +http://data.bnf.fr/ark:/12148/, data.bnf.fr, http://data.bnf.fr/ diff --git a/lodstats/RDFStats.py b/lodstats/RDFStats.py index fbbb225..5343723 100644 --- a/lodstats/RDFStats.py +++ b/lodstats/RDFStats.py @@ -89,10 +89,10 @@ def start_statistics(self): callback_function=self.callback_function_statistics, rdf_format=self.rdf_format) - logger.debug("Clean up the /tmp folder") - for filename in os.listdir("/tmp"): - if filename.startswith("lodstats"): - os.remove(os.path.join("/tmp", filename)) +# logger.debug("Clean up the /tmp folder") +# for filename in os.listdir("/tmp"): +# if filename.startswith("lodstats"): +# os.remove(os.path.join("/tmp", filename)) def get_stats_results(self): diff --git a/lodstats/stats/ClassesDefined.py b/lodstats/stats/ClassesDefined.py index 62aa504..6d5ef74 100644 --- a/lodstats/stats/ClassesDefined.py +++ b/lodstats/stats/ClassesDefined.py @@ -44,13 +44,6 @@ def count(self, s, p, o, s_blank, o_l, o_blank, statement): def voidify(self, void_model, dataset): namespaces = lodstats.util.rdf_namespaces.RDFNamespaces() datatype_uri = namespaces.get_rdf_namespace("xsd").integer.uri - number_of_distinct_classes = str(self.results['count']) - number_of_distinct_classes_node = RDF.Node(literal=number_of_distinct_classes, - datatype=datatype_uri) - void_model.append(RDF.Statement(dataset, - namespaces.get_rdf_namespace("void").classes, - number_of_distinct_classes_node)) - for class_uri_k, class_uri_v in self.usage_count.iteritems(): class_partitions_node = RDF.Node() diff --git a/lodstats/stats/LinkSets.py b/lodstats/stats/LinkSets.py new file mode 100644 index 0000000..718ead3 --- /dev/null +++ b/lodstats/stats/LinkSets.py @@ -0,0 +1,112 @@ +from RDFStatInterface import RDFStatInterface +from lodstats.util.namespace import get_namespace +import lodstats.util.rdf_namespaces +import RDF +import logging + +logger = logging.getLogger("lodstats") + +class LinkSets(RDFStatInterface): + global targets, link_properties + targets={} + link_properties={} + + def __init__(self, results): + super(LinkSets, self).__init__(results) + self.usage_count = self.results['usage_count'] = {} + with open('../config/targets.txt') as f: + for line in f: + if( not line.startswith('#') ): # comment lines + global targets + (uriPrefix,datasetName,datasetId) = line.split(',') + targets[uriPrefix.strip()] = [datasetName.strip(),datasetId.strip()] + with open('../config/link_properties.txt') as f: + for line in f: + if( not line.startswith('#') ): # comment lines + global link_properties + (propertyUri,shortName) = line.split(',') + link_properties[propertyUri.strip()] = shortName.strip() + + def count(self, s, p, o, s_blank, o_l, o_blank, statement): + if not (statement.subject.is_resource and statement.object.is_resource): + return + + subject_ns = get_namespace(s) + object_ns = get_namespace(o) + + if object_ns is None or subject_ns is None: + return + global link_properties + if subject_ns != object_ns and p in link_properties.keys(): + key = LinkSets.LinkSetKey( subject_ns, p, object_ns ) + self.usage_count[key] = self.usage_count.get(key, 0) + 1 + + def voidify(self, void_model, dataset): + namespaces = lodstats.util.rdf_namespaces.RDFNamespaces() + global targets + + for link_set_key, link_count in self.usage_count.iteritems(): + link_set_subject_pair = targets.get(link_set_key.subject_ns) + if link_set_subject_pair is None: + link_set_subject_target=link_set_key.subject_ns + subjects_target_uri = "http://example.com#" + link_set_subject_target + else: + link_set_subject_target = link_set_subject_pair[0] + subjects_target_uri = link_set_subject_pair[1] + + link_set_object_pair = targets.get(link_set_key.object_ns) + if link_set_object_pair is None: + link_set_object_target=link_set_key.object_ns + objects_target_uri = "http://example.com#" + link_set_object_target + else: + link_set_object_target = link_set_object_pair[0] + objects_target_uri = link_set_object_pair[1] + + link_set_predicate = link_set_key.predicate + global link_properties + link_set_predicate_alias = link_properties.get(link_set_predicate) + # the following really should never happen, since we only evaluate predicates in link_properties.keys() + if link_set_predicate_alias is None: + link_set_predicate_alias = link_set_predicate + logger.error("no alias found for " + link_set_predicate) + + link_set_uri = "#" + link_set_subject_target + "_" + link_set_predicate_alias + "_" + link_set_object_target + + int_datatype_uri = namespaces.get_rdf_namespace("xsd").integer.uri + link_set_node = RDF.Node() + statement_link_set_definition = RDF.Statement(link_set_node, namespaces.get_rdf_namespace("rdf").type, namespaces.get_rdf_namespace("void").Linkset) + statement_linked_triples_value = RDF.Statement(link_set_node, namespaces.get_rdf_namespace("void").triples, RDF.Node(literal=str(link_count), datatype=int_datatype_uri)) + statement_link_predicate = RDF.Statement(link_set_node, namespaces.get_rdf_namespace("void").linkPredicate, RDF.Node(uri_string=link_set_predicate)) + statement_subjects_target = RDF.Statement(link_set_node, namespaces.get_rdf_namespace("void").subjectsTarget, RDF.Node(uri_string=subjects_target_uri)) + statement_objects_target = RDF.Statement(link_set_node, namespaces.get_rdf_namespace("void").objectsTarget, RDF.Node(uri_string=objects_target_uri)) + statement_subset = RDF.Statement(dataset, namespaces.get_rdf_namespace("void").subset, link_set_node ); + + void_model.append(statement_link_set_definition) + void_model.append(statement_linked_triples_value) + void_model.append(statement_link_predicate) + void_model.append(statement_subjects_target) + void_model.append(statement_objects_target) + void_model.append(statement_subset) + + + + def sparql(self, endpoint): + pass + + class LinkSetKey: + def __init__(self, subject_ns, predicate, object_ns ): + self.subject_ns = subject_ns + self.predicate = predicate + self.object_ns = object_ns + + def __attrs(self): + return( self.subject_ns, self.predicate, self.object_ns ) + + def __eq__(self, other): + return isinstance(other, LinkSets.LinkSetKey) and self.__attrs() == other.__attrs() + + def __hash__(self): + return hash(self.__attrs()) + + def __str__(self): + return 'subjectNamespace="'+self.subject_ns+'", predicate="' + self.predicate + '", objectNamespace="' + self.object_ns + '"' \ No newline at end of file diff --git a/lodstats/stats/UsedClasses.py b/lodstats/stats/UsedClasses.py index e94a966..86f2b1d 100644 --- a/lodstats/stats/UsedClasses.py +++ b/lodstats/stats/UsedClasses.py @@ -37,6 +37,12 @@ def count(self, s, p, o, s_blank, o_l, o_blank, statement): def voidify(self, void_model, dataset): namespaces = lodstats.util.rdf_namespaces.RDFNamespaces() datatype_uri = namespaces.get_rdf_namespace("xsd").int.uri + number_of_distinct_classes = str(len(self.usage_count)) + number_of_distinct_classes_node = RDF.Node(literal=number_of_distinct_classes, + datatype=datatype_uri) + void_model.append(RDF.Statement(dataset, + namespaces.get_rdf_namespace("void").classes, + number_of_distinct_classes_node)) for class_uri_k, class_uri_v in self.usage_count.iteritems(): class_partitions_node = RDF.Node() diff --git a/lodstats/stats/__init__.py b/lodstats/stats/__init__.py index 58de299..a24222a 100644 --- a/lodstats/stats/__init__.py +++ b/lodstats/stats/__init__.py @@ -39,6 +39,7 @@ from LabeledSubjects import LabeledSubjects from SameAs import SameAs from Links import Links +from LinkSets import LinkSets # max per property # avg per property from Vocabularies import Vocabularies, VocabulariesPerNode @@ -77,7 +78,7 @@ # stats for owl, rdf-schema, -syntax vocab_stats = [RDFSyntax, RDFSchema, Owl] # stats necessary for VoiD -void_stats = [Vocabularies, Entities, ClassesDefined, UsedClasses, PropertiesDefined, PropertyUsage, DistinctSubjects, DistinctObjects] +void_stats = [Vocabularies, Entities, ClassesDefined, UsedClasses, PropertiesDefined, PropertyUsage, DistinctSubjects, DistinctObjects, LinkSets] # links only link_stats = [Links] diff --git a/lodstats/util/archiveextractor.py b/lodstats/util/archiveextractor.py index fc97b33..7a57c7f 100644 --- a/lodstats/util/archiveextractor.py +++ b/lodstats/util/archiveextractor.py @@ -93,6 +93,9 @@ def extract_archive(self, uri, filepath, compression_format, callback_function): elif compression_format == 'zip': extracted_file_uri = self.decompress_zip(filepath, callback_function) f.close() + if( filepath.startswith('/tmp')): + os.remove(filepath) + logger.debug("removed file %s" % filepath) # call back one last time to push final numbers if callback_function is not None: callback_function(self) diff --git a/lodstats/util/namespace.py b/lodstats/util/namespace.py index 82f2d1b..93d1cd0 100644 --- a/lodstats/util/namespace.py +++ b/lodstats/util/namespace.py @@ -31,7 +31,7 @@ def get_namespace(uri): if not uri.startswith('http://'): return None uri_no_http = uri[len('http://'):] - for sep in ['#', ':', '/']: + for sep in ['#', '/' , ':']: split_uri = uri_no_http.rsplit(sep, 1) if len(split_uri) == 2: # base_uri is uri minus non-namepsace-part diff --git a/lodstats/util/rdf2rdf.py b/lodstats/util/rdf2rdf.py index 11725d2..34b9323 100644 --- a/lodstats/util/rdf2rdf.py +++ b/lodstats/util/rdf2rdf.py @@ -43,5 +43,9 @@ def convert_ttl_to_nt(self, uri=None): for data in stdout: logger.debug(str(data)) + if( input_file_path.startswith('/tmp')): + #remove the input file since we don't need it any more + os.remove(input_file_path) + logger.debug("removed file %s" % input_file_path) return "file://%s" % output_file_path diff --git a/lodstats/util/rdf_namespaces.py b/lodstats/util/rdf_namespaces.py index 5c74052..9160519 100644 --- a/lodstats/util/rdf_namespaces.py +++ b/lodstats/util/rdf_namespaces.py @@ -16,7 +16,9 @@ class RDFNamespaces(object): "xsd": "http://www.w3.org/2001/XMLSchema#", "stats": "http://example.org/XStats#", "foaf": "http://xmlns.com/foaf/0.1/", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#" + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "skos": "http://www.w3.org/2004/02/skos/core#", + "gndo": "http://d-nb.info/standards/elementset/gnd#" } def __init__(self): diff --git a/lodstats/util/rdffile.py b/lodstats/util/rdffile.py index 5b06373..2d2ffe2 100644 --- a/lodstats/util/rdffile.py +++ b/lodstats/util/rdffile.py @@ -3,6 +3,7 @@ import RDF import warnings import logging +import os logger = logging.getLogger("lodstats") @@ -29,11 +30,16 @@ def __init__(self, uri, callback_function=None, stats=None, rdf_format=None): logger.debug("Converting turtle to ntriples") rdf2rdf = RDF2RDF(self.get_uri()) self.set_uri(rdf2rdf.convert_ttl_to_nt()) + self.set_rdf_format(rdf_format='nt') logger.debug("Converted turtle to ntriples %s" % self.get_uri()) self.rdf_parser = self.identify_rdf_parser() if(self.rdf_parser is not None): self.rdf_stream = self.rdf_parser.parse_as_stream(self.uri) self.do_stats(callback_function) + file_path = self.get_uri()[7:] + if( file_path.startswith('/tmp')): + os.remove(file_path) + logger.debug("removed file %s" % file_path) def get_no_of_statements(self): return self.no_of_statements