AKSW · larsgsvensson · Sep 21, 2017 · Sep 21, 2017 · Sep 21, 2017 · Sep 21, 2017
diff --git a/config/link_properties.txt b/config/link_properties.txt
@@ -0,0 +1,11 @@
+# Format: <property URI> , <short name>. Only the properties listed here are evaluated for the linksets; the short name is used to build the LinkSet name
+http://www.w3.org/2004/02/skos/core#broadMatch , skos-broadMatch
+http://www.w3.org/2004/02/skos/core#narrowMatch , skos-narrowMatch
+http://www.w3.org/2004/02/skos/core#closeMatch , skos-closeMatch
+http://www.w3.org/2004/02/skos/core#exactMatch , skos-exactMatch
+http://www.w3.org/2004/02/skos/core#relatedMatch , skos-relatedMatch
+http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy1 , gnd-ddc1
+http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy2 , gnd-ddc2
+http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy3 , gnd-ddc3
+http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy4 , gnd-ddc4
+http://www.w3.org/2002/07/owl#sameAs , owl-sameAs
diff --git a/config/targets.txt b/config/targets.txt
@@ -0,0 +1,12 @@
+# Format: <URI prefix>, <dataset name>, <dataset identifier URI>. The dataset name is used to build the LinkSet name
+# The dataset identifier is used as the void:subjectsTarget and the void:objectsTarget of each void:Linkset
+# TODO: this mapping could possibly be generated from other VoID files
+http://d-nb.info/gnd/, GND, http://d-nb.info/dataset/authorities
+http://lod.gesis.org/thesoz/concept/, TheSoz, http://lod.gesis.org/thesoz/void#thesoz
+http://id.loc.gov/authorities/subjects/, LCSH, http://id.loc.gov/authorities/subjects/
+http://zbw.eu/stw/descriptor/,STW, http://zbw.eu/stw
+http://dewey.info/class/, DDC, http://dewey.info/class/
+http://dbpedia.org/resource/, dbpedia, http://dbpedia.org/void/Dataset
+http://viaf.org/viaf/, viaf, http://viaf.org/viaf/data
+http://sws.geonames.org/, geonames, http://sws.geonames.org/
+http://data.bnf.fr/ark:/12148/, data.bnf.fr, http://data.bnf.fr/
diff --git a/lodstats/RDFStats.py b/lodstats/RDFStats.py
@@ -89,10 +89,10 @@ def start_statistics(self):
                                                            callback_function=self.callback_function_statistics,
                                                            rdf_format=self.rdf_format)
 
-        logger.debug("Clean up the /tmp folder")
-        for filename in os.listdir("/tmp"):
-            if filename.startswith("lodstats"):
-                os.remove(os.path.join("/tmp", filename))
+#        logger.debug("Clean up the /tmp folder")
+#        for filename in os.listdir("/tmp"):
+#            if filename.startswith("lodstats"):
+#                os.remove(os.path.join("/tmp", filename))
 
 
     def get_stats_results(self):

diff --git a/lodstats/stats/ClassesDefined.py b/lodstats/stats/ClassesDefined.py
@@ -44,13 +44,6 @@ def count(self, s, p, o, s_blank, o_l, o_blank, statement):
     def voidify(self, void_model, dataset):
         namespaces = lodstats.util.rdf_namespaces.RDFNamespaces()
         datatype_uri = namespaces.get_rdf_namespace("xsd").integer.uri
-        number_of_distinct_classes = str(self.results['count'])
-        number_of_distinct_classes_node = RDF.Node(literal=number_of_distinct_classes, 
-                                          datatype=datatype_uri)
-        void_model.append(RDF.Statement(dataset,
-                                        namespaces.get_rdf_namespace("void").classes,
-                                        number_of_distinct_classes_node))
-
 	for class_uri_k, class_uri_v in self.usage_count.iteritems():
 	    class_partitions_node = RDF.Node()
 

diff --git a/lodstats/stats/LinkSets.py b/lodstats/stats/LinkSets.py
@@ -0,0 +1,146 @@
+from RDFStatInterface import RDFStatInterface
+from lodstats.util.namespace import get_namespace
+import lodstats.util.rdf_namespaces
+import RDF
+import logging
+
+logger = logging.getLogger("lodstats")
+
+# URI prefix -> [dataset name, dataset identifier URI]; filled from config/targets.txt
+targets = {}
+# link predicate URI -> short alias; filled from config/link_properties.txt
+link_properties = {}
+
+
+def _read_config(path, field_count):
+	"""Yield the stripped, comma-separated fields of each data line in path.
+
+	Blank lines and lines starting with '#' are skipped.
+	Raises ValueError (naming file and line) if a data line does not have
+	exactly field_count fields.
+	"""
+	with open(path) as config_file:
+		for line_number, line in enumerate(config_file, 1):
+			line = line.strip()
+			if not line or line.startswith('#'):
+				continue
+			fields = [field.strip() for field in line.split(',')]
+			if len(fields) != field_count:
+				raise ValueError("%s:%d: expected %d comma-separated fields, got %d"
+					% (path, line_number, field_count, len(fields)))
+			yield fields
+
+
+def _is_resource(node):
+	"""Return whether node is a resource.
+
+	is_resource may be a method (then the bare attribute is always truthy)
+	or a plain attribute; both are handled.
+	"""
+	flag = node.is_resource
+	return flag() if callable(flag) else flag
+
+
+def _resolve_target(namespace):
+	"""Return (dataset name, dataset URI) for a namespace.
+
+	Namespaces missing from targets keep their own name and get a placeholder URI.
+	"""
+	target = targets.get(namespace)
+	if target is None:
+		return namespace, "http://example.com#" + namespace
+	return target[0], target[1]
+
+
+class LinkSets(RDFStatInterface):
+	"""Count links between different namespaces and describe them as void:Linksets.
+
+	Only predicates listed in link_properties are counted.
+	"""
+
+	def __init__(self, results):
+		"""Set up the counter and (re)load both config files.
+
+		The config paths are relative to the current working directory.
+		"""
+		super(LinkSets, self).__init__(results)
+		self.usage_count = self.results['usage_count'] = {}
+		for uri_prefix, dataset_name, dataset_id in _read_config('../config/targets.txt', 3):
+			targets[uri_prefix] = [dataset_name, dataset_id]
+		for property_uri, short_name in _read_config('../config/link_properties.txt', 2):
+			link_properties[property_uri] = short_name
+
+	def count(self, s, p, o, s_blank, o_l, o_blank, statement):
+		"""Count the triple if it links two different namespaces via a configured predicate."""
+		if not (_is_resource(statement.subject) and _is_resource(statement.object)):
+			return
+
+		subject_ns = get_namespace(s)
+		object_ns = get_namespace(o)
+		if subject_ns is None or object_ns is None:
+			return
+
+		# plain dict membership: .keys() would build a list for every triple in Python 2
+		if subject_ns != object_ns and p in link_properties:
+			key = LinkSets.LinkSetKey(subject_ns, p, object_ns)
+			self.usage_count[key] = self.usage_count.get(key, 0) + 1
+
+	def voidify(self, void_model, dataset):
+		"""Append one void:Linkset (as a void:subset of dataset) per counted key to void_model."""
+		namespaces = lodstats.util.rdf_namespaces.RDFNamespaces()
+		rdf_ns = namespaces.get_rdf_namespace("rdf")
+		void_ns = namespaces.get_rdf_namespace("void")
+		int_datatype_uri = namespaces.get_rdf_namespace("xsd").integer.uri
+
+		for link_set_key, link_count in self.usage_count.iteritems():
+			subject_name, subjects_target_uri = _resolve_target(link_set_key.subject_ns)
+			object_name, objects_target_uri = _resolve_target(link_set_key.object_ns)
+
+			predicate = link_set_key.predicate
+			alias = link_properties.get(predicate)
+			# should never happen, since count() only accepts predicates in link_properties
+			if alias is None:
+				alias = predicate
+				logger.error("no alias found for " + predicate)
+
+			# NOTE(review): this name is built but not used yet; the Linkset below is a blank node
+			link_set_uri = "#" + subject_name + "_" + alias + "_" + object_name
+
+			link_set_node = RDF.Node()
+			statements = [
+				RDF.Statement(link_set_node, rdf_ns.type, void_ns.Linkset),
+				RDF.Statement(link_set_node, void_ns.triples, RDF.Node(literal=str(link_count), datatype=int_datatype_uri)),
+				RDF.Statement(link_set_node, void_ns.linkPredicate, RDF.Node(uri_string=predicate)),
+				RDF.Statement(link_set_node, void_ns.subjectsTarget, RDF.Node(uri_string=subjects_target_uri)),
+				RDF.Statement(link_set_node, void_ns.objectsTarget, RDF.Node(uri_string=objects_target_uri)),
+				RDF.Statement(dataset, void_ns.subset, link_set_node),
+			]
+			for statement in statements:
+				void_model.append(statement)
+
+	def sparql(self, endpoint):
+		pass
+
+	class LinkSetKey(object):
+		"""Hashable key: (subject namespace, predicate, object namespace)."""
+
+		def __init__(self, subject_ns, predicate, object_ns):
+			self.subject_ns = subject_ns
+			self.predicate = predicate
+			self.object_ns = object_ns
+
+		def __attrs(self):
+			return (self.subject_ns, self.predicate, self.object_ns)
+
+		def __eq__(self, other):
+			return isinstance(other, LinkSets.LinkSetKey) and self.__attrs() == other.__attrs()
+
+		def __ne__(self, other):
+			# Python 2 does not derive != from __eq__
+			return not self.__eq__(other)
+
+		def __hash__(self):
+			return hash(self.__attrs())
+
+		def __str__(self):
+			return 'subjectNamespace="'+self.subject_ns+'", predicate="' + self.predicate + '", objectNamespace="' + self.object_ns + '"'
+
diff --git a/lodstats/stats/UsedClasses.py b/lodstats/stats/UsedClasses.py
@@ -37,6 +37,12 @@ def count(self, s, p, o, s_blank, o_l, o_blank, statement):
     def voidify(self, void_model, dataset):
 	namespaces = lodstats.util.rdf_namespaces.RDFNamespaces()
         datatype_uri = namespaces.get_rdf_namespace("xsd").int.uri
+        number_of_distinct_classes = str(len(self.usage_count))
+        number_of_distinct_classes_node = RDF.Node(literal=number_of_distinct_classes, 
+            datatype=datatype_uri)
+        void_model.append(RDF.Statement(dataset,
+            namespaces.get_rdf_namespace("void").classes,
+            number_of_distinct_classes_node))
 
 	for class_uri_k, class_uri_v in self.usage_count.iteritems():
 		class_partitions_node = RDF.Node()

diff --git a/lodstats/stats/__init__.py b/lodstats/stats/__init__.py
@@ -39,6 +39,7 @@
 from LabeledSubjects import LabeledSubjects
 from SameAs import SameAs
 from Links import Links
+from LinkSets import LinkSets
 # max per property
 # avg per property
 from Vocabularies import Vocabularies, VocabulariesPerNode
@@ -77,7 +78,7 @@
 # stats for owl, rdf-schema, -syntax
 vocab_stats = [RDFSyntax, RDFSchema, Owl]
 # stats necessary for VoiD
-void_stats = [Vocabularies, Entities, ClassesDefined, UsedClasses, PropertiesDefined, PropertyUsage, DistinctSubjects, DistinctObjects]
+void_stats = [Vocabularies, Entities, ClassesDefined, UsedClasses, PropertiesDefined, PropertyUsage, DistinctSubjects, DistinctObjects, LinkSets]
 # links only
 link_stats = [Links]
 

diff --git a/lodstats/util/archiveextractor.py b/lodstats/util/archiveextractor.py
@@ -93,6 +93,9 @@ def extract_archive(self, uri, filepath, compression_format, callback_function):
         elif compression_format == 'zip':
             extracted_file_uri = self.decompress_zip(filepath, callback_function)
         f.close()
+        if( filepath.startswith('/tmp')):
+            os.remove(filepath)
+            logger.debug("removed file %s" % filepath)
         # call back one last time to push final numbers
         if callback_function is not None:
             callback_function(self)

diff --git a/lodstats/util/namespace.py b/lodstats/util/namespace.py
@@ -31,7 +31,7 @@ def get_namespace(uri):
     if not uri.startswith('http://'):
         return None
     uri_no_http = uri[len('http://'):]
-    for sep in ['#', ':', '/']:
+    for sep in ['#', '/' , ':']:
         split_uri = uri_no_http.rsplit(sep, 1)
         if len(split_uri) == 2:
             # base_uri is uri minus non-namepsace-part

diff --git a/lodstats/util/rdf2rdf.py b/lodstats/util/rdf2rdf.py
@@ -43,5 +43,9 @@ def convert_ttl_to_nt(self, uri=None):
 
         for data in stdout:
             logger.debug(str(data))
+        if( input_file_path.startswith('/tmp')):
+            #remove the input file since we don't need it any more
+            os.remove(input_file_path)
+            logger.debug("removed file %s" % input_file_path)
 
         return "file://%s" % output_file_path
diff --git a/lodstats/util/rdf_namespaces.py b/lodstats/util/rdf_namespaces.py
@@ -16,7 +16,9 @@ class RDFNamespaces(object):
             "xsd": "http://www.w3.org/2001/XMLSchema#",
             "stats": "http://example.org/XStats#",
             "foaf": "http://xmlns.com/foaf/0.1/",
-            "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
+            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+			"skos": "http://www.w3.org/2004/02/skos/core#",
+			"gndo": "http://d-nb.info/standards/elementset/gnd#"
             }
 
     def __init__(self):

diff --git a/lodstats/util/rdffile.py b/lodstats/util/rdffile.py
@@ -3,6 +3,7 @@
 import RDF
 import warnings
 import logging
+import os
 
 logger = logging.getLogger("lodstats")
 
@@ -29,11 +30,16 @@ def __init__(self, uri, callback_function=None, stats=None, rdf_format=None):
             logger.debug("Converting turtle to ntriples")
             rdf2rdf = RDF2RDF(self.get_uri())
             self.set_uri(rdf2rdf.convert_ttl_to_nt())
+            self.set_rdf_format(rdf_format='nt')
             logger.debug("Converted turtle to ntriples %s" % self.get_uri())
         self.rdf_parser = self.identify_rdf_parser()
         if(self.rdf_parser is not None):
             self.rdf_stream = self.rdf_parser.parse_as_stream(self.uri)
         self.do_stats(callback_function)
+        file_path = self.get_uri()[7:]
+        if( file_path.startswith('/tmp')):
+            os.remove(file_path)
+            logger.debug("removed file %s" % file_path)
 
     def get_no_of_statements(self):
         return self.no_of_statements