Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions config/link_properties.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Format: <property URI> , <short name>. Only the properties listed here are evaluated for link sets; the short name is used to build the LinkSet name
http://www.w3.org/2004/02/skos/core#broadMatch , skos-broadMatch
http://www.w3.org/2004/02/skos/core#narrowMatch , skos-narrowMatch
http://www.w3.org/2004/02/skos/core#closeMatch , skos-closeMatch
http://www.w3.org/2004/02/skos/core#exactMatch , skos-exactMatch
http://www.w3.org/2004/02/skos/core#relatedMatch , skos-relatedMatch
http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy1 , gnd-ddc1
http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy2 , gnd-ddc2
http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy3 , gnd-ddc3
http://d-nb.info/standards/elementset/gnd#relatedDdcWithDegreeOfDeterminacy4 , gnd-ddc4
http://www.w3.org/2002/07/owl#sameAs , owl-sameAs
12 changes: 12 additions & 0 deletions config/targets.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Maps from a prefix to a dataset name (used to build the LinkSet name) and a dataset identifier, which is a URI
# The dataset identifier will be used to identify the void:subjectsTarget and the void:objectsTarget of each void:LinkSet
# This could possibly be generated from other void files...
http://d-nb.info/gnd/, GND, http://d-nb.info/dataset/authorities
http://lod.gesis.org/thesoz/concept/, TheSoz, http://lod.gesis.org/thesoz/void#thesoz
http://id.loc.gov/authorities/subjects/, LCSH, http://id.loc.gov/authorities/subjects/
http://zbw.eu/stw/descriptor/,STW, http://zbw.eu/stw
http://dewey.info/class/, DDC, http://dewey.info/class/
http://dbpedia.org/resource/, dbpedia, http://dbpedia.org/void/Dataset
http://viaf.org/viaf/, viaf, http://viaf.org/viaf/data
http://sws.geonames.org/, geonames, http://sws.geonames.org/
http://data.bnf.fr/ark:/12148/, data.bnf.fr, http://data.bnf.fr/
8 changes: 4 additions & 4 deletions lodstats/RDFStats.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ def start_statistics(self):
callback_function=self.callback_function_statistics,
rdf_format=self.rdf_format)

logger.debug("Clean up the /tmp folder")
for filename in os.listdir("/tmp"):
if filename.startswith("lodstats"):
os.remove(os.path.join("/tmp", filename))
# logger.debug("Clean up the /tmp folder")
# for filename in os.listdir("/tmp"):
# if filename.startswith("lodstats"):
# os.remove(os.path.join("/tmp", filename))


def get_stats_results(self):
Expand Down
7 changes: 0 additions & 7 deletions lodstats/stats/ClassesDefined.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,6 @@ def count(self, s, p, o, s_blank, o_l, o_blank, statement):
def voidify(self, void_model, dataset):
namespaces = lodstats.util.rdf_namespaces.RDFNamespaces()
datatype_uri = namespaces.get_rdf_namespace("xsd").integer.uri
number_of_distinct_classes = str(self.results['count'])
number_of_distinct_classes_node = RDF.Node(literal=number_of_distinct_classes,
datatype=datatype_uri)
void_model.append(RDF.Statement(dataset,
namespaces.get_rdf_namespace("void").classes,
number_of_distinct_classes_node))

for class_uri_k, class_uri_v in self.usage_count.iteritems():
class_partitions_node = RDF.Node()

Expand Down
112 changes: 112 additions & 0 deletions lodstats/stats/LinkSets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from RDFStatInterface import RDFStatInterface
from lodstats.util.namespace import get_namespace
import lodstats.util.rdf_namespaces
import RDF
import logging

logger = logging.getLogger("lodstats")

class LinkSets(RDFStatInterface):
    """Count links between datasets and describe them as void:Linkset nodes.

    A triple is counted as a link when its predicate is listed in the
    link-properties config file and its subject and object are resources
    that live in different namespaces.  Links are tallied per
    (subject namespace, predicate, object namespace) combination in
    ``self.usage_count``, which is also published as
    ``results['usage_count']``.
    """

    # ``targets`` and ``link_properties`` are kept as module-level names
    # (hence the ``global`` statement in the class body) so that they stay
    # reachable exactly where they were before.  ``__init__`` fills them in.
    global targets, link_properties
    # URI prefix -> [dataset name, dataset identifier URI]
    targets = {}
    # link predicate URI -> short name
    link_properties = {}

    # Placeholder prefix used to mint a target URI for namespaces that are
    # not listed in the targets config file.
    _UNKNOWN_TARGET_PREFIX = "http://example.com#"

    def __init__(self, results):
        """Set up the counter and (re)load both config files.

        :param results: result container handed to ``RDFStatInterface``.
        :raises ValueError: if a config line has the wrong number of fields.
        """
        super(LinkSets, self).__init__(results)
        self.usage_count = self.results['usage_count'] = {}
        # The dicts are only mutated, never rebound, so no ``global``
        # statement is needed inside the method.
        for uri_prefix, dataset_name, dataset_id in self._read_config(
                '../config/targets.txt', 3):
            targets[uri_prefix] = [dataset_name, dataset_id]
        for property_uri, short_name in self._read_config(
                '../config/link_properties.txt', 2):
            link_properties[property_uri] = short_name

    @staticmethod
    def _read_config(path, field_count):
        """Parse a comma-separated config file into lists of stripped fields.

        Blank lines and lines whose first non-blank character is ``#`` are
        skipped.

        :param path: config file path, tried relative to the working
            directory first.
        :param field_count: number of comma-separated fields per line.
        :returns: list of field lists, in file order.
        :raises ValueError: if a data line does not have exactly
            ``field_count`` fields (file name and line number included).
        """
        import os  # local import: keeps this block independent of file-level imports

        if not os.path.exists(path):
            # Fall back to the location relative to this package so the stat
            # also works when the process was started from another directory.
            # NOTE(review): assumes the config/ directory sits next to the
            # lodstats/ package, as in the source tree -- confirm for installs.
            module_dir = os.path.dirname(os.path.abspath(__file__))
            candidate = os.path.normpath(os.path.join(module_dir, '..', path))
            if os.path.exists(candidate):
                path = candidate

        rows = []
        with open(path) as config_file:
            for line_number, raw_line in enumerate(config_file, 1):
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                fields = [field.strip() for field in line.split(',')]
                if len(fields) != field_count:
                    raise ValueError(
                        "%s:%d: expected %d comma-separated fields, got %d: %r"
                        % (path, line_number, field_count, len(fields), line))
                rows.append(fields)
        return rows

    @staticmethod
    def _is_resource(node):
        """Return True if ``node`` reports itself as a resource.

        ``is_resource`` may be exposed as a method (as in the Redland Python
        bindings); testing the bare attribute would then always be truthy,
        so call it when it is callable and otherwise treat it as a flag.
        """
        flag = node.is_resource
        return bool(flag() if callable(flag) else flag)

    def _resolve_target(self, namespace):
        """Return ``(dataset name, dataset URI)`` for a namespace.

        Namespaces missing from ``targets`` use the namespace itself as the
        name and a placeholder URI built from it.
        """
        known = targets.get(namespace)
        if known is None:
            return namespace, self._UNKNOWN_TARGET_PREFIX + namespace
        return known[0], known[1]

    def count(self, s, p, o, s_blank, o_l, o_blank, statement):
        """Tally one triple if it links two different namespaces.

        :param s: subject URI string.
        :param p: predicate URI string.
        :param o: object URI string.
        :param s_blank: unused here.
        :param o_l: unused here.
        :param o_blank: unused here.
        :param statement: the statement; its ``subject`` and ``object`` are
            checked for being resources.
        """
        # Cheapest test first: a dict membership check instead of scanning
        # ``link_properties.keys()`` for every triple.
        if p not in link_properties:
            return
        if not (self._is_resource(statement.subject)
                and self._is_resource(statement.object)):
            return

        subject_ns = get_namespace(s)
        object_ns = get_namespace(o)
        if subject_ns is None or object_ns is None:
            return
        if subject_ns == object_ns:
            return

        key = LinkSets.LinkSetKey(subject_ns, p, object_ns)
        self.usage_count[key] = self.usage_count.get(key, 0) + 1

    def voidify(self, void_model, dataset):
        """Append one void:Linkset description per counted combination.

        For every key in ``self.usage_count`` a fresh node is typed as
        void:Linkset, given its triple count, link predicate, subjects and
        objects target, and attached to ``dataset`` via void:subset.

        :param void_model: model the statements are appended to.
        :param dataset: node of the dataset being described.
        """
        namespaces = lodstats.util.rdf_namespaces.RDFNamespaces()
        rdf_ns = namespaces.get_rdf_namespace("rdf")
        void_ns = namespaces.get_rdf_namespace("void")
        int_datatype_uri = namespaces.get_rdf_namespace("xsd").integer.uri

        for key, link_count in self.usage_count.items():
            subject_name, subjects_target_uri = self._resolve_target(key.subject_ns)
            object_name, objects_target_uri = self._resolve_target(key.object_ns)

            predicate = key.predicate
            alias = link_properties.get(predicate)
            # Should never happen: count() only admits predicates that are
            # keys of link_properties.
            if alias is None:
                alias = predicate
                logger.error("no alias found for %s", predicate)

            # NOTE(review): the readable link set name is only logged for
            # now; the Linkset itself is emitted as an anonymous node.
            link_set_name = "#%s_%s_%s" % (subject_name, alias, object_name)
            logger.debug("link set %s: %s triples", link_set_name, link_count)

            link_set_node = RDF.Node()
            statements = (
                RDF.Statement(link_set_node, rdf_ns.type, void_ns.Linkset),
                RDF.Statement(link_set_node, void_ns.triples,
                              RDF.Node(literal=str(link_count),
                                       datatype=int_datatype_uri)),
                RDF.Statement(link_set_node, void_ns.linkPredicate,
                              RDF.Node(uri_string=predicate)),
                RDF.Statement(link_set_node, void_ns.subjectsTarget,
                              RDF.Node(uri_string=subjects_target_uri)),
                RDF.Statement(link_set_node, void_ns.objectsTarget,
                              RDF.Node(uri_string=objects_target_uri)),
                RDF.Statement(dataset, void_ns.subset, link_set_node),
            )
            for void_statement in statements:
                void_model.append(void_statement)

    def sparql(self, endpoint):
        """Not implemented for this statistic."""
        pass

    class LinkSetKey(object):
        """Hashable (subject namespace, predicate, object namespace) key."""

        def __init__(self, subject_ns, predicate, object_ns):
            self.subject_ns = subject_ns
            self.predicate = predicate
            self.object_ns = object_ns

        def _as_tuple(self):
            """Return the three identifying fields as a tuple."""
            return (self.subject_ns, self.predicate, self.object_ns)

        def __eq__(self, other):
            return (isinstance(other, LinkSets.LinkSetKey)
                    and self._as_tuple() == other._as_tuple())

        def __ne__(self, other):
            # Python 2 does not derive ``!=`` from ``__eq__``.
            return not self.__eq__(other)

        def __hash__(self):
            return hash(self._as_tuple())

        def __str__(self):
            return ('subjectNamespace="' + self.subject_ns
                    + '", predicate="' + self.predicate
                    + '", objectNamespace="' + self.object_ns + '"')

        def __repr__(self):
            return "LinkSetKey(%r, %r, %r)" % self._as_tuple()
6 changes: 6 additions & 0 deletions lodstats/stats/UsedClasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ def count(self, s, p, o, s_blank, o_l, o_blank, statement):
def voidify(self, void_model, dataset):
namespaces = lodstats.util.rdf_namespaces.RDFNamespaces()
datatype_uri = namespaces.get_rdf_namespace("xsd").int.uri
number_of_distinct_classes = str(len(self.usage_count))
number_of_distinct_classes_node = RDF.Node(literal=number_of_distinct_classes,
datatype=datatype_uri)
void_model.append(RDF.Statement(dataset,
namespaces.get_rdf_namespace("void").classes,
number_of_distinct_classes_node))

for class_uri_k, class_uri_v in self.usage_count.iteritems():
class_partitions_node = RDF.Node()
Expand Down
3 changes: 2 additions & 1 deletion lodstats/stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from LabeledSubjects import LabeledSubjects
from SameAs import SameAs
from Links import Links
from LinkSets import LinkSets
# max per property
# avg per property
from Vocabularies import Vocabularies, VocabulariesPerNode
Expand Down Expand Up @@ -77,7 +78,7 @@
# stats for owl, rdf-schema, -syntax
vocab_stats = [RDFSyntax, RDFSchema, Owl]
# stats necessary for VoiD
void_stats = [Vocabularies, Entities, ClassesDefined, UsedClasses, PropertiesDefined, PropertyUsage, DistinctSubjects, DistinctObjects]
void_stats = [Vocabularies, Entities, ClassesDefined, UsedClasses, PropertiesDefined, PropertyUsage, DistinctSubjects, DistinctObjects, LinkSets]
# links only
link_stats = [Links]

Expand Down
3 changes: 3 additions & 0 deletions lodstats/util/archiveextractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ def extract_archive(self, uri, filepath, compression_format, callback_function):
elif compression_format == 'zip':
extracted_file_uri = self.decompress_zip(filepath, callback_function)
f.close()
if( filepath.startswith('/tmp')):
os.remove(filepath)
logger.debug("removed file %s" % filepath)
# call back one last time to push final numbers
if callback_function is not None:
callback_function(self)
Expand Down
2 changes: 1 addition & 1 deletion lodstats/util/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_namespace(uri):
if not uri.startswith('http://'):
return None
uri_no_http = uri[len('http://'):]
for sep in ['#', ':', '/']:
for sep in ['#', '/' , ':']:
split_uri = uri_no_http.rsplit(sep, 1)
if len(split_uri) == 2:
# base_uri is uri minus non-namespace-part
Expand Down
4 changes: 4 additions & 0 deletions lodstats/util/rdf2rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,9 @@ def convert_ttl_to_nt(self, uri=None):

for data in stdout:
logger.debug(str(data))
if( input_file_path.startswith('/tmp')):
#remove the input file since we don't need it any more
os.remove(input_file_path)
logger.debug("removed file %s" % input_file_path)

return "file://%s" % output_file_path
4 changes: 3 additions & 1 deletion lodstats/util/rdf_namespaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ class RDFNamespaces(object):
"xsd": "http://www.w3.org/2001/XMLSchema#",
"stats": "http://example.org/XStats#",
"foaf": "http://xmlns.com/foaf/0.1/",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#"
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"skos": "http://www.w3.org/2004/02/skos/core#",
"gndo": "http://d-nb.info/standards/elementset/gnd#"
}

def __init__(self):
Expand Down
6 changes: 6 additions & 0 deletions lodstats/util/rdffile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import RDF
import warnings
import logging
import os

logger = logging.getLogger("lodstats")

Expand All @@ -29,11 +30,16 @@ def __init__(self, uri, callback_function=None, stats=None, rdf_format=None):
logger.debug("Converting turtle to ntriples")
rdf2rdf = RDF2RDF(self.get_uri())
self.set_uri(rdf2rdf.convert_ttl_to_nt())
self.set_rdf_format(rdf_format='nt')
logger.debug("Converted turtle to ntriples %s" % self.get_uri())
self.rdf_parser = self.identify_rdf_parser()
if(self.rdf_parser is not None):
self.rdf_stream = self.rdf_parser.parse_as_stream(self.uri)
self.do_stats(callback_function)
file_path = self.get_uri()[7:]
if( file_path.startswith('/tmp')):
os.remove(file_path)
logger.debug("removed file %s" % file_path)

def get_no_of_statements(self):
return self.no_of_statements
Expand Down