Skip to content

Instantly share code, notes, and snippets.

@scossu
Created August 12, 2016 15:28
Show Gist options
  • Save scossu/762947612333c051a1ecbea21b90204c to your computer and use it in GitHub Desktop.
Save scossu/762947612333c051a1ecbea21b90204c to your computer and use it in GitHub Desktop.
from combine.config.lpm import lpm as conf
from combine.connectors.lake_connector import LakeConnector, LPMConnector
from combine.connectors.ts_index_connector import TsIndexConnector
from combine.modules.base_module import BaseModule
from combine.modules.lake.namespaces import ns_pfx_sparql
from combine.modules.task import Task
class LpmSync(BaseModule):
    '''@package combine.modules.sync

    Synchronizes the LAKE master Fedora repository with another Fedora
    repository designated as the "Public Mirror" (LPM), containing a subset
    of the master repo.
    '''

    def __init__(self):
        '''Instantiate the connectors used by the sync tasks: the LAKE master
        repository, the public mirror (LPM) and the triplestore index.
        '''
        self._lconn = LakeConnector()
        self._lpmconn = LPMConnector()
        self._tsidxconn = TsIndexConnector()


    #### TASKS ####

    @Task
    def sync_entity(self, uri, ingest=True):
        '''Syncs one resource from the LAKE master repo with the public mirror.

        This task verifies if the resource referenced by the URI is viable for
        publishing, and at the same time gathers the triples vetted for
        publishing and builds the graph that is sent to the LPM.

        The conditions for a resource to be published are:

        1. The resource is of a RDF type listed in the `type_whitelist`
           property of the configuration; OR
        2. The resource is a pcdm:File or a pcdm:FileSet and is indirectly
           related to a resource of a white-listed RDF type as per 1.

        The properties to be retrieved for the resource are specified in the
        `predicate_whitelist` configuration property.

        Some values can also be restricted, e.g. server-managed RDF types.
        These are listed in the `object_blacklist` configuration property.
        (Only `object_blacklist` is applied by this method; no
        `object_filter_regexp` setting is read here.)

        @param uri (string) Full URI of the resource in the master repository
        that is to be mirrored.
        @param ingest (boolean) Whether to perform the actual ingest. If
        False, the graph is built and rewritten but nothing is sent to the
        LPM.

        @return The result of the LPM connector's `put_ldpc()` call if
        `ingest` is true; otherwise the graph that would have been ingested.
        '''
        # Ingest target URI.
        target = self._replace_prefix(uri)

        retrieve_qry = self._build_retrieve_query(uri, target)
        # `_logger` is not defined in this class; presumably it is provided
        # by BaseModule.
        self._logger.debug(
                'Sending query to triplestore: {}'.format(retrieve_qry))
        # The result must support `update()` and `serialize()` (presumably an
        # rdflib Graph - confirm against TsIndexConnector).
        res = self._tsidxconn.query(retrieve_qry, 'construct')

        # Occurrences of the master resource URI in the subject have already
        # been replaced with the LPM resource URI in the query, but we still
        # need to replace the same occurrences in the object.
        update_qry = self._build_update_query()
        self._logger.debug('Update query: {}'.format(update_qry))
        res.update(update_qry)

        if not ingest:
            # Dry run: hand back the graph without touching the LPM.
            self._logger.debug('Ingest disabled; not sending triples to LPM.')
            return res

        self._logger.debug('Ingesting triples into LPM: {}'.format(
                res.serialize(format='n3')))

        return self._lpmconn.put_ldpc(target, res)


    ## PRIVATE MEMBERS ##

    def _build_retrieve_query(self, uri, target):
        '''Build the SPARQL CONSTRUCT query that gathers the publishable
        triples of a master resource, with the subject already rewritten to
        the LPM URI.

        @param uri (string) Full URI of the resource in the master repository.
        @param target (string) URI that the resource will have in the LPM.

        @return (string) The SPARQL query.
        '''
        # This is a format()-parsable string.
        # NOTE(review): the second top-level UNION branch only binds ?pr and
        # ?t, not ?p / ?o, so it appears to contribute no triples to the
        # CONSTRUCT template - confirm the intent.
        retrieve_qry_base = '''{pfx_decl}
CONSTRUCT {{ <{target}> ?p ?o }}
WHERE {{
{{
<{uri}> a ?t .
<{uri}> ?p ?o .
VALUES ?p {{ {pred_wl} }} .
FILTER ( {type_wl} ) .
FILTER ( {obj_bl} ) .
}} UNION {{
{{
<{uri}> a hw:FileSet .
?pr pcdm:hasMember <{uri}> .
}} UNION {{
<{uri}> a hw:File .
?pr pcdm:hasMember/pcdm:hasFile <{uri}> .
}}
?pr a ?t .
FILTER ( {type_wl} ) .
}}
}}'''

        # An empty list would otherwise yield a dangling comparison such as
        # `FILTER ( ?t =  )`. An empty type whitelist matches nothing; an
        # empty object blacklist excludes nothing.
        type_wl = ' || '.join(
                '?t = ' + rdf_type
                for rdf_type in conf['type_whitelist']) or 'false'
        obj_bl = ' && '.join(
                '?o != ' + obj
                for obj in conf['object_blacklist']) or 'true'

        return retrieve_qry_base.format(
            uri=uri,
            target=target,
            pfx_decl='\n'.join(ns_pfx_sparql.values()),
            pred_wl=' '.join(conf['predicate_whitelist']),
            type_wl=type_wl,
            obj_bl=obj_bl,
        )


    def _build_update_query(self):
        '''Build the SPARQL update that rewrites IRI objects pointing to the
        master repository so that they point to the LPM instead.

        @return (string) The SPARQL update query.
        '''
        # NOTE(review): SPARQL REPLACE() treats its second argument as a
        # regular expression, so regex metacharacters in the master base URL
        # are not matched literally - confirm this is acceptable.
        update_qry_base = '''DELETE {{ ?s ?p ?o }}
INSERT {{ ?s ?p ?o1 }}
WHERE {{
?s ?p ?o .
FILTER isIRI (?o) .
FILTER STRSTARTS (STR(?o), "{master_uri}") .
BIND (URI (REPLACE (STR (?o), "{master_uri}", "{lpm_uri}")) AS ?o1) .
}}'''

        return update_qry_base.format(
            master_uri=self._lconn.conf['base_url'],
            lpm_uri=self._lpmconn.conf['base_url'])


    def _replace_prefix(self, input):
        '''Replace URI prefixes.

        Every occurrence of the master repository base URL in the string is
        replaced with the LPM base URL.

        @param input (string) URI in the master repository. (The name shadows
        the `input` builtin; it is kept for backward compatibility with
        keyword callers.)

        @return (string) The corresponding URI in the LPM.
        '''
        return input.replace(
                self._lconn.conf['base_url'], self._lpmconn.conf['base_url'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment