Created
August 12, 2016 15:28
-
-
Save scossu/762947612333c051a1ecbea21b90204c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from combine.config.lpm import lpm as conf | |
from combine.connectors.lake_connector import LakeConnector, LPMConnector | |
from combine.connectors.ts_index_connector import TsIndexConnector | |
from combine.modules.base_module import BaseModule | |
from combine.modules.lake.namespaces import ns_pfx_sparql | |
from combine.modules.task import Task | |
class LpmSync(BaseModule):
    '''@package combine.modules.sync

    Synchronizes the LAKE master Fedora repository with another Fedora
    repository designated as the "Public Mirror", containing a subset of the
    master repo.
    '''

    def __init__(self):
        '''Set up the connectors to the master repo, the public mirror and
        the triplestore index.
        '''
        # NOTE(review): BaseModule.__init__ is not invoked here, while
        # sync_entity relies on self._logger, which is presumably provided by
        # BaseModule -- confirm it is available without calling the parent
        # constructor.
        self._lconn = LakeConnector()
        self._lpmconn = LPMConnector()
        self._tsidxconn = TsIndexConnector()

    #### TASKS ####

    @Task
    def sync_entity(self, uri, ingest=True):
        '''Syncs one resource from the LAKE master repo with the public mirror.

        This task verifies if the resource referenced by the URI is viable for
        publishing, and at the same time gathers the triples vetted for
        publishing and builds the graph that is sent to the LPM.

        The conditions for a resource to be published are:

        1. The resource is of a RDF type listed in the `type_whitelist`
           property of the configuration; OR
        2. The resource is a pcdm:File or a pcdm:FileSet and is indirectly
           related to a resource of a white-listed RDF type as per 1.

        The properties to be retrieved for the resource are specified in the
        `predicate_whitelist` configuration property.

        Some values can also be restricted, e.g. server-managed RDF types.
        These can be indicated in the `object_blacklist` and
        `object_filter_regexp` properties. Only `object_blacklist` is applied
        by this method at present.

        @param uri (string) Full URI of the resource in the master repository
        that is to be mirrored.
        @param ingest (boolean) Whether to perform the actual ingest. If
        false, the graph is built but nothing is sent to the LPM.

        @return The result of LPMConnector.put_ldpc() if `ingest` is true;
        otherwise the graph that would have been ingested.
        '''
        # Ingest target URI.
        target = self._replace_prefix(uri)

        # This is a format()-parsable string: literal SPARQL braces are
        # doubled.
        # NOTE(review): the second UNION branch (FileSet / File) binds neither
        # ?p nor ?o, so the CONSTRUCT template yields no triples for solutions
        # coming from that branch -- confirm this is intended.
        retrieve_qry_base = '''{pfx_decl}
        CONSTRUCT {{ <{target}> ?p ?o }}
        WHERE {{
          {{
            <{uri}> a ?t .
            <{uri}> ?p ?o .
            VALUES ?p {{ {pred_wl} }} .
            FILTER ( {type_wl} ) .
            FILTER ( {obj_bl} ) .
          }} UNION {{
            {{
              <{uri}> a hw:FileSet .
              ?pr pcdm:hasMember <{uri}> .
            }} UNION {{
              <{uri}> a hw:File .
              ?pr pcdm:hasMember/pcdm:hasFile <{uri}> .
            }}
            ?pr a ?t .
            FILTER ( {type_wl} ) .
          }}
        }}'''

        retrieve_qry = retrieve_qry_base.format(
            uri=uri,
            target=target,
            pfx_decl='\n'.join(ns_pfx_sparql.values()),
            pred_wl=' '.join(conf['predicate_whitelist']),
            type_wl='?t = ' + ' || ?t = '.join(conf['type_whitelist']),
            obj_bl='?o != ' + ' && ?o != '.join(conf['object_blacklist']),
        )
        # Use the module logger rather than print() so that the query shows up
        # alongside the other debug output of this task.
        self._logger.debug(
            'Sending query to triplestore: {}'.format(retrieve_qry))

        res = self._tsidxconn.query(retrieve_qry, 'construct')

        # Occurrences of the master resource URI in the subject have already
        # been replaced with the LPM resource URI in the query, but we still
        # need to replace the same occurrences in the object.
        #
        # The prefix is swapped with CONCAT/SUBSTR instead of REPLACE():
        # REPLACE() interprets its pattern as a regular expression, so
        # characters such as "." or "?" in the base URL would act as
        # metacharacters, and every match -- not only the leading one -- would
        # be rewritten.
        update_qry_base = '''DELETE {{ ?s ?p ?o }}
        INSERT {{ ?s ?p ?o1 }}
        WHERE {{
          ?s ?p ?o .
          FILTER isIRI (?o) .
          FILTER STRSTARTS (STR(?o), "{master_uri}") .
          BIND (URI (CONCAT ("{lpm_uri}", SUBSTR (STR (?o), STRLEN ("{master_uri}") + 1))) AS ?o1) .
        }}'''
        update_qry = update_qry_base.format(
            master_uri=self._lconn.conf['base_url'],
            lpm_uri=self._lpmconn.conf['base_url'])
        self._logger.debug('Update query: {}'.format(update_qry))

        # The update runs against the CONSTRUCT result held locally in `res`.
        res.update(update_qry)

        # Honor the `ingest` flag: a dry run hands back the graph untouched by
        # the LPM.
        if not ingest:
            return res

        self._logger.debug('Ingesting triples into LPM: {}'.format(
            res.serialize(format='n3')))

        return self._lpmconn.put_ldpc(target, res)

    ## PRIVATE MEMBERS ##

    def _replace_prefix(self, input):
        '''Replace URI prefixes.

        Every occurrence of the master repo base URL in the string is
        replaced with the public mirror base URL.

        @param input (string) URI in the master repository.

        @return (string) The corresponding URI in the public mirror.
        '''
        return input.replace(
            self._lconn.conf['base_url'], self._lpmconn.conf['base_url'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment