Created
November 3, 2014 13:34
-
-
Save davidread/979a7d63f6d813052255 to your computer and use it in GitHub Desktop.
Metadata provenance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Example dataset extra field "metadata_provenance" | |
This dataset originates at Barnet Open Data, is then harvested to London Datastore and then harvested onto data.gov.uk. When it is viewed in data.gov.uk it has this metadata_provenance: | |
[ | |
{ | |
"activity_occurred": "2014-10-21T09:04:19.753433", | |
"activity": "harvest", | |
"harvest_source_url": "https://open.barnet.gov.uk/", | |
"harvest_source_title": "Barnet Open Data", | |
"harvest_source_type": "ckan", | |
"harvested_guid": "a9347796-00da-40e6-8ca5-63069a1154dd", | |
"harvested_metadata_modified": "2014-09-05T09:04:19.753433" | |
}, | |
{ | |
"activity_occurred": "2014-10-22T10:05:49.475839", | |
"activity": "harvest", | |
"harvest_source_url": "https://open.london.gov.uk/", | |
"harvest_source_title": "London Datastore", | |
"harvest_source_type": "ckan", | |
"harvested_guid": "a9347796-00da-40e6-8ca5-63069a1154dd", | |
"harvested_metadata_modified": "2014-10-08T09:04:19.753433" | |
} | |
] | |
CKAN code that does this: | |
https://github.com/datagovuk/ckanext-harvest/blob/2.0/ckanext/harvest/harvesters/base.py#L272-L307 | |
class BaseHarvester:
    '''Harvester helpers that build the "metadata_provenance" dataset extra.

    The extra is a JSON-encoded list with one dict per activity (e.g. one
    per harvest), oldest first.
    '''

    @classmethod
    def get_metadata_provenance_for_just_this_harvest(cls, harvest_object):
        '''Returns the provenance record describing only this harvest.

        :param harvest_object: the object being harvested. Its ``source``
            (``url``, ``title`` and ``type``), ``guid`` and
            ``metadata_modified_date`` attributes are read.
        :returns: a dict of JSON-serializable values.
            ``harvested_metadata_modified`` is None when the harvest object
            has no (truthy) ``metadata_modified_date``.
        '''
        source = harvest_object.source
        modified = harvest_object.metadata_modified_date
        return {
            # Naive UTC timestamp, so the ISO string carries no UTC offset.
            'activity_occurred': datetime.datetime.utcnow().isoformat(),
            'activity': 'harvest',
            'harvest_source_url': source.url,
            'harvest_source_title': source.title,
            'harvest_source_type': source.type,
            'harvested_guid': harvest_object.guid,
            'harvested_metadata_modified':
                modified.isoformat() if modified else None,
        }

    @classmethod
    def get_metadata_provenance(cls, harvest_object, harvested_provenance=None):
        '''Returns the metadata_provenance for a dataset, which is the details
        of this harvest added onto any existing metadata_provenance value in
        the dataset. This should be stored in the metadata_provenance extra
        when harvesting.

        Provenance is a record of harvests, imports and perhaps other
        activities of production too, as suggested by W3C PROV.

        This helps keep track when a dataset is created in site A, imported
        into site B, harvested into site C and from there is harvested into
        site D. The metadata_provenance will be a list of four dicts with the
        details: [A, B, C, D].

        :param harvest_object: the object being harvested (see
            get_metadata_provenance_for_just_this_harvest for the attributes
            that are read).
        :param harvested_provenance: the metadata_provenance already present
            on the harvested dataset: a list of dicts, a JSON string encoding
            such a list, or None / an empty string if there is none. A list
            passed in is not modified.
        :returns: the combined provenance as a JSON-encoded string.
        :raises TypeError: if harvested_provenance is not (or does not decode
            to) a list.
        :raises ValueError: if harvested_provenance is a non-empty string
            that is not valid JSON.
        '''
        try:
            string_types = basestring  # Python 2: covers str and unicode
        except NameError:
            string_types = str  # Python 3 has no basestring

        if harvested_provenance is None:
            harvested_provenance = []
        elif isinstance(harvested_provenance, string_types):
            if harvested_provenance.strip():
                harvested_provenance = json.loads(harvested_provenance)
            else:
                # An empty extra value carries no provenance; json.loads
                # would reject it.
                harvested_provenance = []

        # Explicit check rather than `assert`, which is removed under -O.
        if not isinstance(harvested_provenance, list):
            raise TypeError(
                'harvested_provenance must be a list or a JSON-encoded list, '
                'not %s' % type(harvested_provenance).__name__)

        # Concatenation builds a new list, leaving the caller's list as is.
        metadata_provenance = harvested_provenance + \
            [cls.get_metadata_provenance_for_just_this_harvest(harvest_object)]
        return json.dumps(metadata_provenance)
This is called in import_stage(): | |
https://github.com/datagovuk/ckanext-harvest/blob/2.0/ckanext/harvest/harvesters/ckanharvester.py#L438-L441 | |
# Metadata provenance: carry over whatever the harvested dataset already
# recorded and append the details of this harvest.
harvested_provenance = \
    package_dict_harvested['extras'].get('metadata_provenance')
package_dict['extras']['metadata_provenance'] = \
    self.get_metadata_provenance(
        harvest_object=harvest_object,
        harvested_provenance=harvested_provenance)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment