Created
April 14, 2023 15:39
-
-
Save SandyRogers/36f04c12739a0bf7a644c4cf218232c7 to your computer and use it in GitHub Desktop.
Package SANNTIS annotations as RO-Crates for MGnify
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html> | |
<head> | |
<title>{{ crate.name or "New RO Crate" }}</title> | |
<meta name="keywords" content="RO Crate"> | |
<style type="text/css"> | |
html { | |
margin: 0; | |
padding: 0; | |
} | |
body { | |
font-family: Arial, sans-serif; | |
color: #333; | |
font-size: 14px; | |
background: #eee; | |
margin: 0; | |
padding: 0; | |
} | |
.main { | |
max-width: 900px; | |
margin: auto; | |
background: #fff; | |
padding: 4em; | |
} | |
.data-entity, .context-entity { | |
margin: 0.5em 0; | |
background: #fafafa; | |
border-left: 3px solid #18974c; | |
padding: 0.5em; | |
border-radius: 4px; | |
} | |
.context-entity { | |
border-left: 3px solid #734595; | |
} | |
dt { | |
font-weight: bold; | |
} | |
dd { | |
margin-bottom: 10px; | |
} | |
h1 { | |
font-size: 40px; | |
} | |
.wf_image { | |
padding: 10px 0 10px 0; | |
width: 100%; | |
} | |
.RO_crate_logo { | |
display: block; | |
margin-left: auto; | |
margin-right: auto; | |
width: 150px; | |
padding:20px; | |
} | |
</style> | |
</head> | |
<body> | |
<div class='main'> | |
{# RO-Crate logo as inline SVG. The XML declaration and SVG DOCTYPE that an SVG editor exported here were dropped: inside an HTML <body> they are not valid markup (HTML parsers treat them as a bogus comment / parse error). #}
<svg width="100%" height="100%" viewBox="0 0 100 100" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;" class="RO_crate_logo"> | |
<g transform="matrix(1.21845,0,0,1.21845,-10.9223,-13.4809)"> | |
<g transform="matrix(1,0,0,1,-40.4006,31.1723)"> | |
<path d="M94.866,31.747L94.889,23.304L110.569,23.304L110.569,43.265C108.45,45.316 105.379,47.122 101.357,48.683C97.335,50.244 93.262,51.024 89.138,51.024C83.897,51.024 79.328,49.925 75.432,47.726C71.536,45.527 68.607,42.382 66.648,38.292C64.688,34.202 63.708,29.753 63.708,24.945C63.708,19.727 64.802,15.09 66.99,11.034C69.177,6.978 72.379,3.868 76.594,1.703C79.807,0.039 83.806,-0.792 88.591,-0.792C94.812,-0.792 103.169,3.121 103.169,3.121L95.889,9.991C95.889,9.991 91.508,7.855 88.591,7.855C84.171,7.855 80.656,9.257 78.047,12.059C75.438,14.862 74.133,19.021 74.133,24.535C74.133,30.482 75.455,34.943 78.098,37.916C80.741,40.89 84.205,42.377 88.489,42.377C90.608,42.377 92.733,41.961 94.863,41.129C96.994,40.297 98.822,39.289 100.349,38.104L100.349,31.747L94.866,31.747Z" style="fill:rgb(26,28,26);fill-rule:nonzero;"/> | |
</g> | |
<g transform="matrix(0.850731,0,0,1.02684,-17.0584,15.0796)"> | |
<rect x="82.3" y="38.322" width="27.9" height="8.467" style="fill:rgb(26,28,26);"/> | |
</g> | |
<g id="g8171" transform="matrix(-0.282039,0,0,0.282039,93.5815,-1.03103)"> | |
<g id="rect8173" transform="matrix(0.912195,0.409756,-0.409756,0.912195,0,0)"> | |
<rect x="151.738" y="6.821" width="15.036" height="131.925" style="fill:rgb(26,28,26);stroke:white;stroke-width:0.78px;"/> | |
</g> | |
<g id="rect8175" transform="matrix(0.900702,0.434438,-0.402871,0.915257,0,0)"> | |
<rect x="144.088" y="33.443" width="28.964" height="86.68" style="fill:rgb(26,28,26);stroke:white;stroke-width:0.98px;"/> | |
</g> | |
<circle id="circle8177" cx="143.08" cy="132.429" r="22.553" style="fill:rgb(26,28,26);stroke:white;stroke-width:0.78px;"/> | |
</g> | |
</g> | |
<g transform="matrix(1,0,0,1,-7.44699,-6.09141)"> | |
<g transform="matrix(85.3333,0,0,85.3333,5.42212,79.8384)"> | |
<path d="M0.028,-0.299C0.053,-0.299 0.073,-0.306 0.089,-0.319C0.105,-0.332 0.115,-0.35 0.121,-0.372C0.126,-0.395 0.129,-0.433 0.129,-0.487C0.129,-0.542 0.13,-0.577 0.132,-0.595C0.135,-0.622 0.14,-0.644 0.148,-0.661C0.156,-0.677 0.166,-0.691 0.178,-0.7C0.189,-0.71 0.204,-0.718 0.223,-0.723C0.235,-0.726 0.255,-0.728 0.283,-0.728L0.311,-0.728L0.311,-0.651L0.295,-0.651C0.262,-0.651 0.239,-0.645 0.228,-0.633C0.217,-0.621 0.211,-0.594 0.211,-0.551C0.211,-0.466 0.21,-0.412 0.206,-0.39C0.2,-0.355 0.19,-0.328 0.176,-0.309C0.162,-0.29 0.14,-0.273 0.109,-0.259C0.145,-0.244 0.171,-0.221 0.187,-0.19C0.203,-0.159 0.211,-0.109 0.211,-0.039C0.211,0.024 0.212,0.062 0.213,0.074C0.216,0.096 0.223,0.112 0.233,0.121C0.244,0.129 0.264,0.134 0.295,0.134L0.311,0.134L0.311,0.21L0.283,0.21C0.251,0.21 0.228,0.208 0.214,0.203C0.193,0.195 0.176,0.183 0.162,0.166C0.148,0.149 0.14,0.128 0.135,0.103C0.131,0.077 0.129,0.035 0.129,-0.024C0.129,-0.083 0.126,-0.123 0.121,-0.146C0.115,-0.168 0.105,-0.186 0.089,-0.199C0.073,-0.212 0.053,-0.219 0.028,-0.219L0.028,-0.299Z" style="fill:rgb(26,28,26);fill-rule:nonzero;"/> | |
</g> | |
</g> | |
<g transform="matrix(-1,0,0,1,107.306,-6.09141)"> | |
<g transform="matrix(85.3333,0,0,85.3333,5.42212,79.8384)"> | |
<path d="M0.028,-0.299C0.053,-0.299 0.073,-0.306 0.089,-0.319C0.105,-0.332 0.115,-0.35 0.121,-0.372C0.126,-0.395 0.129,-0.433 0.129,-0.487C0.129,-0.542 0.13,-0.577 0.132,-0.595C0.135,-0.622 0.14,-0.644 0.148,-0.661C0.156,-0.677 0.166,-0.691 0.178,-0.7C0.189,-0.71 0.204,-0.718 0.223,-0.723C0.235,-0.726 0.255,-0.728 0.283,-0.728L0.311,-0.728L0.311,-0.651L0.295,-0.651C0.262,-0.651 0.239,-0.645 0.228,-0.633C0.217,-0.621 0.211,-0.594 0.211,-0.551C0.211,-0.466 0.21,-0.412 0.206,-0.39C0.2,-0.355 0.19,-0.328 0.176,-0.309C0.162,-0.29 0.14,-0.273 0.109,-0.259C0.145,-0.244 0.171,-0.221 0.187,-0.19C0.203,-0.159 0.211,-0.109 0.211,-0.039C0.211,0.024 0.212,0.062 0.213,0.074C0.216,0.096 0.223,0.112 0.233,0.121C0.244,0.129 0.264,0.134 0.295,0.134L0.311,0.134L0.311,0.21L0.283,0.21C0.251,0.21 0.228,0.208 0.214,0.203C0.193,0.195 0.176,0.183 0.162,0.166C0.148,0.149 0.14,0.128 0.135,0.103C0.131,0.077 0.129,0.035 0.129,-0.024C0.129,-0.083 0.126,-0.123 0.121,-0.146C0.115,-0.168 0.105,-0.186 0.089,-0.199C0.073,-0.212 0.053,-0.219 0.028,-0.219L0.028,-0.299Z" style="fill:rgb(26,28,26);fill-rule:nonzero;"/> | |
</g> | |
</g> | |
</svg> | |
<h1>{{ crate.name or "New RO Crate" }}</h1> | |
<p> | |
{% if crate.description %} | |
{{ crate.description }} | |
{%endif %} | |
</p> | |
{% if crate.image %} | |
<img src="{{ crate.image }}" class="wf_image"/> | |
{%endif %} | |
<dl> | |
{% if crate.creator %} | |
{% if is_object_list(crate.creator) %} | |
<dt>Creators</dt> | |
{% for obj in crate.creator %} | |
<dd>{{ stringify(obj) }}</dd> | |
{% endfor %} | |
{% else %} | |
<dt>Creator</dt> | |
<dd>{{ stringify(crate.creator) }}</dd> | |
{%endif %} | |
{%endif %} | |
{% if crate.publisher %} | |
{% if is_object_list(crate.publisher) %} | |
<dt>Publishers</dt> | |
{% for obj in crate.publisher %} | |
<dd>{{ stringify(obj) }}</dd> | |
{% endfor %} | |
{% else %} | |
<dt>Publisher</dt> | |
<dd>{{ stringify(crate.publisher) }}</dd> | |
{%endif %} | |
{%endif %} | |
{% if crate.url %}
  <dt>URL</dt>
  {# The anchor previously had no text, so the URL was never visible. `target` takes a browsing-context name, not the URL itself; open in a new tab without exposing window.opener. #}
  <dd><a href="{{ crate.url }}" target="_blank" rel="noopener noreferrer">{{ crate.url }}</a></dd>
{% endif %}
{% if crate.license %} | |
<dt>License</dt> | |
<dd>{{ crate.license }}</dd> | |
{%endif %} | |
{% if crate.keywords %} | |
<dt>Keyword(s)</dt> | |
<dd>{{ stringify(crate.keywords) }}</dd> | |
{%endif %} | |
{% if crate.isBasedOn %} | |
<dt>isBasedOn</dt> | |
<dd>{{ crate.isBasedOn }}</dd> | |
{%endif %} | |
{% if crate.datePublished %} | |
<dt>datePublished</dt> | |
<dd>{{ crate.datePublished }}</dd> | |
{%endif %} | |
{% if crate.CreativeWorkStatus %} | |
<dt>CreativeWorkStatus</dt> | |
<dd>{{ crate.CreativeWorkStatus }}</dd> | |
{%endif %} | |
</dl> | |
{# One card per data entity of the crate (`data` is the list of data-entity JSON-LD dicts passed to render()). #}
<h2>Contents</h2>
<div id="contents">
  {% for entry in data %}
    {# The empty id="" attribute was removed: an id must be non-empty and unique, and this div is repeated per entry. #}
    <div class="data-entity">
      <strong>Data entity</strong>
      <a class="data-entity-link" href="{{ entry['@id'] }}">{{ entry['@id'] }}</a>
      <p>Type: {{ stringify(entry['@type']) }}</p>
      {% if entry['programmingLanguage'] %}
        <p>ProgrammingLanguage: {{ entry['programmingLanguage']['@id'] }}</p>
      {% endif %}
    </div>
  {% endfor %}
</div>
{# Variables declared on the root dataset via "variableMeasured". Each list item links to the matching entity in the Metadata section by its @id fragment. #}
{% set variables = crate.root_dataset.get("variableMeasured") %}
{% if variables %}
  <h2>Dataset variables</h2>
  <div id="variables">
    <dl>
      <dt>Variable definitions</dt>
      {% if is_object_list(variables) %}
        {% for obj in variables %}
          <dd><a href="#{{ obj['@id'] }}">{{ stringify(obj) }}</a></dd>
        {% endfor %}
      {% else %}
        {# Single value or plain strings: render the variables themselves (this branch used to print crate.publisher by mistake). #}
        <dd>{{ stringify(variables) }}</dd>
      {% endif %}
    </dl>
  </div>
{% endif %}
{# One card per contextual entity (`context` is the list of contextual-entity JSON-LD dicts passed to render()). #}
<h2>Metadata</h2>
<div id="metadata">
  {% for entry in context %}
    {# The entity @id is the fragment target used by the "#<@id>" links in the Dataset variables section. It now lives on the card itself: the former <a id=".."/> was not self-closing in HTML and left an anchor open around the following content. #}
    <div class="context-entity" id="{{ entry['@id'] }}">
      <strong>{{ stringify(entry['@type']) }}</strong>
      {# NOTE(review): the entities created by the accompanying script set "name", not "@name", so this first branch looks unreachable - confirm the intended key. #}
      {% if entry['@name'] %}
        {% if entry['@name'].startswith('http') %}
          <a class="data-entity-link" href="{{ entry['@name'] }}">{{ entry['@name'] }}</a>
        {% else %}
          {{ entry['@name'] }}
        {% endif %}
      {% else %}
        {% if entry['@id'].startswith('http') %}
          <a class="data-entity-link" href="{{ entry['@id'] }}">{{ entry['@id'] }}</a>
        {% else %}
          {{ entry['@id'] }}
        {% endif %}
      {% endif %}
      {# <dt>/<dd> must be children of a <dl>; they were previously nested in a <p>, which HTML parsers close implicitly. #}
      <dl>
        {% for detail in details(entry) %}
          <dt>{{ stringify(detail) }}</dt>
          <dd>{{ stringify(entry[detail]) }}</dd>
        {% endfor %}
      </dl>
    </div>
  {% endfor %}
</div>
</div> | |
</body> | |
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from rocrate.rocrate import ROCrate | |
from rocrate.model.dataset import Dataset | |
from rocrate.model.preview import Preview | |
from rocrate.model.contextentity import ContextEntity | |
from uuid import uuid4 | |
import os | |
from datetime import datetime | |
from jinja2 import Template | |
class MGnifyPreview(Preview):
    """Preview entity whose HTML is rendered from the MGnify Jinja2 template."""

    def generate_html(self):
        """Render the crate's HTML preview from the MGnify template.

        The template ``mgnify-rocrate-preview-template.html.j2`` is opened
        relative to the current working directory. Three helpers
        (``stringify``, ``is_object_list`` and ``details``) are registered as
        template globals before rendering.

        Returns:
            str: The rendered HTML document.
        """
        # The context manager closes the file even if reading or parsing the
        # template raises.
        with open('mgnify-rocrate-preview-template.html.j2', encoding='utf-8') as template:
            # NOTE(review): the Template is created without autoescaping, so
            # crate metadata is inserted into the HTML verbatim - confirm the
            # metadata is always trusted.
            src = Template(template.read())

        def template_function(func):
            """Expose ``func`` to the template under its own name."""
            src.globals[func.__name__] = func
            return func

        @template_function
        def stringify(a):
            """Return a display string for a metadata value.

            Lists are joined with ``', '``; objects carrying a ``_jsonld``
            dict with a ``name`` are shown by that name; dicts are shown as
            their joined values; anything else falls back to ``str()``.
            """
            if isinstance(a, list):
                return ', '.join(stringify(item) for item in a)
            if isinstance(a, str):
                return a
            if hasattr(a, '_jsonld') and a._jsonld.get('name'):
                return stringify(a._jsonld['name'])
            if isinstance(a, dict):
                return stringify(list(a.values()))
            # Always return a str so that the ', '.join above cannot fail on
            # numbers, None or entities without a name.
            return str(a)

        @template_function
        def is_object_list(a):
            """Return True when ``a`` is a list holding at least one non-string item."""
            # The previous `obj is not str` compared each item with the type
            # object itself and was therefore true for every item.
            return isinstance(a, list) and any(not isinstance(obj, str) for obj in a)

        @template_function
        def details(a):
            """Return the entries of dict ``a`` except ``@id`` and ``@type``.

            Non-dict input yields an empty dict so the template can always
            iterate over the result.
            """
            if isinstance(a, dict):
                return {k: v for k, v in a.items() if k not in ('@id', '@type')}
            return {}

        # The template works on the raw JSON-LD dicts of the entities.
        context_entities = [entity._jsonld for entity in self.crate.contextual_entities]
        data_entities = [entity._jsonld for entity in self.crate.data_entities]
        return src.render(crate=self.crate, context=context_entities, data=data_entities)
def create_sanntis_rocrate(gff_path: str):
    """Package one SanntiS GFF output file as a zipped RO-Crate.

    The assembly accession is derived from ``gff_path``: the text following
    the first ``ERZ`` up to the next ``.`` or ``_``, prefixed with ``ERZ``.
    The crate records the SanntiS source code, its funding, the running
    organisation, a ``CreateAction`` for the run, the GFF file itself and two
    ``PropertyValue`` entities describing GFF attributes of interest.

    Args:
        gff_path: Path to the SanntiS GFF file. Must contain ``ERZ``.

    Returns:
        None. The crate is written to
        ``./crates/sanntis_<version>_<assembly>.zip``. When ``gff_path``
        contains no ``ERZ`` a message is printed and nothing is written.
    """
    # Explicit check instead of a bare `except:` around the parsing, which
    # would also have hidden unrelated errors (even KeyboardInterrupt).
    if 'ERZ' not in gff_path:
        print(f'Could not determine assembly accession from path {gff_path}')
        return
    assembly = 'ERZ' + gff_path.split('ERZ')[1].split('.')[0].split('_')[0]

    # The custom preview is added by hand, so the default one is disabled.
    crate = ROCrate(gen_preview=False)
    crate.add(MGnifyPreview(crate))

    # Conform to the WFRUN profile
    PC_PROFILE_ID = "https://w3id.org/ro/wfrun/process/0.1"
    pc_profile = crate.add(ContextEntity(crate, PC_PROFILE_ID, properties={
        "@type": "CreativeWork",
        "name": "Process Run Crate",
        "version": "0.1"
    }))
    crate.root_dataset["conformsTo"] = pc_profile

    crate.name = f'SANNTIS predictions for assembly {assembly}'
    crate.description = (
        "SanntiS (SMBGC Annotation using Neural Networks Trained on Interpro Signatures) "
        "predicts secondary metabolite biosynthetic gene clusters.\n"
        f"This is the output (a GFF feature file) of SanntiS being run on the MGnify assembly {assembly}."
    )

    # Workflow Provenance
    sourcecode = crate.add(ContextEntity(crate, "https://github.com/Finn-Lab/SanntiS", properties={
        "@type": "SoftwareSourceCode",
        "name": "SanntiS: SMBGC Annotation using Neural Networks Trained on Interpro Signatures",
        "alternateName": "emeraldBGC",
        "url": "https://github.com/Finn-Lab/SanntiS",
        "codeRepository": "https://github.com/Finn-Lab/SanntiS",
        "version": "0.2.3",
    }))

    bbsrc = crate.add(ContextEntity(crate, "https://ror.org/00cwqg982", properties={
        "@type": "Organization",
        "name": "BBSRC",
        "alternateName": "Biotechnology and Biological Sciences Research Council",
        "url": "http://www.bbsrc.ac.uk/"
    }))

    emerald_grant = crate.add(ContextEntity(crate, "BB/S009043/1", properties={
        "@type": "Grant",
        "name": "EMERALD - Enriching MEtagenomics Results using Artificial intelligence and Literature Data",
        "url": "https://gtr.ukri.org/projects?ref=BB%2FS009043%2F1"
    }))

    emerald_grant.append_to("funder", bbsrc)
    sourcecode.append_to("funding", emerald_grant)

    # The run
    agent = crate.add(ContextEntity(crate, "https://ror.org/02catss52", properties={
        "@type": "Organization",
        "name": "EMBL-EBI",
        "url": "https://www.ebi.ac.uk/metagenomics"
    }))
    crate.creator = agent

    # NOTE(review): getctime is the metadata-change time on Unix and the
    # creation time on Windows - confirm it is the intended proxy for the
    # end of the run.
    fin = os.path.getctime(gff_path)

    ## Add GFF output file
    gff = crate.add_file(
        gff_path,
        properties={
            "name": "annotations gff",
            "encodingFormat": "text/x-gff3"
        }
    )

    ## Add link
    run_id = uuid4().hex
    run = crate.add(ContextEntity(crate, run_id, properties={
        "@type": "CreateAction",
        "name": f"SanntiS run on {assembly}",
        # astimezone() attaches the local UTC offset so the ISO 8601
        # timestamp is unambiguous to consumers in other time zones.
        "endTime": datetime.fromtimestamp(fin).astimezone().isoformat(),
        "description": "",
    }))
    run.append_to("result", gff)
    run.append_to("agent", agent)
    run.append_to("instrument", sourcecode)

    ## Describe the GFF columns of interest
    gff_cols = [
        crate.add(ContextEntity(crate, 'gff_attribute_nearest_mibig', properties={
            "@type": "PropertyValue",
            "name": "Nearest MiBIG",
            "url": "https://mibig.secondarymetabolites.org/repository",
            "description": "The nearest_MiBIG attribute in the GFF column 9 is the closest predicted BGC from the MiBIG ontology.",
            "value": "nearest_MiBIG",
            "propertyId": "https://mibig.secondarymetabolites.org/repository/@value",
        })),
        crate.add(ContextEntity(crate, 'gff_attribute_nearest_mibig_class', properties={
            "@type": "PropertyValue",
            "name": "Nearest MiBIG class",
            "url": "https://mibig.secondarymetabolites.org",
            "description": "The nearest_MiBIG_class attribute in the GFF column 9 is one of the 6 (or other) BGC types from the MiBIG ontology.",
            "value": "nearest_MiBIG_class",
            "propertyId": "https://mibig.secondarymetabolites.org/",
        }))
    ]
    for col in gff_cols:
        crate.root_dataset.append_to("variableMeasured", col)

    crate.write_zip(f"./crates/sanntis_{sourcecode['version']}_{assembly}.zip")
# Build one crate for every GFF path listed (one per line) in gffPaths.txt.
with open('gffPaths.txt', 'r') as paths:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in paths:
        path = line.strip()
        if not path:
            # Blank lines (e.g. a trailing newline) are not paths; skip them
            # rather than reporting a bogus "could not determine" message.
            continue
        # Print the stripped path so the log has no doubled newlines.
        print(path)
        create_sanntis_rocrate(path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment