Skip to content

Instantly share code, notes, and snippets.

@ktym ktym/refseq2ttl.rb
Last active Dec 17, 2015

Embed
What would you like to do?
refseq2ttl.rb version 8 (migrated into https://github.com/dbcls/rdfsummit/tree/master/insdc2ttl in 2014)
#!/usr/bin/env ruby
require 'rubygems'
require 'uri'
require 'bio'
require 'json'
require 'securerandom'
# [TODO] integrate this into BioRuby
module Bio
class GenBank
def dblink
fetch('DBLINK')
end
def project
dblink[/Project: (\d+)/, 1]
end
def bioproject
dblink[/BioProject: (\S+)/, 1]
end
def biosample
dblink[/BioSample: (\S+)/, 1]
end
end
end
###
### Utilities for RDF generation
###
module RDFSupport
def new_uuid(prefix = "http://purl.jp/bio/10/genome/uuid/")
#return "<#{prefix}#{SecureRandom.uuid}>"
#return "genome:uuid-#{SecureRandom.uuid}"
return "<urn:uuid:#{SecureRandom.uuid}>"
end
def quote(str)
return str.gsub('\\', '\\\\').gsub("\t", '\\t').gsub("\n", '\\n').gsub("\r", '\\r').gsub('"', '\\"').inspect
end
def triple(s, p, o)
return [s, p, o].join("\t") + " ."
end
def default_prefix
return [
triple("@prefix", "rdf:", "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>"),
triple("@prefix", "rdfs:", "<http://www.w3.org/2000/01/rdf-schema#>"),
#triple("@prefix", "dcterms:", "<http://purl.org/dc/terms/>"),
triple("@prefix", "xsd:", "<http://www.w3.org/2001/XMLSchema#>"),
#triple("@prefix", "sio:", "<http://semanticscience.org/resource#>"),
#triple("@prefix", "so:", "<http://purl.org/obo/owl/SO#>"),
triple("@prefix", "obo:", "<http://purl.obolibrary.org/obo/>"),
triple("@prefix", "faldo:", "<http://biohackathon.org/resource/faldo#>"),
]
end
def usdate2date(str)
return Date.parse(str).strftime("%Y-%m-%d")
end
end
###
### Mapping RefSeq db_xref to Identifiers.org
###
# https://gist.github.com/3985701
# https://gist.github.com/4146256
class RS_ID
include RDFSupport
def initialize
@rs_id = JSON.parse(File.read(File.dirname(__FILE__) + "/refseq2ttl/rs_id.json"))
end
def fetch(db)
@rs_id[db]
end
def labels
@rs_id.sort.each do |db, hash|
puts triple("insdc:#{hash['class']}", "rdfs:label", quote(hash['label']))
end
end
end
###
### Mapping RefSeq feature table to Sequence Ontology
###
# https://gist.github.com/3650401
class FT_SO
def initialize
@data = JSON.parse(File.read(File.dirname(__FILE__) + "/refseq2ttl/ft_so.json"))
@data["ncRNA"] = {
"so_id" => "SO:0000655",
"so_term" => "ncRNA",
"ft_desc" => "noncoding RNA",
"so_desc" => "An RNA transcript that does not encode for a protein rather the RNA molecule is the gene product."
}
end
# ftso = FT_SO.new
# puts ftso.so_id("-10_signal") # => "SO:0000175"
def so_id(feature)
if hash = @data[feature]
return hash["so_id"]
end
end
def so_term(feature)
if hash = @data[feature]
return hash["so_term"]
end
end
def so_desc(feature)
if hash = @data[feature]
return hash["so_desc"]
end
end
def ft_desc(feature)
if hash = @data[feature]
return hash["ft_desc"]
end
end
end
###
### Convert RefSeq (prokaryote) entries to RDF
###
class RefSeq2RDF
include RDFSupport
def initialize(io = nil, seqtype = nil)
set_prefixes
@seqtype = seqtype
@rs_id = RS_ID.new
@ft_so = FT_SO.new
@gene = {}
@xref_warn = {}
@feature_count = Hash.new(0)
puts prefix
puts
parse_refseq(io) if io
end
attr_accessor :prefix
def self.prefixes
puts prefix
end
def set_prefixes
@prefix = default_prefix + [
#triple("@prefix", "genome:", "<http://purl.jp/bio/10/genome/>"),
#triple("@prefix", "insdc:", "<http://insdc.org/owl/>"),
triple("@prefix", "insdc:", "<http://ddbj.nig.ac.jp/ontologies/sequence/>"),
]
end
def xref(subject, db, id)
case db
when "HOMD"
id.sub!(/^tax_/, '')
when "ECOCYC"
#id = "ECOCYC:#{id}"
when "GI", "ERIC", "HMP", "PSEUDO", "Pathema"
unless @xref_warn[db]
$stderr.puts "Warning: Need to register '#{db}' in Identifiers.org"
@xref_warn[db] = true
end
end
if hash = @rs_id.fetch(db)
uri = "<#{hash['prefix']}/#{id}>"
puts triple(subject, "rdfs:seeAlso", uri)
puts triple(uri, "rdfs:label", quote(id))
#puts triple(uri, "rdf:type", "idorg:#{hash['class']}")
puts triple(uri, "rdf:type", "<#{hash['prefix']}>")
else
unless @xref_warn[db]
$stderr.puts "Error: New database '#{db}' found. Add it to the rs_id.json file and/or Identifiers.org."
@xref_warn[db] = true
end
end
end
###
### FALDO http://biohackathon.org/faldo
###
def new_location(pos, subpart_type = false)
loc_id = new_uuid
puts triple(loc_id, "insdc:location", quote(pos))
@locations = Bio::Locations.new(pos)
pos_start = new_uuid
pos_end = new_uuid
puts triple(loc_id, "rdf:type", "faldo:Region")
puts triple(loc_id, "faldo:begin", pos_start)
puts triple(loc_id, "faldo:end", pos_end)
# [TODO] Note that positions of an object located over the origin can be faldo:begin > faldo:end
# e.g., join(800..900,1000..1024,1..234) will be faldo:start 800 and faldo:end 234
new_position(pos_start, @locations.first.from, @locations.first.strand)
new_position(pos_end, @locations.last.to, @locations.last.strand)
list = []
if subpart_type
@locations.each do |loc|
subpart_id = new_uuid
subpart_start = new_uuid
subpart_end = new_uuid
puts triple(subpart_id, "obo:so_part_of", loc_id)
puts triple(subpart_id, "rdf:type", subpart_type[:id]) + " # #{subpart_type[:term]}"
puts triple(subpart_id, "rdf:type", "faldo:Region")
puts triple(subpart_id, "faldo:begin", subpart_start)
puts triple(subpart_id, "faldo:end", subpart_end)
new_position(subpart_start, loc.from, loc.strand)
new_position(subpart_end, loc.to, loc.strand)
list << subpart_id
end
end
return loc_id, list
end
def new_position(pos_id, pos, strand)
puts triple(pos_id, "faldo:position", pos)
puts triple(pos_id, "faldo:reference", @sequence_id)
puts triple(pos_id, "rdf:type", "faldo:ExactPosition")
if strand > 0
puts triple(pos_id, "rdf:type", "faldo:ForwardStrandPosition")
else
puts triple(pos_id, "rdf:type", "faldo:ReverseStrandPosition")
end
end
###
### Main
###
def parse_refseq(io)
# Read RefSeq entry
Bio::FlatFile.auto(io).each do |entry|
@entry = entry
@features = entry.features
@source = @features.shift
parse_sequence
parse_source
parse_genes
parse_features
end
end
###
### Sequence
###
# [TODO]
# * bind sequences by BioProject ID?
# * flag complete/draft?
def parse_sequence
@sequence_id = new_uuid
# [TODO] How to identify the input is chromosome/plasmid/contig/...?
sequence_type(@seqtype)
# [TODO] Obtain rdfs:label from source /chromosome (eukaryotes) /plasmid (prokaryotes) -> see insdc:source_chromosome, insdc:source_plasmid
sequence_label(@entry.definition)
sequence_version(@entry.acc_version)
sequence_length(@entry.nalen)
# [TODO] provide REST API to retreive genomic DNA sequence by <@sequence_id.fasta>
sequence_seq(@entry.acc_version)
sequence_form(@entry.circular)
# [TODO] sequenced date, modified in the source db or in our RDF data?
sequence_date(@entry.date)
# [TODO] rdfs:seeAlso (like UniProt) or dc:relation, owl:sameAs
sequence_link_gi(@entry.gi.sub('GI:',''))
sequence_link_accver(@entry.acc_version)
if bioproject = @entry.bioproject
sequence_link_bioproject(bioproject)
elsif project = @entry.project
sequence_link_bioproject("PRJNA#{project}")
end
if biosample = @entry.biosample
sequence_link_biosample(biosample)
end
# [TODO] how to deal with direct submissions (references without PMID)?
sequence_ref(@entry.references)
end
def sequence_type(so = "SO:chromosome")
case so
when /0000340/, "SO:chromosome"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000340") + " # SO:chromosome"
when /0000155/, "SO:plasmid"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000155") + " # SO:plasmid"
when /0000736/, "SO:organelle_sequence"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000736") + " # SO:organelle_sequence"
when /0000819/, "SO:mitochondrial_chromosome"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000819") + " # SO:mitochondrial_chromosome"
when /0000740/, "SO:plastid_sequence"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000740") + " # SO:plastid_sequence"
when /0000719/, "SO:ultracontig"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000719") + " # SO:ultracontig"
when /0000148/, "SO:supercontig", "SO:scaffold"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000148") + " # SO:supercontig/scaffold"
when /0000149/, "SO:contig"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000149") + " # SO:contig"
else
puts triple(@sequence_id, "rdf:type", "obo:SO_0000353") + " # SO:sequence_assembly"
end
end
def sequence_label(str)
# Use "name:" key in the JSON representation
puts triple(@sequence_id, "rdfs:label", quote(str))
end
def sequence_version(str)
puts triple(@sequence_id, "insdc:sequence_version", quote(str))
end
def sequence_length(int)
puts triple(@sequence_id, "insdc:sequence_length", int)
end
def sequence_seq(str)
# [TODO] Where to privide the actual DNA sequence?
fasta_uri = "<http://togows.org/entry/nucleotide/#{str}.fasta>"
#fasta_uri = "<http://www.ncbi.nlm.nih.gov/nuccore/#{str}?report=fasta>"
puts triple(@sequence_id, "insdc:sequence_fasta", fasta_uri)
end
def sequence_form(form)
case form
when "linear"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000987") + " # SO:linear"
when "circular"
puts triple(@sequence_id, "rdf:type", "obo:SO_0000988") + " # SO:circular"
end
end
def sequence_date(date)
puts triple(@sequence_id, "insdc:sequence_date", quote(usdate2date(date))+"^^xsd:date")
end
def sequence_link_gi(str)
xref(@sequence_id, 'GI', str)
end
def sequence_link_accver(str)
xref(@sequence_id, 'RefSeq', str)
end
def sequence_link_bioproject(str)
xref_id = new_uuid
id_pfx = "http://identifiers.org/bioproject"
puts triple(@sequence_id, 'insdc:dblink', xref_id)
puts triple(xref_id, 'rdfs:label', quote(str))
puts triple(xref_id, 'rdfs:seeAlso', "<#{id_pfx}/#{str}>")
puts triple("<#{id_pfx}/#{str}>", 'rdf:type', "<#{id_pfx}>")
end
def sequence_link_biosample(str)
xref_id = new_uuid
id_pfx = "http://identifiers.org/biosample"
puts triple(@sequence_id, 'insdc:dblink', xref_id)
puts triple(xref_id, 'rdfs:label', quote(str))
puts triple(xref_id, 'rdfs:seeAlso', "<#{id_pfx}/#{str}>")
puts triple("<#{id_pfx}/#{str}>", 'rdf:type', "<#{id_pfx}>")
end
def sequence_ref(refs)
refs.each do |ref|
pmid = ref.pubmed
if pmid.length > 0
xref(@sequence_id, 'PubMed', pmid)
end
end
end
###
### Source
###
def parse_source
# Use @sequence_id for @source_id
@source_id = @sequence_id
hash = @source.to_hash
source_location(@source.position)
source_link(hash["db_xref"])
hash.delete("db_xref")
source_qualifiers(hash)
end
def source_location(pos)
loc_id, = new_location(pos)
puts triple(@source_id, "faldo:location", loc_id)
end
def source_link(links)
links.each do |link|
db, entry_id = link.split(':', 2)
xref(@source_id, db, entry_id)
end
end
def source_qualifiers(hash)
hash.each do |qual, vals|
vals.each do |val|
if val == true
puts triple(@source_id, "insdc:#{qual}", true)
else
data = val.to_s.gsub(/\s+/, ' ').strip
if data[/^\d+$/]
puts triple(@source_id, "insdc:#{qual}", data)
else
puts triple(@source_id, "insdc:#{qual}", quote(data))
end
end
end
end
end
###
### Genes
###
def parse_genes
genes = @features.select {|x| x.feature == "gene"}
genes.each do |gene|
gene_id = new_uuid
hash = gene.to_hash
puts triple(gene_id, "rdf:type", "obo:SO_0000704") + " # SO:gene"
puts triple(gene_id, "obo:so_part_of", @sequence_id)
loc_id, _ = new_location(gene.position)
puts triple(gene_id, "faldo:location", loc_id)
if hash["locus_tag"]
locus_tag = hash["locus_tag"].first
@gene[locus_tag] = gene_id
puts triple(gene_id, "rdfs:label", quote(locus_tag))
elsif hash["gene"]
gene = hash["gene"].first
@gene[gene] = gene_id
puts triple(gene_id, "rdfs:label", quote(gene))
else
# [TODO] Where else to find gene name?
end
parse_qualifiers(gene_id, hash)
end
end
###
### Features (part of gene: CDS, mRNA, misc_RNA, precursor_RNA, ncRNA, tRNA, rRNA)
###
def parse_features
features = @features.select {|x| x.feature != "gene" }
features.each do |feat|
feature_id = new_uuid
hash = feat.to_hash
feature = feat.feature
@feature_count[feature] += 1
case feature
when "CDS"
puts triple(feature_id, "rdf:type", "obo:SO_0000316") + " # SO:CDS"
when "mRNA"
puts triple(feature_id, "rdf:type", "obo:SO_0000234") + " # SO:mRNA"
when "misc_RNA"
puts triple(feature_id, "rdf:type", "obo:SO_0000673") + " # SO:transcript"
when "precursor_RNA"
puts triple(feature_id, "rdf:type", "obo:SO_0000185") + " # SO:primary_transcript"
when "ncRNA"
puts triple(feature_id, "rdf:type", "obo:SO_0000655") + " # SO:ncRNA"
when "tRNA"
puts triple(feature_id, "rdf:type", "obo:SO_0000253") + " # SO:tRNA"
when "rRNA"
puts triple(feature_id, "rdf:type", "obo:SO_0000252") + " # SO:rRNA"
else
if so_id = @ft_so.so_id(feature)
if so_id != "undefined"
so = so_id.sub(':', '_')
puts triple(feature_id, "rdf:type", "obo:#{so}") + " # SO:#{@ft_so.so_term(feature)}"
else
puts triple(feature_id, "rdf:type", "obo:SO_0000110") + " # SO:sequence_feature"
end
end
end
gene_id = locus_tag = gene = nil
if hash["locus_tag"]
if locus_tag = hash["locus_tag"].first
gene_id = @gene[locus_tag]
end
elsif hash["gene"]
if gene = hash["gene"].first
gene_id = @gene[gene]
end
end
puts triple(feature_id, "rdfs:label", quote(locus_tag || gene || feature))
puts triple(feature_id, "obo:so_part_of", gene_id || @sequence_id)
if gene_id
subpart_type = { :id => "obo:SO_0000147", :term => "SO:exon" }
else
subpart_type = { :id => "obo:SO_0000001", :term => "SO:region" }
end
loc_id, subparts = new_location(feat.position, subpart_type)
puts triple(feature_id, "faldo:location", loc_id)
unless subparts.empty?
puts triple(feature_id, "obo:so_has_part", "(#{subparts.join(' ')})") # rdf:List
end
parse_qualifiers(feature_id, hash)
end
$stderr.puts "Features: #{@feature_count.to_json}"
end
def parse_qualifiers(feature_id, hash)
hash.each do |qual, vals|
vals.each do |val|
if val == true
puts triple(feature_id, "insdc:#{qual}", true)
else
data = val.to_s.gsub(/\s+/, ' ').strip
case qual
when "protein_id"
xref(feature_id, 'Protein', val)
when "db_xref"
db, id = val.split(':', 2)
# ad hoc
if db == 'InterPro' and @entry.acc_version[/(NC_010994.1|NC_014958.1|NC_015385.1|NC_015386.1|NC_015387.1|NC_015388.1|NC_015389.1)/]
# PRJNA59115/NC_010994.1
# PRJNA62225/NC_014958.1
# PRJNA65781/NC_015385.1
# PRJNA65781/plasmids/NC_015386.1
# PRJNA65783/NC_015387.1
# PRJNA65785/NC_015388.1
# PRJNA65787/NC_015389.1
# /db_xref="InterPro:Chromosomal replication control,
# initiator (DnaA)/regulator (Hda"
xref(feature_id, db, id) if id[/IPR\d+/]
elsif db == "ASAP" and @entry.acc_version[/(NC_017263.1|NC_017264.1|NC_017265.1|NC_017266.1)/]
# PRJNA158537/plasmids/NC_017263.1
# PRJNA158537/plasmids/NC_017264.1
# PRJNA158537/NC_017265.1
# PRJNA158537/plasmids/NC_017266.1
# /db_xref="ASAP:BBE-0004740"
# /db_xref="ASAP:BBE-0004740 ERIC"
xref(feature_id, db, id) unless id[/\s/]
elsif db == 'TIGRFAM' and @entry.acc_version[/NC_013418.2/]
# PRJNA41287/NC_013418.2
# /db_xref="TIGRFAM:TIGR00336, TIGR01740"
# /db_xref="TIGRFAM:TIGR00197; TF"
if id[/,/]
ids = id.split(',').map{|x| x.strip}
ids.each do |x|
xref(feature_id, db, x)
end
else
xref(feature_id, db, id.sub(/;.*/, ''))
end
else
xref(feature_id, db, id)
end
else
if data[/^\d+$/]
puts triple(feature_id, "insdc:#{qual}", data)
else
puts triple(feature_id, "insdc:#{qual}", quote(data))
end
end
end
end
end
end
end
if __FILE__ == $0
require 'getoptlong'
args = GetoptLong.new(
[ '--seqtype', '-t', GetoptLong::REQUIRED_ARGUMENT ],
[ '--prefixes', '-p', GetoptLong::NO_ARGUMENT ],
)
opts = {
:seqtype => "SO:chromosome",
}
args.each_option do |name, value|
case name
when /--seqtype/
opts[:seqtype] = value
when /--prefixes/
opts[:prefixes] = true
end
end
if opts[:prefixes]
RefSeq2RDF.new
else
RefSeq2RDF.new(ARGF, opts[:seqtype])
end
end
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.