Skip to content

Instantly share code, notes, and snippets.

@ktym
Created January 18, 2014 10:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ktym/8488467 to your computer and use it in GitHub Desktop.
Save ktym/8488467 to your computer and use it in GitHub Desktop.
Generate BioDAS XML from a SPARQL endpoint which stores genome sequences/annotations using FALDO
#!/usr/bin/env ruby
#
# Usage:
# DAS XML for http://togogenome.org/das/645657/features?segment=NC_017196.1:1,100000 can be generated by
# % ruby sparql2das.rb 645657 features segment=NC_017196.1:1,100000
# With the ruby -d option, you'll see debug messages and pretty-formatted XML (the <?xml ...?> declaration is omitted, though)
#
# TODO:
# Wrap as a Rack app and add X-DAS-* HTTP headers
#
# References:
# http://www.biodas.org/documents/spec-1.6.html
#
require "rubygems"
require "net/http"
require "uri"
require "cgi"
require "json" # gem install json
#require "active_support/core_ext" # gem install activesupport
require "rexml/document"
# Minimal SPARQL client that sends queries to an endpoint with HTTP GET.
#
# The endpoint URL is split into scheme/host/port/path plus optional
# basic-auth credentials. Every query is prefixed with the PREFIX
# declarations currently held in +prefix_hash+.
class SPARQL
  # Hash of prefix label => namespace URI, rendered by #prefix.
  attr_reader :prefix_hash

  # Characters that must be backslash-escaped inside a single-quoted
  # SPARQL string literal, mapped to their escaped form.
  LITERAL_ESCAPES = {
    "\\" => "\\\\",
    "'"  => "\\'",
    "\n" => "\\n",
    "\r" => "\\r",
  }

  # url:: endpoint URL; a "user:password@" part enables HTTP basic auth.
  def initialize(url)
    @endpoint = url
    uri = URI.parse(url)
    @scheme = uri.scheme
    @host = uri.host
    @port = uri.port
    @path = uri.path
    @user = uri.user
    @pass = uri.password
    @prefix_hash = {}
  end

  # Returns the endpoint URL exactly as it was passed to the constructor
  # (the full URL, not just the host name).
  def host
    @endpoint
  end

  # Renders +prefix_hash+ as "PREFIX label: <uri>" lines, sorted by label.
  def prefix
    @prefix_hash.sort.map { |key, value| "PREFIX #{key}: <#{value}>\n" }.join
  end

  # Sends +sparql+ (with the PREFIX lines prepended) to the endpoint.
  #
  # opts[:format]:: "xml" or "json" to get the raw response body; when
  #                 omitted, JSON results are requested and converted to
  #                 tab-separated text.
  #
  # With a format and a block, the raw body is yielded and "" is returned.
  # With a format and no block, the raw body is returned.
  # Without a format, the tabular text is yielded (if a block is given) or
  # returned.
  #
  # The read timeout can be set in seconds via ENV['SPARQL_TIMEOUT'].
  def query(sparql, opts={}, &block)
    result = ""
    format =
      case opts[:format]
      when "xml"
        "application/sparql-results+xml"
      else # "json", and the tabular text mode which is built from JSON
        "application/sparql-results+json"
      end

    # Enable TLS for https:// endpoints; without it the request would be
    # sent in plain HTTP to the TLS port and fail.
    Net::HTTP.start(@host, @port, :use_ssl => (@scheme == "https")) do |http|
      if timeout = ENV['SPARQL_TIMEOUT']
        http.read_timeout = timeout.to_i
      end
      sparql_qry = prefix + sparql
      sparql_str = CGI.escape(sparql_qry)
      path = "#{@path}?query=#{sparql_str}"
      if $DEBUG
        $stderr.puts "SPARQL_ENDPOINT host: #{@host}, port: #{@port}, path: #{@path}, user: #{@user}, pass: #{@pass}"
        $stderr.puts "SPARQL_TIMEOUT timeout: #{http.read_timeout} seconds"
        $stderr.puts sparql_qry
        $stderr.puts path
      end
      req = Net::HTTP::Get.new(path, {"Accept" => format})
      req.basic_auth(@user, @pass) if @user and @pass
      http.request(req) do |res|
        if block and opts[:format] # xml or json
          yield res.body
        else # tabular text
          result += res.body
        end
      end
    end

    return result if opts[:format] # xml or json

    # generate tabular text
    $stderr.puts result if $DEBUG
    table = format_json(result)
    if block
      yield table
    else
      table
    end
  end

  # Lists all triples of every subject that has +keyword+ as the literal
  # value of any property. The keyword is escaped so that quotes,
  # backslashes and line breaks cannot terminate the string literal.
  def find(keyword, opts={}, &block)
    literal = keyword.to_s.gsub(/[\\'\n\r]/) { |char| LITERAL_ESCAPES[char] }
    sparql = "select ?s ?p ?o where { ?s ?t '#{literal}'. ?s ?p ?o . }"
    query(sparql, opts, &block)
  end

  # Fetches a page of arbitrary triples.
  #
  # opts[:limit]::  number of rows (default 20)
  # opts[:offset]:: rows to skip (default 1)
  # NOTE(review): the default offset of 1 skips the first row - confirm
  # that this is intended.
  def head(opts={}, &block)
    limit = (opts[:limit] || 20).to_i
    offset = (opts[:offset] || 1).to_i
    sparql = "select ?s ?p ?o where { ?s ?p ?o . } offset #{offset} limit #{limit}"
    query(sparql, opts, &block)
  end

  # Replaces +prefix_hash+ with a built-in set of common namespaces and
  # returns the new hash.
  def prefix_default
    @prefix_hash = {
      "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
      "rdfs" => "http://www.w3.org/2000/01/rdf-schema#",
      "owl" => "http://www.w3.org/2002/07/owl#",
      "xsd" => "http://www.w3.org/2001/XMLSchema#",
      "pext" => "http://proton.semanticweb.org/protonext#",
      "psys" => "http://proton.semanticweb.org/protonsys#",
      "xhtml" => "http://www.w3.org/1999/xhtml#",
      "dc" => "http://purl.org/dc/elements/1.1/",
      "dcterms" => "http://purl.org/dc/terms/",
      "foaf" => "http://xmlns.com/foaf/0.1/",
      "skos" => "http://www.w3.org/2004/02/skos/core#",
      "void" => "http://rdfs.org/ns/void#",
      "dbpedia" => "http://dbpedia.org/resource/",
      "dbp" => "http://dbpedia.org/property/",
      "dbo" => "http://dbpedia.org/ontology/",
      "yago" => "http://dbpedia.org/class/yago/",
      "fb" => "http://rdf.freebase.com/ns/",
      "sioc" => "http://rdfs.org/sioc/ns#",
      "geo" => "http://www.w3.org/2003/01/geo/wgs84_pos#",
      "geonames" => "http://www.geonames.org/ontology#",
      "bibo" => "http://purl.org/ontology/bibo/",
      "prism" => "http://prismstandard.org/namespaces/basic/2.1/",
    }
  end

  private

  # Converts a SPARQL JSON result document into tab-separated text: one
  # header line with the variable names, then one line per binding.
  # URIs are wrapped in <...> with backslashes removed; in other values
  # the sequence "\/" is replaced by "/". Unbound variables become empty
  # cells. Returns "" when the input cannot be parsed as a result document
  # (deliberate best effort).
  def format_json(json)
    begin
      hash = JSON.parse(json)
      head = hash["head"]["vars"]
      body = hash["results"]["bindings"]
    rescue StandardError
      return ""
    end

    rows = [head]
    body.each do |result|
      rows << head.map do |key|
        data = result[key] || { "type" => '', "value" => '' }
        if data["type"] == "uri"
          '<' + data["value"].gsub('\\', '') + '>'
        else
          data["value"].gsub('\/', '/')
        end
      end
    end
    rows.map { |cells| cells.join("\t") + "\n" }.join
  end
end # class SPARQL
# SPARQL query template used by das_features to list the features located on
# a reference sequence within a coordinate range.
#
# Placeholders that das_features substitutes before the query is sent:
#   {ref}      - sequence identifier, placed inside a double-quoted literal
#   {start}    - lower bound of the requested range
#   {end}      - upper bound of the requested range
#   %SO%       - comparison applied to the feature's rdf:type (e.g. "IN (...)")
#   %FILTER%   - extra FILTER clause on ?parentUniqueID (may be empty)
#
# ?name and ?description come from OPTIONAL patterns, so they can be missing
# from individual result rows.
QUERY = <<"QUERY"
DEFINE sql:select-option "order"
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX faldo: <http://biohackathon.org/resource/faldo#>
PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/sequence#>
SELECT DISTINCT ?start ?end ?strand ?type ?name ?description ?uniqueID ?parentUniqueID
FROM <http://togogenome.org/graph/refseq/>
FROM <http://togogenome.org/graph/so/>
FROM <http://togogenome.org/graph/faldo/>
WHERE
{
{
SELECT ?start ?end ?strand ?type ?uniqueID ?parentUniqueID
WHERE
{
?seq_id ?p "{ref}" .
?uniqueID obo:so_part_of+ ?seq_id .
FILTER ( !(?start > {end} || ?end < {start}) )
?uniqueID faldo:location ?loc .
?loc faldo:begin/faldo:position ?start .
?loc faldo:end/faldo:position ?end .
?loc faldo:begin/rdf:type ?faldo_type FILTER ( ?faldo_type IN (faldo:ForwardStrandPosition, faldo:ReverseStrandPosition, faldo:BothStrandsPosition) ).
BIND ( if(?faldo_type = faldo:ForwardStrandPosition, 1, if(?faldo_type = faldo:ReverseStrandPosition, -1, 0)) as ?strand )
?uniqueID rdf:type ?uniqueID_type FILTER ( ?uniqueID_type %SO% ).
?uniqueID_type rdfs:label ?uniqueID_type_label .
BIND ( str(?uniqueID_type_label) as ?type ) .
?uniqueID obo:so_part_of ?parentUniqueID %FILTER% .
}
}
OPTIONAL { ?uniqueID insdc:locus_tag ?name . }
OPTIONAL { ?uniqueID insdc:product ?description . }
}
ORDER BY ?start
QUERY
# Feature type label (as returned in the query's ?type column) => Sequence
# Ontology accession written to the cvId attribute of the DAS TYPE element.
# Frozen because it is a shared lookup table that is only ever read.
FEATURE_TYPE2SO = {
  'CDS'  => 'SO:0000316',
  'gene' => 'SO:0000704',
  'exon' => 'SO:0000147',
  'tRNA' => 'SO:0000253',
  'rRNA' => 'SO:0000252',
}.freeze

# Numeric strand code (the query's ?strand column) => text written to the DAS
# ORIENTATION element. Frozen for the same reason as FEATURE_TYPE2SO.
FEATURE_STRAND = {
  1  => '+',
  -1 => '-',
  0  => '0',
}.freeze
# Creates an empty REXML document that carries an
# <?xml version='1.0' encoding='UTF-8'?> declaration and returns it.
def xml_prepare
  document = REXML::Document.new
  declaration = REXML::XMLDecl.new('1.0', 'UTF-8')
  document.add(declaration)
  document
end
# Writes +xml+ to standard output.
#
# Normally the whole document is printed as is. In debug mode ($DEBUG, e.g.
# ruby -d) only the root element is printed, pretty-formatted in compact
# mode, so the XML declaration is not part of the output.
def xml_print(xml)
  unless $DEBUG
    puts xml
    return
  end

  pretty = REXML::Formatters::Pretty.new
  pretty.compact = true
  puts pretty.write(xml.root, "")
end
# Prints a DASSEQUENCE document for one segment.
#
# The sequence text is fetched over HTTP from togows.org for the given
# identifier and start..end range, then wrapped in a <sequence> element whose
# id/start/stop attributes echo the arguments. In debug mode the request URL
# and the raw response are also written to standard error.
def das_sequence(seq_id, seq_start, seq_end)
  togows_url = "http://togows.org/entry/nuccore/#{seq_id}/seq/#{seq_start}..#{seq_end}"
  $stderr.puts togows_url if $DEBUG
  residues = Net::HTTP.get(URI.parse(togows_url))
  $stderr.puts residues if $DEBUG

  document = xml_prepare
  root = document.add_element('DASSEQUENCE')
  attributes = {'id' => seq_id, 'start' => seq_start, 'stop' => seq_end}
  # chomp drops the trailing line break of the response body
  root.add_element('sequence', attributes).add_text(residues.chomp)
  xml_print(document)
end
# Escapes +value+ for use inside a double-quoted SPARQL string literal.
def das_sparql_literal(value)
  escapes = {"\\" => "\\\\", '"' => '\\"', "\n" => "\\n", "\r" => "\\r"}
  value.to_s.gsub(/[\\"\n\r]/) { |char| escapes[char] }
end

# Appends one FEATURE element, built from a SPARQL JSON result row, to +seg+.
#
# ?name and ?description are OPTIONAL in QUERY, so their keys can be absent
# from +row+; the label attribute and the NOTE element are only written
# when the corresponding value is present.
def das_add_feature(seg, seq_id, row)
  feature_id = row['uniqueID']['value']
  feature_name = (row['name'] || {})['value']
  feature_type = row['type']['value']
  feature_so = FEATURE_TYPE2SO[feature_type]
  feature_start = row['start']['value']
  feature_end = row['end']['value']
  feature_strand = FEATURE_STRAND[row['strand']['value'].to_i]
  feature_note = (row['description'] || {})['value']
  feature_parent = row['parentUniqueID']['value']

  feature_attrs = {'id' => feature_id}
  feature_attrs['label'] = feature_name if feature_name
  feature = seg.add_element('FEATURE', feature_attrs)
  feature.add_element('TYPE', {'id' => feature_id, 'category' => feature_type, 'reference' => seq_id, 'cvId' => feature_so}).add_text(feature_type)
  feature.add_element('METHOD', {'id' => 'TogoGenome'}).add_text('TogoGenome') # [TODO] to be defined
  feature.add_element('START').add_text(feature_start)
  feature.add_element('END').add_text(feature_end)
  feature.add_element('ORIENTATION').add_text(feature_strand)
  feature.add_element('NOTE').add_text(feature_note) if feature_note
  feature.add_element('PARENT', {'id' => feature_parent})
  feature
end

# Prints a DASGFF document listing the features of one segment.
#
# das_source:: source name used in the GFF href (a taxonomy id in this script)
# seq_id::     reference sequence identifier
# seq_start::  range start; must be a base-10 integer (String or Integer)
# seq_end::    range end; must be a base-10 integer (String or Integer)
#
# The query is sent to ENV['SPARQL_ENDPOINT'] or, when that is not set, to
# the built-in default endpoint.
#
# Raises ArgumentError when seq_start or seq_end is not a base-10 integer.
def das_features(das_source, seq_id, seq_start, seq_end)
  # Validate the bounds before they are spliced into the FILTER expression.
  range_start = Integer(seq_start.to_s, 10).to_s
  range_end = Integer(seq_end.to_s, 10).to_s

  # Block-form sub inserts the replacement verbatim; the string form would
  # interpret backslash sequences in the replacement text.
  sparql = QUERY.sub('{ref}') { das_sparql_literal(seq_id) }
  sparql = sparql.sub('{start}') { range_start }
  sparql = sparql.sub('{end}') { range_end }
  sparql_all = sparql.sub('%SO%', 'IN (obo:SO_0000316, obo:SO_0000147, obo:SO_0000253, obo:SO_0000252)').sub('%FILTER%', 'FILTER (?parentUniqueID != ?seq_id)')
  #sparql_cds = sparql.sub('%SO%', 'IN (obo:SO_0000316, obo:SO_0000147)').sub('%FILTER%', 'FILTER (?uniqueID_type = obo:SO_0000704 || ?parentUniqueID != ?seq_id)')
  #sparql_gene = sparql.sub('%SO%', 'IN (obo:SO_0000704, obo:SO_0000147)').sub('%FILTER%', 'FILTER (?uniqueID_type = obo:SO_0000704 || ?parentUniqueID != ?seq_id)')
  #sparql_trna = sparql.sub('%SO%', '= obo:SO_0000253').sub('%FILTER%', '')
  #sparql_rrna = sparql.sub('%SO%', '= obo:SO_0000252').sub('%FILTER%', '')

  host = ENV['SPARQL_ENDPOINT'] || "http://ep.dbcls.jp/sparql7ssd"
  serv = SPARQL.new(host)
  json = JSON.parse(serv.query(sparql_all, :format => 'json'))
  body = json['results']['bindings']

  xml = xml_prepare
  das = xml.add_element('DASGFF')
  das.add_element('GFF', {'href' => "http://togogenome.org/das/#{das_source}/features?segment=#{seq_id}:#{seq_start},#{seq_end}"})
  seg = das.add_element('SEGMENT', {'id' => seq_id, 'start' => seq_start.to_s, 'stop' => seq_end.to_s})
  body.each { |row| das_add_feature(seg, seq_id, row) }
  xml_print(xml)
end
### DAS Server
# Command line: <source> <command> <arguments>, each with a default so the
# script can be run without any parameters.
das_prefix = "http://togogenome.org/das/" # not referenced elsewhere in this script
das_source = ARGV.shift || "645657" # taxid
das_command = ARGV.shift || "features" # or "sources" etc.
das_arguments = ARGV.shift || "segment=NC_017194.1:1,10000"

# Parsed request arguments. :segment always holds an array (possibly empty)
# of [seq_id, seq_start, seq_end] triples so that the commands below can
# iterate over it even when no segment argument was supplied.
args = {:segment => []}
das_arguments.split(';').each do |das_argument|
  arg, val = das_argument.split('=')
  case arg
  when "segment"
    # Expected form: segment=ID:START,END
    seq_id, seq_region = val.to_s.split(':')
    seq_start, seq_end = seq_region.to_s.split(',')
    if [seq_id, seq_start, seq_end].any? { |part| part.nil? || part.empty? }
      $stderr.puts "Ignoring malformed segment argument: #{das_argument}"
      next
    end
    args[:segment] << [seq_id, seq_start, seq_end]
  end
end

case das_command
when "sources"
  # Not implemented yet
when "entry_points"
  # Not implemented yet
when "sequence"
  args[:segment].each do |seq_id, seq_start, seq_end|
    das_sequence(seq_id, seq_start, seq_end)
  end
when "types"
  # Not implemented yet
when "features"
  args[:segment].each do |seq_id, seq_start, seq_end|
    das_features(das_source, seq_id, seq_start, seq_end)
  end
when "stylesheet"
  # Not implemented yet
when "structure"
  # Not implemented yet
end
@ktym
Copy link
Author

ktym commented Jan 18, 2014

% ruby -d sparql2das.rb 645657 features segment=NC_017196.1:5000,15000

<?xml version='1.0' encoding='UTF-8'?>
<DASGFF>
  <GFF href='http://togogenome.org/das/645657/features?segment=NC_017196.1:5000,15000'/>
  <SEGMENT id='NC_017196.1' start='5000' stop='15000'>
    <FEATURE id='urn:uuid:62520f31-0415-48da-b7ee-c518e4dedb5a' label='BSNT_00006'>
      <TYPE id='urn:uuid:62520f31-0415-48da-b7ee-c518e4dedb5a' category='CDS' reference='NC_017196.1' cvId='SO:0000316'>CDS</TYPE>
      <METHOD id='TogoGenome'>TogoGenome</METHOD>
      <START>4867</START>
      <END>6783</END>
      <ORIENTATION>+</ORIENTATION>
      <NOTE>DNA gyrase subunit B</NOTE>
      <PARENT id='urn:uuid:3d404969-6cb8-44fe-8c06-00e21acdf9c2'/>
    </FEATURE>
    <FEATURE id='urn:uuid:c1942d05-bfe6-4c67-ae13-be6a6a1facf4' label='BSNT_00007'>
      <TYPE id='urn:uuid:c1942d05-bfe6-4c67-ae13-be6a6a1facf4' category='CDS' reference='NC_017196.1' cvId='SO:0000316'>CDS</TYPE>
      <METHOD id='TogoGenome'>TogoGenome</METHOD>
      <START>6780</START>
      <END>6893</END>
      <ORIENTATION>-</ORIENTATION>
      <NOTE>hypothetical protein</NOTE>
      <PARENT id='urn:uuid:cbace443-9248-401e-a149-8c70094c07eb'/>
    </FEATURE>
    <FEATURE id='urn:uuid:9a904ba0-edc6-4dae-af72-7a5b45cb7384' label='BSNT_00008'>
      <TYPE id='urn:uuid:9a904ba0-edc6-4dae-af72-7a5b45cb7384' category='CDS' reference='NC_017196.1' cvId='SO:0000316'>CDS</TYPE>
      <METHOD id='TogoGenome'>TogoGenome</METHOD>
      <START>6994</START>
      <END>9459</END>
      <ORIENTATION>+</ORIENTATION>
      <NOTE>DNA gyrase subunit A</NOTE>
      <PARENT id='urn:uuid:402ce87c-df54-4bf6-b09b-a2b829cbf8e9'/>
    </FEATURE>
    <FEATURE id='urn:uuid:4381af26-72af-4e0a-b77b-4ee82f20e8da' label='BSNT_00009'>
      <TYPE id='urn:uuid:4381af26-72af-4e0a-b77b-4ee82f20e8da' category='rRNA' reference='NC_017196.1' cvId='SO:0000252'>rRNA</TYPE>
      <METHOD id='TogoGenome'>TogoGenome</METHOD>
      <START>9755</START>
      <END>11292</END>
      <ORIENTATION>+</ORIENTATION>
      <NOTE>16S ribosomal RNA</NOTE>
      <PARENT id='urn:uuid:970aa314-b7ac-4eaa-9192-d63c8fc83ee0'/>
    </FEATURE>
    <FEATURE id='urn:uuid:d28f3ae8-ecd6-4de2-95c7-a2c29c0dc76f' label='BSNT_00011'>
      <TYPE id='urn:uuid:d28f3ae8-ecd6-4de2-95c7-a2c29c0dc76f' category='tRNA' reference='NC_017196.1' cvId='SO:0000253'>tRNA</TYPE>
      <METHOD id='TogoGenome'>TogoGenome</METHOD>
      <START>11400</START>
      <END>11476</END>
      <ORIENTATION>+</ORIENTATION>
      <NOTE>tRNA-Ile</NOTE>
      <PARENT id='urn:uuid:c0b18384-004f-49ab-ad95-2f57ac038831'/>
    </FEATURE>
    <FEATURE id='urn:uuid:1dd7b54e-8550-4e0d-b093-afc7f506f6dd' label='BSNT_00013'>
      <TYPE id='urn:uuid:1dd7b54e-8550-4e0d-b093-afc7f506f6dd' category='tRNA' reference='NC_017196.1' cvId='SO:0000253'>tRNA</TYPE>
      <METHOD id='TogoGenome'>TogoGenome</METHOD>
      <START>11488</START>
      <END>11563</END>
      <ORIENTATION>+</ORIENTATION>
      <NOTE>tRNA-Ala</NOTE>
      <PARENT id='urn:uuid:24fef4a7-9db4-4355-b329-1ff8b68c899b'/>
    </FEATURE>
    <FEATURE id='urn:uuid:4e7a5937-101f-4bd5-a716-fba792b64d8e' label='BSNT_00015'>
      <TYPE id='urn:uuid:4e7a5937-101f-4bd5-a716-fba792b64d8e' category='rRNA' reference='NC_017196.1' cvId='SO:0000252'>rRNA</TYPE>
      <METHOD id='TogoGenome'>TogoGenome</METHOD>
      <START>11647</START>
      <END>14572</END>
      <ORIENTATION>+</ORIENTATION>
      <NOTE>23S ribosomal RNA</NOTE>
      <PARENT id='urn:uuid:189da89d-03dc-4e85-81e8-ad6d9d82a336'/>
    </FEATURE>
    <FEATURE id='urn:uuid:731630bb-4b6a-43e4-8dfb-ca14c6dc3a15' label='BSNT_00017'>
      <TYPE id='urn:uuid:731630bb-4b6a-43e4-8dfb-ca14c6dc3a15' category='rRNA' reference='NC_017196.1' cvId='SO:0000252'>rRNA</TYPE>
      <METHOD id='TogoGenome'>TogoGenome</METHOD>
      <START>14630</START>
      <END>14744</END>
      <ORIENTATION>+</ORIENTATION>
      <NOTE>5S ribosomal RNA</NOTE>
      <PARENT id='urn:uuid:e43b912c-1cfd-4a1a-ae3b-0c80fd90c186'/>
    </FEATURE>
    <FEATURE id='urn:uuid:7660b296-b1c5-4a6f-bbbf-5b13d988cdff' label='BSNT_00021'>
      <TYPE id='urn:uuid:7660b296-b1c5-4a6f-bbbf-5b13d988cdff' category='CDS' reference='NC_017196.1' cvId='SO:0000316'>CDS</TYPE>
      <METHOD id='TogoGenome'>TogoGenome</METHOD>
      <START>14783</START>
      <END>15730</END>
      <ORIENTATION>-</ORIENTATION>
      <NOTE>hypothetical protein</NOTE>
      <PARENT id='urn:uuid:d60a1671-4392-44a5-845f-d3e4f7d048e9'/>
    </FEATURE>
  </SEGMENT>
</DASGFF>

@ktym
Copy link
Author

ktym commented Jan 18, 2014

% ruby sparql2das.rb 645657 sequence segment=NC_017196.1:2000,2500

<?xml version='1.0' encoding='UTF-8'?>
<DASSEQUENCE>
  <sequence id='NC_017196.1' start='2000' stop='2500'>
    tttcatccagaaccacgattcccattctgactggtattaaaattgttgcatcagatgatggagtatcctttacagggagtgactcagatatttctattgaatccttcattccaaaagaagaaggagataaagaaatcgtcactattgaacagcccggaagcatcgttttacaggctcgcttttttagtgaaattgtaaaaaaattgccgatggcaactgtagaaattgaagtccaaaatcagtatttgacgattatccgttctggtaaagctgaatttaatctaaacggactggatgctgacgagtatccgcacttgccgcagattgaagagcatcatgcgattcagatcccaactgatttgttaaaaaatctaatcagacaaacagtatttgcagtgtccacctcagaaacacgccctatcttgacaggtgtaaactggaaagtggagcaaagtgaattattatgcactgcaacggatagccaccgtcttgcattaagaa
  </sequence>
</DASSEQUENCE>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment