Created
January 18, 2014 10:07
-
-
Save ktym/8488467 to your computer and use it in GitHub Desktop.
Generate BioDAS XML from a SPARQL endpoint which stores genome sequences/annotations using FALDO
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# Usage: | |
# DAS XML for http://togogenome.org/das/645657/features?segment=NC_017196.1:1,100000 can be generated by | |
# % ruby sparql2das.rb 645657 features segment=NC_017196.1:1,100000 | |
# With ruby -d option, you'll see debug messages with pretty formatted XML (will miss <?xml> tag though)
# | |
# TODO: | |
# Wrap as a Rack app and add X-DAS-* HTTP headers
# | |
# References: | |
# http://www.biodas.org/documents/spec-1.6.html | |
# | |
require "rubygems" | |
require "net/http" | |
require "uri" | |
require "cgi" | |
require "json" # gem install json | |
#require "active_support/core_ext" # gem install activesupport | |
require "rexml/document" | |
# Minimal SPARQL-over-HTTP client.
#
# Queries are sent to the endpoint with an HTTP GET request.  The result is
# either the raw response body (when a :format option is given) or a
# tab-separated text table built from the JSON result set.
class SPARQL
  # Accept header sent for each value of the :format option.
  ACCEPT_TYPES = {
    "xml"  => "application/sparql-results+xml",
    "json" => "application/sparql-results+json",
  }.freeze

  # Accept header used when :format is absent or unknown; the tabular text
  # output is generated from a JSON response.
  DEFAULT_ACCEPT_TYPE = "application/sparql-results+json"

  # Hash of prefix name => namespace URI, emitted in front of every query.
  attr_reader :prefix_hash

  # url:: endpoint URL.  A user and password embedded in the URL are used for
  #       HTTP basic authentication.
  def initialize(url)
    @endpoint = url
    uri = URI.parse(url)
    @scheme = uri.scheme
    @host = uri.host
    @port = uri.port
    @path = uri.path
    @user = uri.user
    @pass = uri.password
    @prefix_hash = {}
    # NOTE(review): believed to be a no-op on current Rubies; kept so that
    # constructing a client has the same effects as before.
    Net::HTTP.version_1_2
  end

  # Returns the endpoint URL exactly as it was passed to the constructor
  # (the whole URL, not only the host name).
  def host
    @endpoint
  end

  # Returns the PREFIX declarations for all entries of #prefix_hash, sorted
  # by prefix name, one declaration per line.
  def prefix
    @prefix_hash.sort.map { |key, value| "PREFIX #{key}: <#{value}>\n" }.join
  end

  # Runs a query against the endpoint.
  #
  # sparql:: query text; the declarations from #prefix are prepended
  # opts::   :format => "xml" or "json" asks for the raw response body in
  #          that format; without :format a tab-separated table is produced
  #
  # With :format and a block, the raw body is yielded and "" is returned.
  # With :format and no block, the raw body is returned.
  # Without :format, the table is yielded (block given) or returned.
  #
  # The read timeout can be set in seconds with ENV['SPARQL_TIMEOUT'].
  def query(sparql, opts = {}, &block)
    result = ""
    accept = ACCEPT_TYPES.fetch(opts[:format], DEFAULT_ACCEPT_TYPE)

    # Enable TLS for https endpoints; a plain connection to an https port
    # cannot complete the request.
    Net::HTTP.start(@host, @port, :use_ssl => (@scheme == "https")) do |http|
      if (timeout = ENV['SPARQL_TIMEOUT'])
        http.read_timeout = timeout.to_i
      end

      sparql_qry = prefix + sparql
      path = "#{@path}?query=#{CGI.escape(sparql_qry)}"

      if $DEBUG
        # The password is masked so that credentials do not end up in logs.
        masked = @pass ? "********" : ""
        $stderr.puts "SPARQL_ENDPOINT host: #{@host}, port: #{@port}, path: #{@path}, user: #{@user}, pass: #{masked}"
        $stderr.puts "SPARQL_TIMEOUT timeout: #{http.read_timeout} seconds"
        $stderr.puts sparql_qry
        $stderr.puts path
      end

      req = Net::HTTP::Get.new(path, {"Accept" => accept})
      req.basic_auth(@user, @pass) if @user && @pass

      http.request(req) do |res|
        if block && opts[:format] # xml or json
          yield res.body
        else # collected for the caller / tabular text
          result = res.body.to_s
        end
      end
    end

    return result if opts[:format] # xml or json

    # generate tabular text
    $stderr.puts result if $DEBUG
    table = format_json(result)
    return table unless block
    yield table
  end

  # Fetches all triples of every subject that has +keyword+ as an object
  # literal.  Options and block are passed on to #query.
  def find(keyword, opts = {}, &block)
    # Escape backslashes and single quotes so that the keyword cannot
    # terminate the single-quoted SPARQL literal it is placed in.
    literal = keyword.to_s.gsub(/[\\']/) { |ch| "\\#{ch}" }
    sparql = "select ?s ?p ?o where { ?s ?t '#{literal}'. ?s ?p ?o . }"
    query(sparql, opts, &block)
  end

  # Fetches a slice of the triples in the store.
  #
  # opts:: :limit (default 20) and :offset (default 1); the remaining
  #        options and the block are passed on to #query.
  #
  # NOTE(review): the default offset of 1 skips the first solution; it is
  # kept as is because callers may rely on it.
  def head(opts = {}, &block)
    limit = (opts[:limit] || 20).to_i
    offset = (opts[:offset] || 1).to_i
    sparql = "select ?s ?p ?o where { ?s ?p ?o . } offset #{offset} limit #{limit}"
    query(sparql, opts, &block)
  end

  # Replaces #prefix_hash with a set of commonly used namespaces and returns
  # the new hash.
  def prefix_default
    @prefix_hash = {
      "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
      "rdfs" => "http://www.w3.org/2000/01/rdf-schema#",
      "owl" => "http://www.w3.org/2002/07/owl#",
      "xsd" => "http://www.w3.org/2001/XMLSchema#",
      "pext" => "http://proton.semanticweb.org/protonext#",
      "psys" => "http://proton.semanticweb.org/protonsys#",
      "xhtml" => "http://www.w3.org/1999/xhtml#",
      "dc" => "http://purl.org/dc/elements/1.1/",
      "dcterms" => "http://purl.org/dc/terms/",
      "foaf" => "http://xmlns.com/foaf/0.1/",
      "skos" => "http://www.w3.org/2004/02/skos/core#",
      "void" => "http://rdfs.org/ns/void#",
      "dbpedia" => "http://dbpedia.org/resource/",
      "dbp" => "http://dbpedia.org/property/",
      "dbo" => "http://dbpedia.org/ontology/",
      "yago" => "http://dbpedia.org/class/yago/",
      "fb" => "http://rdf.freebase.com/ns/",
      "sioc" => "http://rdfs.org/sioc/ns#",
      "geo" => "http://www.w3.org/2003/01/geo/wgs84_pos#",
      "geonames" => "http://www.geonames.org/ontology#",
      "bibo" => "http://purl.org/ontology/bibo/",
      "prism" => "http://prismstandard.org/namespaces/basic/2.1/",
    }
  end

  private

  # Converts a SPARQL JSON result document into tab-separated text: a header
  # line with the variable names followed by one line per solution.  URIs
  # are wrapped in <>, unbound variables become empty cells.
  #
  # Returns "" when the input is not a usable result document.
  def format_json(json)
    begin
      parsed = JSON.parse(json)
      vars = parsed["head"]["vars"]
      rows = parsed["results"]["bindings"]
    rescue StandardError
      return ""
    end
    # A document without variable or binding lists is treated like any
    # other unusable response instead of raising NoMethodError below.
    return "" unless vars.is_a?(Array) && rows.is_a?(Array)

    lines = [vars.join("\t")]
    rows.each do |row|
      cells = vars.map do |var|
        cell = row[var] || { "type" => '', "value" => '' }
        if cell["type"] == "uri"
          # drop backslashes left over from escaped URIs
          '<' + cell["value"].gsub('\\', '') + '>'
        else
          # undo escaped slashes
          cell["value"].gsub('\/', '/')
        end
      end
      lines << cells.join("\t")
    end
    lines.map { |line| line + "\n" }.join
  end
end # class SPARQL
# SPARQL template used to fetch the features of one sequence region.
#
# Placeholders filled in by the caller:
#   {ref}             literal identifying the reference sequence
#   {start} / {end}   bounds of the requested region
#   %SO%              condition applied to the feature's rdf:type
#   %FILTER%          optional extra FILTER on the parent feature
#
# The heredoc is non-interpolating so that "#" in the namespace URIs can
# never be taken for Ruby interpolation, and frozen because it is a shared
# constant (callers derive queries with the non-destructive String#sub).
QUERY = <<'QUERY'.freeze
DEFINE sql:select-option "order"
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX faldo: <http://biohackathon.org/resource/faldo#>
PREFIX insdc: <http://ddbj.nig.ac.jp/ontologies/sequence#>
SELECT DISTINCT ?start ?end ?strand ?type ?name ?description ?uniqueID ?parentUniqueID
FROM <http://togogenome.org/graph/refseq/>
FROM <http://togogenome.org/graph/so/>
FROM <http://togogenome.org/graph/faldo/>
WHERE
{
{
SELECT ?start ?end ?strand ?type ?uniqueID ?parentUniqueID
WHERE
{
?seq_id ?p "{ref}" .
?uniqueID obo:so_part_of+ ?seq_id .
FILTER ( !(?start > {end} || ?end < {start}) )
?uniqueID faldo:location ?loc .
?loc faldo:begin/faldo:position ?start .
?loc faldo:end/faldo:position ?end .
?loc faldo:begin/rdf:type ?faldo_type FILTER ( ?faldo_type IN (faldo:ForwardStrandPosition, faldo:ReverseStrandPosition, faldo:BothStrandsPosition) ).
BIND ( if(?faldo_type = faldo:ForwardStrandPosition, 1, if(?faldo_type = faldo:ReverseStrandPosition, -1, 0)) as ?strand )
?uniqueID rdf:type ?uniqueID_type FILTER ( ?uniqueID_type %SO% ).
?uniqueID_type rdfs:label ?uniqueID_type_label .
BIND ( str(?uniqueID_type_label) as ?type ) .
?uniqueID obo:so_part_of ?parentUniqueID %FILTER% .
}
}
OPTIONAL { ?uniqueID insdc:locus_tag ?name . }
OPTIONAL { ?uniqueID insdc:product ?description . }
}
ORDER BY ?start
QUERY
# Maps a feature type label, as returned in the ?type column of QUERY, to
# its Sequence Ontology accession.  Frozen: it is a shared lookup table.
FEATURE_TYPE2SO = {
  'CDS'  => 'SO:0000316',
  'gene' => 'SO:0000704',
  'exon' => 'SO:0000147',
  'tRNA' => 'SO:0000253',
  'rRNA' => 'SO:0000252',
}.freeze
# Maps the numeric ?strand value computed by QUERY (1 forward, -1 reverse,
# 0 both strands) to the text written into the ORIENTATION element.
# Frozen: it is a shared lookup table.
FEATURE_STRAND = {
  1  => '+',
  -1 => '-',
  0  => '0',
}.freeze
# Creates an empty XML document that starts with the declaration
# <?xml version='1.0' encoding='UTF-8'?> and returns the document.
def xml_prepare
  xml = REXML::Document.new
  xml << REXML::XMLDecl.new('1.0', 'UTF-8')
  # Return the document explicitly rather than whatever Document#<< returns.
  xml
end
# Writes an XML document to standard output.
#
# xml:: REXML::Document to print
#
# In debug mode (ruby -d) only the root element is printed, pretty
# formatted, so the <?xml ...?> declaration is not part of the output.
# Otherwise the whole document is printed as is.
def xml_print(xml)
  unless $DEBUG
    puts xml
    return
  end

  formatter = REXML::Formatters::Pretty.new
  formatter.compact = true
  # Collect the formatted text in our own buffer instead of depending on
  # the return value of Formatter#write.
  buffer = "".dup
  formatter.write(xml.root, buffer)
  puts buffer
end
# Prints a DAS "sequence" response (DASSEQUENCE XML) for one segment.
#
# The sequence text is downloaded from togows.org and placed, without its
# trailing newline, inside a <sequence> element.
#
# seq_id::    nucleotide entry identifier
# seq_start:: first position of the requested range
# seq_end::   last position of the requested range
#
# NOTE(review): the HTTP status is not checked, so the body of an error
# response would be emitted as the sequence - confirm whether that matters.
def das_sequence(seq_id, seq_start, seq_end)
  url = "http://togows.org/entry/nuccore/#{seq_id}/seq/#{seq_start}..#{seq_end}"
  $stderr.puts url if $DEBUG

  seq = Net::HTTP.get(URI.parse(url))
  $stderr.puts seq if $DEBUG

  xml = xml_prepare
  root = xml.add_element('DASSEQUENCE')
  sequence = root.add_element('sequence', {'id' => seq_id, 'start' => seq_start, 'stop' => seq_end})
  sequence.add_text(seq.chomp)
  xml_print(xml)
end
# Prints a DAS "features" response (DASGFF XML) for one segment.
#
# The features are read from the SPARQL endpoint named by
# ENV['SPARQL_ENDPOINT'] (default http://ep.dbcls.jp/sparql7ssd) with the
# QUERY template.
#
# das_source:: DAS source name (taxonomy ID); only used in the GFF href
# seq_id::     reference sequence identifier substituted for {ref}
# seq_start::  first position of the requested range (String or Integer)
# seq_end::    last position of the requested range (String or Integer)
#
# Raises ArgumentError when seq_start or seq_end is not a decimal integer.
def das_features(das_source, seq_id, seq_start, seq_end)
  # Values are validated / escaped before they are spliced into the query:
  # the positions must be integers and the identifier must not be able to
  # close the double-quoted literal it is placed in.
  ref = seq_id.to_s.gsub(/[\\"]/) { |ch| "\\#{ch}" }
  range_start = Integer(seq_start.to_s, 10).to_s
  range_end = Integer(seq_end.to_s, 10).to_s

  # Block form of String#sub inserts the value verbatim; a replacement
  # string argument would interpret backslash sequences.
  sparql = QUERY.sub('{ref}') { ref }.sub('{start}') { range_start }.sub('{end}') { range_end }
  sparql_all = sparql.sub('%SO%', 'IN (obo:SO_0000316, obo:SO_0000147, obo:SO_0000253, obo:SO_0000252)').sub('%FILTER%', 'FILTER (?parentUniqueID != ?seq_id)')
  #sparql_cds = sparql.sub('%SO%', 'IN (obo:SO_0000316, obo:SO_0000147)').sub('%FILTER%', 'FILTER (?uniqueID_type = obo:SO_0000704 || ?parentUniqueID != ?seq_id)')
  #sparql_gene = sparql.sub('%SO%', 'IN (obo:SO_0000704, obo:SO_0000147)').sub('%FILTER%', 'FILTER (?uniqueID_type = obo:SO_0000704 || ?parentUniqueID != ?seq_id)')
  #sparql_trna = sparql.sub('%SO%', '= obo:SO_0000253').sub('%FILTER%', '')
  #sparql_rrna = sparql.sub('%SO%', '= obo:SO_0000252').sub('%FILTER%', '')

  host = ENV['SPARQL_ENDPOINT'] || "http://ep.dbcls.jp/sparql7ssd"
  serv = SPARQL.new(host)
  json = JSON.parse(serv.query(sparql_all, :format => 'json'))
  body = json['results']['bindings']

  xml = xml_prepare
  das = xml.add_element('DASGFF')
  gff = das.add_element('GFF', {'href' => "http://togogenome.org/das/#{das_source}/features?segment=#{seq_id}:#{seq_start},#{seq_end}"})
  # SEGMENT is a child of GFF in the DAS features response.
  seg = gff.add_element('SEGMENT', {'id' => seq_id.to_s, 'start' => seq_start.to_s, 'stop' => seq_end.to_s})

  body.each do |row|
    # Variables that are unbound in a solution (?name and ?description come
    # from OPTIONAL patterns) are absent from the row, so look them up
    # without assuming the key exists.
    value = lambda { |key| (row[key] || {})['value'] }

    feature_id     = value.call('uniqueID')
    feature_name   = value.call('name')
    feature_type   = value.call('type')
    feature_so     = FEATURE_TYPE2SO[feature_type]
    feature_start  = value.call('start')
    feature_end    = value.call('end')
    feature_strand = FEATURE_STRAND[value.call('strand').to_i]
    feature_note   = value.call('description')
    feature_parent = value.call('parentUniqueID')

    # REXML leaves out attributes whose value is nil, so a feature without
    # a locus tag simply has no label and an unmapped type has no cvId.
    feature = seg.add_element('FEATURE', {'id' => feature_id, 'label' => feature_name})
    # NOTE(review): TYPE id carries the feature ID here - confirm whether a
    # type identifier was intended.
    feature.add_element('TYPE', {'id' => feature_id, 'category' => feature_type, 'reference' => seq_id.to_s, 'cvId' => feature_so}).add_text(feature_type)
    feature.add_element('METHOD', {'id' => 'TogoGenome'}).add_text('TogoGenome') # [TODO] to be defined
    feature.add_element('START').add_text(feature_start)
    feature.add_element('END').add_text(feature_end)
    feature.add_element('ORIENTATION').add_text(feature_strand)
    feature.add_element('NOTE').add_text(feature_note)
    feature.add_element('PARENT', {'id' => feature_parent})
  end

  xml_print(xml)
end
### DAS Server
#
# Command line: <source> <command> <arguments>
#   source    - taxonomy ID used as the DAS source name
#   command   - DAS command; only "sequence" and "features" are implemented
#   arguments - "key=value" pairs separated by ";"; only
#               segment=<id>:<start>,<stop> is interpreted
das_source    = ARGV.shift || "645657"   # taxid
das_command   = ARGV.shift || "features" # or "sources" etc.
das_arguments = ARGV.shift || "segment=NC_017194.1:1,10000"

# Start with an empty segment list so that commands iterating over it do
# not fail when no segment argument was supplied.
args = { :segment => [] }

das_arguments.split(';').each do |das_argument|
  arg, val = das_argument.split('=', 2)
  case arg
  when "segment"
    seq_id, seq_region = val.to_s.split(':', 2)
    seq_start, seq_end = seq_region.to_s.split(',', 2)
    # Both handlers need an explicit range; skip anything else instead of
    # failing later on a nil value.
    if seq_id.nil? || seq_id.empty? || seq_start.nil? || seq_end.nil?
      $stderr.puts "Ignoring malformed segment argument: #{das_argument}"
      next
    end
    args[:segment] << [seq_id, seq_start, seq_end]
  end
end

case das_command
when "sources"
  # Not implemented yet
when "entry_points"
  # Not implemented yet
when "sequence"
  args[:segment].each do |seq_id, seq_start, seq_end|
    das_sequence(seq_id, seq_start, seq_end)
  end
when "types"
  # Not implemented yet
when "features"
  args[:segment].each do |seq_id, seq_start, seq_end|
    das_features(das_source, seq_id, seq_start, seq_end)
  end
when "stylesheet"
  # Not implemented yet
when "structure"
  # Not implemented yet
end
% ruby sparql2das.rb 645657 sequence segment=NC_017196.1:2000,2500
<?xml version='1.0' encoding='UTF-8'?>
<DASSEQUENCE>
<sequence id='NC_017196.1' start='2000' stop='2500'>
tttcatccagaaccacgattcccattctgactggtattaaaattgttgcatcagatgatggagtatcctttacagggagtgactcagatatttctattgaatccttcattccaaaagaagaaggagataaagaaatcgtcactattgaacagcccggaagcatcgttttacaggctcgcttttttagtgaaattgtaaaaaaattgccgatggcaactgtagaaattgaagtccaaaatcagtatttgacgattatccgttctggtaaagctgaatttaatctaaacggactggatgctgacgagtatccgcacttgccgcagattgaagagcatcatgcgattcagatcccaactgatttgttaaaaaatctaatcagacaaacagtatttgcagtgtccacctcagaaacacgccctatcttgacaggtgtaaactggaaagtggagcaaagtgaattattatgcactgcaacggatagccaccgtcttgcattaagaa
</sequence>
</DASSEQUENCE>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
% ruby -d sparql2das.rb 645657 features segment=NC_017196.1:5000,15000