Skip to content

Instantly share code, notes, and snippets.

@tfuji
Created March 14, 2016 10:59
Show Gist options
  • Save tfuji/4571a5026311836b43cb to your computer and use it in GitHub Desktop.
Save tfuji/4571a5026311836b43cb to your computer and use it in GitHub Desktop.
INSDC structured comment parser
#!/usr/bin/env ruby
require 'rubygems'
require 'bio'
# Parse INSDC structured comment blocks out of a COMMENT string.
#
# A block is delimited by marker lines of the form
#   ##<tagset>-START##
#   ##<tagset>-END##
# and contains "tag name   :: value" lines. Any other non-blank line inside
# a block is treated as a continuation of the most recent tag's value and is
# appended to it, separated by a single space.
#
# comment - String holding the comment text (nil is treated as empty).
#
# Returns a Hash of { tagset_id => { tag_name => tag_value } }. The hash has
# a default block, so reading a missing tagset key creates an empty inner
# hash. "tag :: value" lines that appear outside any START/END block are
# stored under the nil tagset key.
def parse_st_comment(comment)
  st = Hash.new { |h, k| h[k] = {} }
  tagset_id = nil
  tag_name = nil

  comment.to_s.each_line do |raw|
    # chomp drops "\n" as well as "\r\n", so CRLF input does not leak "\r"
    # into tag values.
    line = raw.chomp

    if (match = line.match(/##(.+)-START##/))
      tagset_id = match[1]
      # A new block starts without a current tag, so a stray continuation
      # line cannot be attached to a tag from an earlier block.
      tag_name = nil
    elsif (match = line.match(/##(.+)-END##/))
      # Only close the block when the END marker names the open tagset.
      if tagset_id == match[1]
        tagset_id = nil
        tag_name = nil
      end
    elsif (match = line.match(/^(.*[^ ])\s+:: (.+)/))
      # ".*[^ ]" (rather than ".+[^ ]") also accepts one-character tag names.
      tag_name = match[1]
      st[tagset_id][tag_name] = match[2]
    elsif tagset_id && tag_name
      # Continuation of the previous tag's value; blank lines add nothing.
      extra = line.strip
      next if extra.empty?

      st[tagset_id][tag_name] = "#{st[tagset_id][tag_name]} #{extra}"
    end
  end

  st
end
# Driver: read flat-file entries from ARGF (files named on the command line,
# or standard input) and print one tab-separated row per structured-comment
# tag: "<entry_id>.<version>", tagset id, tag name, tag value.
Bio::FlatFile.auto(ARGF).each do |entry|
  parse_st_comment(entry.comment).each do |tagset, tags|
    tags.each do |tag, value|
      puts "#{entry.entry_id}.#{entry.version}\t#{tagset}\t#{tag}\t#{value}"
    end
  end
  # The original script left commented-out placeholders here for sequence,
  # source, gene and feature parsing; none of them are implemented.
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment