Created
July 25, 2008 19:46
-
-
Save michaelbarton/2501 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'uri'
require 'rubygems'
require 'hpricot'
# Convert a TinySeq element name into a hash key.
#
# Only a leading 'TSeq_' is removed (the original gsub would also have
# stripped the token from the middle of a name); the remainder is
# downcased and symbolized, e.g. 'TSeq_orgname' => :orgname.
#
# @param name [String] the XML element name
# @return [Symbol]
def tseq_key(name)
  name.sub(/\ATSeq_/, '').downcase.to_sym
end

# Collect TinySeq child elements into a hash of field name => text.
#
# @param elements [Enumerable] objects responding to #name and #inner_text
# @return [Hash{Symbol => String}] later elements with the same name
#   overwrite earlier ones
def tseq_fields(elements)
  elements.each_with_object({}) do |elem, fields|
    fields[tseq_key(elem.name)] = elem.inner_text
  end
end

# Format the one-line summary printed for each record.
# NOTE: This is the only part of the code that is dependent on the
# names of the tinyseq values.
#
# @param fields [Hash{Symbol => String}] as built by tseq_fields
# @return [String]
def tseq_summary(fields)
  "(#{fields[:gi]}|#{fields[:accver]}) \"#{fields[:defline]}\" size:#{fields[:length]} #{fields[:orgname]}"
end

# Each command-line argument is treated as a protein identifier to look up.
ARGV.each do |gi|
  # Escape the identifier so characters such as '&' or spaces cannot
  # alter the query string.
  id = URI.encode_www_form_component(gi)
  url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=#{id}&rettype=fasta&retmode=xml"

  # Fetch the URL and pass the contents to Hpricot.
  # URI.open is required because Kernel#open no longer opens URLs on
  # Ruby 3.0+; the block form closes the handle once the body is read.
  doc = URI.open(url) { |io| Hpricot.XML(io.read) }

  # Use Hpricot to navigate to the necessary position in the document.
  # Select only the sub nodes that are elements, as carriage returns are
  # treated as content but are not what we are interested in.
  elements = doc.search("TSeqSet/TSeq/*").select { |x| x.instance_of? Hpricot::Elem }

  fields = tseq_fields(elements)

  # Nothing came back for this identifier: report it on stderr instead
  # of printing a summary line made of empty fields.
  if fields.empty?
    warn "No TinySeq record found for #{gi}"
    next
  end

  # Output the summary to the command line
  puts tseq_summary(fields)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment