# jstirnaman/entrez.rb

## entrez.rb
require 'bio-table'
require 'bio'
require 'rdf'
require "addressable/template"
require 'linkeddata'
require 'httparty'
require 'nokogiri'
require "csv"
require 'uri'

module Entrez
# Default email address for Entrez eUtils. It is left empty here and is not
# referenced by any code visible in this file.
DEFAULT_EMAIL = ''
# Default for the `etool` argument of Esearch and Efetch, which both send it as
# the `db` request parameter.
DEFAULT_TOOL = 'pubmed'

  # Runs an Entrez ESearch request and exposes parts of the XML reply.
  #
  # The request is sent with usehistory=y, e.g.
  # esearch.fcgi?db=<database>&term=<query>&usehistory=y
  class Esearch
    # Parsed ESearch reply (a Nokogiri XML document). When the request or the
    # parsing failed this is an empty document, never nil.
    attr_accessor :esearch_response

    # Performs the search immediately.
    #
    # query - search term, sent as the `term` parameter.
    # etool - value sent as the `db` parameter (defaults to DEFAULT_TOOL).
    #
    # Errors raised while requesting or parsing are printed to STDOUT and not
    # re-raised (best-effort behaviour kept from the original implementation).
    def initialize(query, etool = DEFAULT_TOOL)
      # Remembered so #fetch queries the same database that was searched.
      @etool = etool
      begin
        @esearch_response = Nokogiri::XML(HTTParty.get(url(query, etool)).body)
      rescue StandardError => e
        STDOUT.puts e
      end
      # The fallback sits outside the begin block so it also applies after a
      # rescued failure; otherwise every reader below would be called on nil.
      @esearch_response ||= Nokogiri::XML::Document.new
    end

    # Builds the ESearch request URL.
    #
    # q     - raw (unescaped) search term.
    # etool - value for the `db` parameter.
    #
    # Returns the URL as a String. Values are form-encoded, so characters such
    # as `&`, `=` and `#` inside the term cannot break the query string.
    # NOTE(review): the request uses plain http; confirm whether the service
    # now expects https.
    def url(q, etool)
      params = URI.encode_www_form([['db', etool], ['term', q], ['usehistory', 'y']])
      URI::HTTP.build([nil, "eutils.ncbi.nlm.nih.gov", nil,
                       "/entrez/eutils/esearch.fcgi", params, nil]).to_s
    end

    # Returns the text of the WebEnv element(s) of the reply.
    def entrez_webenv
      esearch_response.css('WebEnv').text
    end

    # Returns the text of the QueryKey element(s) of the reply.
    def entrez_querykey
      esearch_response.css('QueryKey').text
    end

    # Returns the text of every Id element as one comma-separated String.
    def uids
      esearch_response.css('Id').map { |n| n.text }.join(',')
    end

    # Returns the text of //eSearchResult/Count (a String, not an Integer).
    def num_results
      esearch_response.xpath('//eSearchResult/Count').text
    end

    # Convenience: fetches the records for #uids.
    #
    # The positional arguments repeat Efetch's own defaults so that the
    # database used for the search can be passed as the last argument.
    def fetch
      Efetch.new(uids, 'n', nil, nil, 1, 100, 'xml', @etool || DEFAULT_TOOL)
    end
  end

  # Runs an Entrez EFetch request and exposes the returned records.
  #
  # e.g. efetch.fcgi?db=<database>&id=<uid_list>&retmode=<retrieval_mode>
  class Efetch
    # Raw HTTParty response; nil when the request raised.
    attr_accessor :efetch_response
    # Only the writer is generated; the reader is the #records method below.
    attr_writer :records

    # Sends the request immediately.
    #
    # id_list    - a UID, an Array of UIDs, or a comma-separated String.
    # usehistory - 'y' to fetch by query_key/webenv instead of id_list.
    # query_key  - query key (required when usehistory is 'y').
    # webenv     - WebEnv value (required when usehistory is 'y').
    # retstart   - sent as `retstart` in history mode.
    # retmax     - sent as `retmax` in history mode.
    # retmode    - sent as `retmode`.
    # etool      - sent as `db` (defaults to DEFAULT_TOOL).
    #
    # Raises ArgumentError when usehistory is 'y' but query_key or webenv is
    # missing. Errors raised by the HTTP request itself are printed to STDOUT
    # and not re-raised.
    def initialize(id_list = nil, usehistory = 'n', query_key = nil, webenv = nil, retstart = 1, retmax = 100, retmode = 'xml', etool = DEFAULT_TOOL)
      search = ''
      if usehistory == 'y'
        if query_key.nil? || webenv.nil?
          raise ArgumentError, "query_key and webenv are required when usehistory is 'y'"
        end
        # Interpolation (not String#+) so the Integer defaults of retstart and
        # retmax do not raise TypeError.
        search = "usehistory=y&query_key=#{query_key}&WebEnv=#{webenv}&retstart=#{retstart}&retmax=#{retmax}"
      elsif id_list
        # Array() accepts a single id, an Array of ids or a ready-made String.
        search = "id=#{Array(id_list).join(',')}"
      end

      begin
        @efetch_response = HTTParty.get(
          "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=#{etool}&retmode=#{retmode}&#{search}"
        )
      rescue StandardError => e
        STDOUT.puts e
      end
    end

    # Parses the response body and returns a new Nokogiri XML document.
    # A missing response is parsed as an empty body.
    def record_collection
      body = efetch_response ? efetch_response.body : ''
      Nokogiri::XML(body.to_s)
    end

    # Returns an Array of XmlRecord, one per element child of the document
    # root, each holding that element's children. Returns [] when the document
    # has no root.
    def records
      # Parse once; the body used to be re-parsed for every single record.
      doc = record_collection
      root = doc.root
      return [] unless root
      root.element_children.map { |e| XmlRecord.new(e.name, doc) << e.children }
    end
  end

  # A single record node with reader methods for the fields this file needs.
  #
  # All XPath queries start with './' so they are evaluated relative to this
  # node (the record's top-level element, PubmedArticle in this case).
  class XmlRecord < Nokogiri::XML::Node
    # Accepts the two arguments passed to .new and does nothing with them.
    def initialize(p1,p2)
    end

    # Returns the text of the ArticleId with IdType 'pubmed' ("" when absent).
    def has_pmid
      xpath("./PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']").inner_text
    end

    # Returns an Array with one Hash per Grant element, keyed by "agency"
    # (first word of the Agency text), "id" (GrantID text) and "full_name"
    # ("<agency> #<id>"). A grant without Agency text gets "" as its agency
    # instead of raising.
    def has_grants
      xpath('./MedlineCitation/Article/GrantList//Grant').map do |g|
        agency = g.css('Agency').inner_text.split(' ').first.to_s
        id = g.css('GrantID').inner_text
        { "agency" => agency, "id" => id, "full_name" => agency + ' #' + id }
      end
    end

    # Returns, as a comma-separated String, the entries of compare_arr that
    # match one of this record's grants. Matching ignores case, spaces and
    # hyphens (and '#' in the compare_arr entries).
    #
    # compare_arr - Array of grant name Strings. It is not modified.
    def includes_grants(compare_arr)
      grant_machine_names = has_grants.map do |g|
        (g["agency"] + g["id"]).strip.downcase.gsub(/[ -]/, "")
      end
      # select instead of keep_if: the caller's array must stay intact so it
      # can be reused for the next record.
      matches = compare_arr.select do |c|
        grant_machine_names.include?(c.strip.downcase.gsub(/[ #-]/, ""))
      end
      matches.join(",")
    end

    # Returns the text of the ArticleId with IdType 'pmc' ("" when absent).
    def has_pmcid
      xpath("./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']").inner_text
    end

    # Returns the History date with the given PubStatus.
    #
    # Returns "" when no such PubMedPubDate exists; otherwise whatever
    # format_date_from_xml returns (a Date, or nil for an invalid date).
    def has_pubdate(pubstatus = 'pubmed')
      date_node = xpath("./PubmedData/History/PubMedPubDate[@PubStatus='#{pubstatus}']")
      date_node.empty? ? "" : format_date_from_xml(date_node)
    end

    # Returns the String "true" when one of the record's publication types is
    # "Research Support, N.I.H., Extramural", and "" otherwise. The comparison
    # ignores periods, commas, case, repeated spaces and word order.
    def has_nih_support?
      normalize = lambda do |text|
        text.strip.gsub(".", "").gsub(",", "").squeeze(" ").downcase.split(" ").sort
      end
      # Computed once rather than once per publication type.
      wanted = normalize.call("Research Support, N.I.H., Extramural")
      pub_types = xpath("./MedlineCitation/Article/PublicationTypeList//PublicationType")
      pub_types.any? { |pt| normalize.call(pt.inner_text) == wanted } ? "true" : ""
    end

    # Builds a Date from a node containing <Year>, <Month> and <Day> children.
    #
    # Returns the Date. When the values do not form a valid date, the error
    # message is printed and nil is returned.
    def format_date_from_xml(date_xml_node)
      year = date_xml_node.xpath('./Year').inner_text.to_i
      month = date_xml_node.xpath('./Month').inner_text.to_i
      day = date_xml_node.xpath('./Day').inner_text.to_i
      Date.new(year, month, day)
    rescue ArgumentError => e
      puts e.message
    end
  end
end
	require 'bio-table'
	require 'bio'
	require 'rdf'
	require "addressable/template"
	require 'linkeddata'
	require 'httparty'
	require 'nokogiri'
	require "csv"
	require 'uri'

	module Entrez
	# Default email address for Entrez eUtils. It is left empty here and is not
	# referenced by any code visible in this file.
	DEFAULT_EMAIL = ''
	# Default for the `etool` argument of Esearch and Efetch, which both send it as
	# the `db` request parameter.
	DEFAULT_TOOL = 'pubmed'

	# Runs an Entrez ESearch request and exposes parts of the XML reply.
	#
	# The request is sent with usehistory=y, e.g.
	# esearch.fcgi?db=<database>&term=<query>&usehistory=y
	class Esearch
	  # Parsed ESearch reply (a Nokogiri XML document). When the request or the
	  # parsing failed this is an empty document, never nil.
	  attr_accessor :esearch_response

	  # Performs the search immediately.
	  #
	  # query - search term, sent as the `term` parameter.
	  # etool - value sent as the `db` parameter (defaults to DEFAULT_TOOL).
	  #
	  # Errors raised while requesting or parsing are printed to STDOUT and not
	  # re-raised (best-effort behaviour kept from the original implementation).
	  def initialize(query, etool = DEFAULT_TOOL)
	    # Remembered so #fetch queries the same database that was searched.
	    @etool = etool
	    begin
	      @esearch_response = Nokogiri::XML(HTTParty.get(url(query, etool)).body)
	    rescue StandardError => e
	      STDOUT.puts e
	    end
	    # The fallback sits outside the begin block so it also applies after a
	    # rescued failure; otherwise every reader below would be called on nil.
	    @esearch_response ||= Nokogiri::XML::Document.new
	  end

	  # Builds the ESearch request URL.
	  #
	  # q     - raw (unescaped) search term.
	  # etool - value for the `db` parameter.
	  #
	  # Returns the URL as a String. Values are form-encoded, so characters such
	  # as `&`, `=` and `#` inside the term cannot break the query string.
	  # NOTE(review): the request uses plain http; confirm whether the service
	  # now expects https.
	  def url(q, etool)
	    params = URI.encode_www_form([['db', etool], ['term', q], ['usehistory', 'y']])
	    URI::HTTP.build([nil, "eutils.ncbi.nlm.nih.gov", nil,
	                     "/entrez/eutils/esearch.fcgi", params, nil]).to_s
	  end

	  # Returns the text of the WebEnv element(s) of the reply.
	  def entrez_webenv
	    esearch_response.css('WebEnv').text
	  end

	  # Returns the text of the QueryKey element(s) of the reply.
	  def entrez_querykey
	    esearch_response.css('QueryKey').text
	  end

	  # Returns the text of every Id element as one comma-separated String.
	  def uids
	    esearch_response.css('Id').map { |n| n.text }.join(',')
	  end

	  # Returns the text of //eSearchResult/Count (a String, not an Integer).
	  def num_results
	    esearch_response.xpath('//eSearchResult/Count').text
	  end

	  # Convenience: fetches the records for #uids.
	  #
	  # The positional arguments repeat Efetch's own defaults so that the
	  # database used for the search can be passed as the last argument.
	  def fetch
	    Efetch.new(uids, 'n', nil, nil, 1, 100, 'xml', @etool || DEFAULT_TOOL)
	  end
	end

	# Runs an Entrez EFetch request and exposes the returned records.
	#
	# e.g. efetch.fcgi?db=<database>&id=<uid_list>&retmode=<retrieval_mode>
	class Efetch
	  # Raw HTTParty response; nil when the request raised.
	  attr_accessor :efetch_response
	  # Only the writer is generated; the reader is the #records method below.
	  attr_writer :records

	  # Sends the request immediately.
	  #
	  # id_list    - a UID, an Array of UIDs, or a comma-separated String.
	  # usehistory - 'y' to fetch by query_key/webenv instead of id_list.
	  # query_key  - query key (required when usehistory is 'y').
	  # webenv     - WebEnv value (required when usehistory is 'y').
	  # retstart   - sent as `retstart` in history mode.
	  # retmax     - sent as `retmax` in history mode.
	  # retmode    - sent as `retmode`.
	  # etool      - sent as `db` (defaults to DEFAULT_TOOL).
	  #
	  # Raises ArgumentError when usehistory is 'y' but query_key or webenv is
	  # missing. Errors raised by the HTTP request itself are printed to STDOUT
	  # and not re-raised.
	  def initialize(id_list = nil, usehistory = 'n', query_key = nil, webenv = nil, retstart = 1, retmax = 100, retmode = 'xml', etool = DEFAULT_TOOL)
	    search = ''
	    if usehistory == 'y'
	      if query_key.nil? || webenv.nil?
	        raise ArgumentError, "query_key and webenv are required when usehistory is 'y'"
	      end
	      # Interpolation (not String#+) so the Integer defaults of retstart and
	      # retmax do not raise TypeError.
	      search = "usehistory=y&query_key=#{query_key}&WebEnv=#{webenv}&retstart=#{retstart}&retmax=#{retmax}"
	    elsif id_list
	      # Array() accepts a single id, an Array of ids or a ready-made String.
	      search = "id=#{Array(id_list).join(',')}"
	    end

	    begin
	      @efetch_response = HTTParty.get(
	        "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=#{etool}&retmode=#{retmode}&#{search}"
	      )
	    rescue StandardError => e
	      STDOUT.puts e
	    end
	  end

	  # Parses the response body and returns a new Nokogiri XML document.
	  # A missing response is parsed as an empty body.
	  def record_collection
	    body = efetch_response ? efetch_response.body : ''
	    Nokogiri::XML(body.to_s)
	  end

	  # Returns an Array of XmlRecord, one per element child of the document
	  # root, each holding that element's children. Returns [] when the document
	  # has no root.
	  def records
	    # Parse once; the body used to be re-parsed for every single record.
	    doc = record_collection
	    root = doc.root
	    return [] unless root
	    root.element_children.map { |e| XmlRecord.new(e.name, doc) << e.children }
	  end
	end

	# A single record node with reader methods for the fields this file needs.
	#
	# All XPath queries start with './' so they are evaluated relative to this
	# node (the record's top-level element, PubmedArticle in this case).
	class XmlRecord < Nokogiri::XML::Node
	  # Accepts the two arguments passed to .new and does nothing with them.
	  def initialize(p1,p2)
	  end

	  # Returns the text of the ArticleId with IdType 'pubmed' ("" when absent).
	  def has_pmid
	    xpath("./PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']").inner_text
	  end

	  # Returns an Array with one Hash per Grant element, keyed by "agency"
	  # (first word of the Agency text), "id" (GrantID text) and "full_name"
	  # ("<agency> #<id>"). A grant without Agency text gets "" as its agency
	  # instead of raising.
	  def has_grants
	    xpath('./MedlineCitation/Article/GrantList//Grant').map do |g|
	      agency = g.css('Agency').inner_text.split(' ').first.to_s
	      id = g.css('GrantID').inner_text
	      { "agency" => agency, "id" => id, "full_name" => agency + ' #' + id }
	    end
	  end

	  # Returns, as a comma-separated String, the entries of compare_arr that
	  # match one of this record's grants. Matching ignores case, spaces and
	  # hyphens (and '#' in the compare_arr entries).
	  #
	  # compare_arr - Array of grant name Strings. It is not modified.
	  def includes_grants(compare_arr)
	    grant_machine_names = has_grants.map do |g|
	      (g["agency"] + g["id"]).strip.downcase.gsub(/[ -]/, "")
	    end
	    # select instead of keep_if: the caller's array must stay intact so it
	    # can be reused for the next record.
	    matches = compare_arr.select do |c|
	      grant_machine_names.include?(c.strip.downcase.gsub(/[ #-]/, ""))
	    end
	    matches.join(",")
	  end

	  # Returns the text of the ArticleId with IdType 'pmc' ("" when absent).
	  def has_pmcid
	    xpath("./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']").inner_text
	  end

	  # Returns the History date with the given PubStatus.
	  #
	  # Returns "" when no such PubMedPubDate exists; otherwise whatever
	  # format_date_from_xml returns (a Date, or nil for an invalid date).
	  def has_pubdate(pubstatus = 'pubmed')
	    date_node = xpath("./PubmedData/History/PubMedPubDate[@PubStatus='#{pubstatus}']")
	    date_node.empty? ? "" : format_date_from_xml(date_node)
	  end

	  # Returns the String "true" when one of the record's publication types is
	  # "Research Support, N.I.H., Extramural", and "" otherwise. The comparison
	  # ignores periods, commas, case, repeated spaces and word order.
	  def has_nih_support?
	    normalize = lambda do |text|
	      text.strip.gsub(".", "").gsub(",", "").squeeze(" ").downcase.split(" ").sort
	    end
	    # Computed once rather than once per publication type.
	    wanted = normalize.call("Research Support, N.I.H., Extramural")
	    pub_types = xpath("./MedlineCitation/Article/PublicationTypeList//PublicationType")
	    pub_types.any? { |pt| normalize.call(pt.inner_text) == wanted } ? "true" : ""
	  end

	  # Builds a Date from a node containing <Year>, <Month> and <Day> children.
	  #
	  # Returns the Date. When the values do not form a valid date, the error
	  # message is printed and nil is returned.
	  def format_date_from_xml(date_xml_node)
	    year = date_xml_node.xpath('./Year').inner_text.to_i
	    month = date_xml_node.xpath('./Month').inner_text.to_i
	    day = date_xml_node.xpath('./Day').inner_text.to_i
	    Date.new(year, month, day)
	  rescue ArgumentError => e
	    puts e.message
	  end
	end
	end