Skip to content

Instantly share code, notes, and snippets.

@jstirnaman
Created May 20, 2015 14:25
Show Gist options
  • Save jstirnaman/1998e27e8b66c73022bd to your computer and use it in GitHub Desktop.
Save jstirnaman/1998e27e8b66c73022bd to your computer and use it in GitHub Desktop.
Ruby module for accessing NLM's Eutils for Entrez, specifically Medline, and returning specific details (e.g. grants) from the Medline record.
require 'bio-table'
require 'bio'
require 'rdf'
require "addressable/template"
require 'linkeddata'
require 'httparty'
require 'nokogiri'
require "csv"
require 'uri'
module Entrez
# Set default email address for Entrez eUtils.
DEFAULT_EMAIL = ''
DEFAULT_TOOL = 'pubmed'
# Runs an ESearch request against NCBI E-utilities and exposes the pieces of
# the XML response that later requests need (history keys, matching IDs,
# result count).
class Esearch
  # Parsed ESearch response. It is an empty document when the request or the
  # parsing failed, so the accessors below stay callable.
  attr_accessor :esearch_response

  # Stores the search results on the Entrez history server,
  # e.g. esearch.fcgi?db=<database>&term=<query>&usehistory=y
  #
  # query - search term, passed to Entrez as the `term` parameter.
  # etool - value of the `db` parameter; defaults to DEFAULT_TOOL.
  #
  # Failures are best-effort: the error is printed to STDOUT and the response
  # falls back to an empty document instead of being left nil.
  def initialize(query, etool = DEFAULT_TOOL)
    @esearch_response = Nokogiri::XML(HTTParty.get(url(query, etool)).body)
  rescue StandardError => e
    STDOUT.puts e
    @esearch_response = Nokogiri::XML::Document.new
  end

  # Builds the ESearch request URL.
  #
  # q     - raw (unescaped) search term.
  # etool - value of the `db` parameter.
  #
  # Returns the URL as a String. The query string is form-encoded so that
  # characters such as `&`, `=` and `+` inside the term cannot corrupt it
  # (URI.escape left them untouched and no longer exists in Ruby 3).
  def url(q, etool)
    query = URI.encode_www_form([['db', etool], ['term', q], ['usehistory', 'y']])
    URI::HTTPS.build(host: 'eutils.ncbi.nlm.nih.gov',
                     path: '/entrez/eutils/esearch.fcgi',
                     query: query).to_s
  end

  # Text of the WebEnv element(s) in the response.
  def entrez_webenv
    esearch_response.css('WebEnv').text
  end

  # Text of the QueryKey element(s) in the response.
  def entrez_querykey
    esearch_response.css('QueryKey').text
  end

  # Returns a String of comma-separated IDs taken from the Id elements.
  def uids
    esearch_response.css('Id').map(&:text).join(',')
  end

  # Text of eSearchResult/Count. This is a String, not an Integer.
  def num_results
    esearch_response.xpath('//eSearchResult/Count').text
  end

  # For convenience: fetch the records for the IDs returned by this search.
  def fetch
    Efetch.new(uids)
  end
end
# Runs an EFetch request against NCBI E-utilities and wraps the returned
# records as XmlRecord objects.
class Efetch
  EFETCH_ENDPOINT = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

  # Raw HTTP response; stays nil when the request failed.
  attr_accessor :efetch_response
  # Kept so that existing `records=` callers do not break. The reader is the
  # explicit #records method below.
  attr_writer :records

  # Downloads full records,
  # e.g. efetch.fcgi?db=<database>&id=<uid_list>&retmode=<retrieval_mode>
  #
  # id_list    - Array of IDs or a comma-separated String; used unless
  #              usehistory is 'y'.
  # usehistory - 'y' to fetch from the history server using query_key and
  #              webenv instead of id_list.
  # query_key  - history query key (required when usehistory is 'y').
  # webenv     - history WebEnv value (required when usehistory is 'y').
  # retstart   - offset of the first record to return (history mode only).
  #              NOTE(review): E-utilities documents retstart as zero-based,
  #              so the default of 1 would skip the first record - confirm.
  # retmax     - maximum number of records to return (history mode only).
  # retmode    - value of the `retmode` parameter.
  # etool      - value of the `db` parameter; defaults to DEFAULT_TOOL.
  #
  # Raises ArgumentError when history mode is requested without query_key or
  # webenv. Request failures are best-effort: the error is printed to STDOUT
  # and efetch_response is left nil.
  def initialize(id_list = nil, usehistory = 'n', query_key = nil, webenv = nil, retstart = 1, retmax = 100, retmode = 'xml', etool = DEFAULT_TOOL)
    request_url = build_url(id_list, usehistory, query_key, webenv, retstart, retmax, retmode, etool)
    begin
      @efetch_response = HTTParty.get(request_url)
    rescue StandardError => e
      STDOUT.puts e
    end
  end

  # Parses the response body into a new XML document on every call.
  # Yields an empty document when there is no response.
  def record_collection
    Nokogiri::XML(efetch_response ? efetch_response.body : '')
  end

  # Returns an Array of XmlRecords, one per element child of the response
  # root. Returns an empty Array when there is no response or no root.
  def records
    return [] if efetch_response.nil?

    # Parse once and reuse the document; parsing inside the loop would
    # re-read the entire response for every record.
    doc = record_collection
    root = doc.root
    return [] if root.nil?

    root.element_children.map { |e| XmlRecord.new(e.name, doc) << e.children }
  end

  private

  # Assembles the form-encoded EFetch URL for the given arguments.
  # Non-String values (such as Integer retstart/retmax) are converted by the
  # encoder, so they no longer raise TypeError.
  def build_url(id_list, usehistory, query_key, webenv, retstart, retmax, retmode, etool)
    params = [['db', etool], ['retmode', retmode]]
    if usehistory == 'y'
      if query_key.nil? || webenv.nil?
        raise ArgumentError, 'query_key and webenv are required when usehistory is "y"'
      end
      params.push(['usehistory', 'y'], ['query_key', query_key], ['WebEnv', webenv],
                  ['retstart', retstart], ['retmax', retmax])
    elsif id_list
      # Array() accepts both an Array of IDs and an already-joined String.
      params << ['id', Array(id_list).join(',')]
    end
    "#{EFETCH_ENDPOINT}?#{URI.encode_www_form(params)}"
  end
end
# A single record from an EFetch response, with convenience readers for the
# fields of interest (PMID, PMCID, grants, publication dates, NIH support).
#
# Nokogiri's XPath Node queries must begin like './' to match the node's
# top-level element, PubmedArticle in this case.
class XmlRecord < Nokogiri::XML::Node
  # Publication type that has_nih_support? looks for.
  NIH_SUPPORT_TYPE = 'Research Support, N.I.H., Extramural'

  # Intentionally empty: accepts and ignores the two arguments passed on
  # construction (callers in this file pass an element name and a document).
  def initialize(p1, p2)
  end

  # Returns the text of the ArticleId whose IdType is 'pubmed'
  # ("" when absent).
  def has_pmid
    xpath("./PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']").inner_text
  end

  # Returns an Array of Hashes, one per Grant element, with the String keys
  # "agency" (first word of the Agency text, "" when the Agency is empty),
  # "id" (GrantID text) and "full_name" ("<agency> #<id>").
  def has_grants
    xpath('./MedlineCitation/Article/GrantList//Grant').map do |grant|
      # .to_s guards against an empty Agency, where split yields no words.
      agency = grant.css('Agency').inner_text.split(' ').first.to_s
      id = grant.css('GrantID').inner_text
      { 'agency' => agency, 'id' => id, 'full_name' => "#{agency} ##{id}" }
    end
  end

  # Filters compare_arr down to the entries that match one of this record's
  # grants, ignoring case, spaces, '#' and '-'.
  #
  # compare_arr - Array of grant name Strings. It is not modified.
  #
  # Returns the matching entries joined with ",".
  def includes_grants(compare_arr)
    record_grants = has_grants.map { |g| grant_machine_name(g['agency'] + g['id']) }
    compare_arr.select { |c| record_grants.include?(grant_machine_name(c)) }.join(',')
  end

  # Returns the text of the ArticleId whose IdType is 'pmc' ("" when absent).
  def has_pmcid
    xpath("./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']").inner_text
  end

  # Returns the History date with the given PubStatus as a Date, "" when no
  # such date element exists, or nil when the element holds an invalid date.
  def has_pubdate(pubstatus = 'pubmed')
    date_node = xpath("./PubmedData/History/PubMedPubDate[@PubStatus='#{pubstatus}']")
    date_node.empty? ? "" : format_date_from_xml(date_node)
  end

  # Returns the String "true" when any PublicationType matches
  # NIH_SUPPORT_TYPE regardless of case, punctuation or word order;
  # otherwise "".
  #
  # Both return values are Strings, and "" is truthy in Ruby, so compare the
  # result against "true" rather than using it directly in a condition.
  def has_nih_support?
    expected = pub_type_words(NIH_SUPPORT_TYPE)
    pub_types = xpath("./MedlineCitation/Article/PublicationTypeList//PublicationType")
    pub_types.any? { |pt| pub_type_words(pt.inner_text) == expected } ? "true" : ""
  end

  # Accepts date_xml_node <Year></Year> <Month></Month> <Day></Day>
  #
  # Returns a Date. When the parts do not form a valid date, the error
  # message is printed and nil is returned.
  def format_date_from_xml(date_xml_node)
    year, month, day = %w[Year Month Day].map do |part|
      date_xml_node.xpath("./#{part}").inner_text.to_i
    end
    Date.new(year, month, day)
  rescue ArgumentError => e
    puts e.message
    nil
  end

  private

  # Canonical form used to compare grant names: trimmed, lower-cased, with
  # spaces, '#' and '-' removed.
  def grant_machine_name(text)
    text.strip.downcase.delete(' #-')
  end

  # Sorted, lower-cased words of a publication type with '.' and ','
  # removed, so that word order and punctuation do not affect comparison.
  def pub_type_words(text)
    text.delete('.,').downcase.split(' ').sort
  end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment