Created
May 20, 2015 14:25
-
-
Save jstirnaman/1998e27e8b66c73022bd to your computer and use it in GitHub Desktop.
Ruby module for accessing NLM's Eutils for Entrez, specifically Medline, and returning specific details (e.g. grants) from the Medline record.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "csv"
require 'date'
require 'uri'

require 'bio-table'
require 'bio'
require 'rdf'
require "addressable/template"
require 'linkeddata'
require 'httparty'
require 'nokogiri'
module Entrez | |
# Set default email address for Entrez eUtils. | |
DEFAULT_EMAIL = '' | |
DEFAULT_TOOL = 'pubmed' | |
# Runs an Entrez ESearch query and exposes pieces of the parsed XML result.
class Esearch
  attr_accessor :esearch_response

  # Sends the search request and stores the parsed XML response.
  # e.g., esearch.fcgi?db=<database>&term=<query>&usehistory=y
  #
  # Any StandardError raised while fetching or parsing is printed to STDOUT
  # and an empty document is stored instead, so the reader methods below
  # return empty values rather than failing on nil.
  #
  # @param query [String] search term, unescaped
  # @param etool [String] value sent as the `db` request parameter
  def initialize(query, etool = DEFAULT_TOOL)
    @esearch_response = Nokogiri::XML(HTTParty.get(url(query, etool)).body)
  rescue StandardError => e
    STDOUT.puts e
    @esearch_response = Nokogiri::XML::Document.new
  end

  # Builds the ESearch request URL.
  #
  # The term is form-encoded so that reserved characters such as `&`, `=`
  # and `#` cannot break out of the `term` parameter (spaces become `+`).
  #
  # @param q [String] search term, unescaped
  # @param etool [String] value for the `db` parameter, inserted verbatim
  # @return [String] the request URL
  def url(q, etool)
    term = URI.encode_www_form_component(q)
    URI::HTTP.build(
      host: 'eutils.ncbi.nlm.nih.gov',
      path: '/entrez/eutils/esearch.fcgi',
      query: "db=#{etool}&term=#{term}&usehistory=y"
    ).to_s
  end

  # @return [String] text of the WebEnv element(s) in the response
  def entrez_webenv
    esearch_response.css('WebEnv').text
  end

  # @return [String] text of the QueryKey element(s) in the response
  def entrez_querykey
    esearch_response.css('QueryKey').text
  end

  # @return [String] comma-separated text of every Id element in the response
  def uids
    esearch_response.css('Id').map(&:text).join(',')
  end

  # @return [String] text of the top-level eSearchResult/Count element
  def num_results
    esearch_response.xpath('//eSearchResult/Count').text
  end

  # For convenience. Go ahead and fetch the records from uids.
  #
  # @return [Efetch]
  def fetch
    Efetch.new(uids)
  end
end
# Downloads full records from Entrez EFetch and splits them into XmlRecords.
class Efetch
  # The generated `records` reader is replaced by the explicit #records
  # method defined below; the `records=` writer is kept so existing callers
  # do not break.
  attr_accessor :efetch_response, :records

  # Download full records
  # e.g., efetch.fcgi?db=<database>&id=<uid_list>&rettype=<retrieval_type>
  #       &retmode=<retrieval_mode>
  #
  # Parameter values are inserted into the URL verbatim (no escaping).
  # A StandardError raised by the HTTP request is printed to STDOUT and
  # leaves #efetch_response nil.
  #
  # @param id_list [String, Array, nil] comma-separated IDs or a list of IDs
  # @param usehistory ['y', 'n'] when 'y', query_key/webenv are used and
  #   id_list is ignored
  # @param query_key [String, Integer, nil] required when usehistory is 'y'
  # @param webenv [String, nil] required when usehistory is 'y'
  # @param retstart [Integer, String] sent as `retstart`
  #   NOTE(review): Entrez appears to document retstart as zero-based, so the
  #   default of 1 may skip the first record -- confirm before changing.
  # @param retmax [Integer, String] sent as `retmax`
  # @param retmode [String] sent as `retmode`
  # @param etool [String] sent as `db`
  # @raise [ArgumentError] if usehistory is 'y' but query_key or webenv is nil
  def initialize(id_list = nil, usehistory = 'n', query_key = nil, webenv = nil, retstart = 1, retmax = 100, retmode = 'xml', etool = DEFAULT_TOOL)
    search =
      if usehistory == 'y'
        if query_key.nil? || webenv.nil?
          raise ArgumentError, "query_key and webenv are required when usehistory is 'y'"
        end
        # Interpolation (rather than String#+) so the Integer defaults of
        # retstart/retmax do not raise TypeError.
        "usehistory=y&query_key=#{query_key}&WebEnv=#{webenv}&retstart=#{retstart}&retmax=#{retmax}"
      elsif id_list
        "id=#{Array(id_list).join(',')}"
      else
        ''
      end

    begin
      @efetch_response = HTTParty.get(
        "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=#{etool}&retmode=#{retmode}&#{search}"
      )
    rescue StandardError => e
      STDOUT.puts e
    end
  end

  # Parses the response body. Each call parses again and returns a fresh
  # document; a missing response (failed request) yields an empty document.
  #
  # @return [Nokogiri::XML::Document]
  def record_collection
    Nokogiri::XML(efetch_response ? efetch_response.body : '')
  end

  # Array of XmlRecords, one per element child of the document root.
  # Each record is a new node named after the source element, with that
  # element's children moved into it.
  #
  # @return [Array<XmlRecord>] empty when the document has no root
  def records
    doc = record_collection # parse once instead of once per record
    root = doc.root
    return [] unless root

    root.element_children.map { |e| XmlRecord.new(e.name, doc) << e.children }
  end
end
# A single record node with convenience readers for selected fields.
class XmlRecord < Nokogiri::XML::Node
  # Takes the two arguments given to XmlRecord.new and does nothing with them.
  def initialize(p1, p2)
  end

  # Nokogiri's XPath Node queries must begin like './' to match the node's top-level element,
  # PubmedArticle in this case.

  # @return [String] text of the ArticleId whose IdType is 'pubmed'
  def has_pmid
    xpath("./PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']").inner_text
  end

  # Lists the record's grants.
  #
  # @return [Array<Hash>] one hash per Grant element with the keys
  #   "agency" (first whitespace-separated word of Agency, "" if absent),
  #   "id" (GrantID text) and "full_name" ("<agency> #<id>")
  def has_grants
    xpath('./MedlineCitation/Article/GrantList//Grant').map do |g|
      # to_s guards against an empty Agency, where `first` returns nil.
      agency = g.css('Agency').inner_text.split(' ').first.to_s
      id = g.css('GrantID').inner_text
      { "agency" => agency, "id" => id, "full_name" => agency + ' #' + id }
    end
  end

  # Selects the entries of compare_arr that match one of this record's
  # grants. Matching ignores case, spaces and hyphens (and '#' in the
  # compared entries). The caller's array is not modified.
  #
  # @param compare_arr [Array<String>] grant names to look for
  # @return [String] the matching entries, as given, joined with ","
  def includes_grants(compare_arr)
    grant_machine_names = has_grants.map do |g|
      (g["agency"] + g["id"]).strip.downcase.gsub(" ", "").gsub("-", "")
    end

    matches = compare_arr.select do |c|
      c_machine_name = c.strip.downcase.gsub(" ", "").gsub("#", "").gsub("-", "")
      grant_machine_names.include?(c_machine_name)
    end
    matches.join(",")
  end

  # @return [String] text of the ArticleId whose IdType is 'pmc'
  def has_pmcid
    xpath("./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']").inner_text
  end

  # @param pubstatus [String] PubStatus attribute value to look up
  # @return [Date, String, nil] the date for that status, "" when no such
  #   PubMedPubDate exists, or nil when the date parts are invalid
  def has_pubdate(pubstatus = 'pubmed')
    date_node = xpath("./PubmedData/History/PubMedPubDate[@PubStatus='#{pubstatus}']")
    date_node.empty? ? "" : format_date_from_xml(date_node)
  end

  # Reports whether any PublicationType matches
  # "Research Support, N.I.H., Extramural", ignoring periods, commas, case,
  # repeated spaces and word order.
  #
  # @return [String] "true" on a match, otherwise "" -- note that both
  #   values are truthy in Ruby, so compare against "true" explicitly
  def has_nih_support?
    # Reduce a label to its sorted, lower-cased words.
    to_tokens = lambda do |text|
      text.strip.gsub(".", "").gsub(",", "").squeeze(" ").downcase.split(" ").sort
    end
    wanted = to_tokens.call("Research Support, N.I.H., Extramural")

    pub_types = xpath("./MedlineCitation/Article/PublicationTypeList//PublicationType")
    pub_types.any? { |npt| to_tokens.call(npt.inner_text).eql?(wanted) } ? "true" : ""
  end

  # Accepts date_xml_node <Year></Year> <Month></Month> <Day></Day>
  #
  # @return [Date, nil] nil (after printing the error message) when the
  #   parts do not form a valid date
  def format_date_from_xml(date_xml_node)
    Date.new(
      date_xml_node.xpath('./Year').inner_text.to_i,
      date_xml_node.xpath('./Month').inner_text.to_i,
      date_xml_node.xpath('./Day').inner_text.to_i
    )
  rescue ArgumentError => e
    puts e.message
  end
end
end | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment