Skip to content

Instantly share code, notes, and snippets.

@inutano
Created May 12, 2011 09:14
Show Gist options
  • Save inutano/968221 to your computer and use it in GitHub Desktop.
Save inutano/968221 to your computer and use it in GitHub Desktop.
explore SRA
# -*- coding: utf-8 -*-
require "nokogiri"
# Report script: reads saved PubMed result pages from a directory and prints
# (1) every page title, (2) the sorted bracketed journal part of each title,
# and (3) the sorted last four characters found inside the brackets.
#
# NOTE(review): titles presumably look like
# "Article title. [Journal. 2010] - PubMed result" -- confirm against real pages.

# Directory holding the saved PubMed HTML pages.
hako = "/path/to/directory_contains_pubmed_html"

# Skip the URL list file and every entry whose name contains a dot; the dot
# test also drops the "." and ".." entries returned by Dir.entries.
page_names = Dir.entries(hako).reject do |item|
  item.include?("url_list") || item.include?(".")
end

# Collect the <title> text of each page. The block form of File.open closes
# the handle as soon as the page has been parsed.
title_list = page_names.map do |item|
  File.open(File.join(hako, item)) do |io|
    kitanaihtml = Nokogiri::HTML(io)
    (kitanaihtml/"title").inner_text
  end
end

# 1. Titles without the trailing " - PubMed result" marker.
title_list.each do |title|
  puts title.gsub(/( - PubMed result)$/, "")
end

# 2. Text between "[" and "]" of each title; titles without brackets are
#    skipped. The last five characters are cut off before printing
#    (presumably the " YYYY" year suffix -- confirm).
journal_list = title_list.map { |title| title[/\[(.*)\]/, 1] }.compact
journal_list.sort.each do |journal|
  puts journal.gsub(/(.....)$/, "")
end

# 3. Last four characters before the closing "]" (presumably the year).
year_list = title_list.map { |title| title[/\[.*(....)\]/, 1] }.compact
puts year_list.sort
# -*- coding: utf-8 -*-
require "open-uri"
require "json"
require "nokogiri"
require "pp"
# Collects the sample links of every study DRASearch lists for an organism /
# study type combination and saves them as JSON.
#
# Two result pages are requested (the plain query and the same query with
# "&page=2", both with show=100). Every "study?acc=..." link found on them is
# fetched in turn and scanned for "sample?acc=..." links.
#
# kaneganaitoka - organism as used in the query string (e.g. "Mus+musculus")
# nantokasiroya - study type as used in the query string
#                 (e.g. "Transcriptome+Analysis")
#
# Side effect: writes the list to "./<organism>_<study_type>_sample_id.json",
# with every "+" in the two arguments replaced by "_".
#
# Returns an Array of relative link Strings ("sample?acc=...").
# Network and file errors are not rescued here; they propagate to the caller.
def sratohananndattanoka(kaneganaitoka, nantokasiroya) # argument: organism, study_type
  mishima = "http://trace.ddbj.nig.ac.jp/DRASearch/"
  ground_floor = "#{mishima}query?organism=#{kaneganaitoka}&study_type=#{nantokasiroya}&show=100"
  deeper_underground = "#{ground_floor}&page=2"

  # The "?" must be escaped to match literally, and [^"]* keeps the capture
  # inside the href attribute even when several links share one line.
  study_link = /href="(study\?acc=[^"]*)" target/
  sample_link = /href="(sample\?acc=[^"]*)" target/

  full_list = [ground_floor, deeper_underground].map { |url|
    URI(url).read.scan(study_link)
  }.flatten

  # save full sample list as json format file
  sample_id_full_list = full_list.map { |study_url|
    URI("#{mishima}#{study_url}").read.scan(sample_link)
  }.flatten

  n = kaneganaitoka.gsub("+", "_")
  s = nantokasiroya.gsub("+", "_")
  File.open("./#{n}_#{s}_sample_id.json", "w") { |f| f.puts JSON.dump(sample_id_full_list) }

  sample_id_full_list
end
# Looks up the Entrez link recorded in the local study XML of the submission
# that a DRASearch sample page points to.
#
# The sample page is fetched, its "submission?acc=..." link is reduced to the
# bare accession, and the file
# "<xml_root>/<first 6 chars of accession>/<accession>/<accession>.study.xml"
# is parsed with Nokogiri.
#
# majide   - relative sample link (e.g. "sample?acc=SRAXXXXXX")
# xml_root - directory holding the downloaded submission XML tree; defaults
#            to the location the script was originally written for
#
# Returns [db, id]: the inner text of the ENTREZ_LINK/DB nodes and of the
# ENTREZ_LINK/ID nodes found in the study XML.
# Returns the String "no data" when anything raises a StandardError (page not
# reachable, invalid link, XML file missing, ...). This lookup is best-effort
# by design, so callers must be prepared for that sentinel.
def mouNCBInantesiruka(majide, xml_root = "/Users/iNut/togofarm/xmldono/Submissions") # argument: sampleid (e.g. "sample?acc=SRAXXXXXX")
  mishima = "http://trace.ddbj.nig.ac.jp/DRASearch/"
  sample_page = URI("#{mishima}#{majide}").read

  # The "?" must be escaped in the regex; in the gsub below the pattern is a
  # plain String, so it is matched literally as written.
  convert = sample_page.scan(/href="(submission\?acc=[^"]*)" target/).flatten.join("")
  sub_id = convert.gsub("submission?acc=", "")
  sub_id_index = sub_id.slice(0, 6)

  xmldono = File.read("#{xml_root}/#{sub_id_index}/#{sub_id}/#{sub_id}.study.xml")
  nakami = Nokogiri::XML(xmldono)
  entrez_link_db = (nakami/"ENTREZ_LINK"/"DB").inner_text
  entrez_link_id = (nakami/"ENTREZ_LINK"/"ID").inner_text
  [entrez_link_db, entrez_link_id]
rescue
  "no data"
end
# Builds PubMed URLs from a list of Entrez link lookups.
#
# ronbunronbun - Array whose entries are either [db, id] pairs of Strings or
#                some non-Array placeholder (such as the String "no data");
#                placeholders are ignored.
#
# For a pair whose db text starts with "pubmed" the first 8 characters of the
# id text are taken as the PMID; otherwise, when the db text ends with
# "pubmed", the last 8 characters are taken. Pairs that mention pubmed in
# neither position, and pairs that yield an empty PMID, contribute nothing.
# NOTE(review): the 8-character cut presumably relies on db/id holding the
# concatenated text of several links and on PMIDs being 8 digits -- confirm.
#
# Returns an Array of unique "http://www.ncbi.nlm.nih.gov/pubmed/<pmid>"
# Strings in first-seen order.
def ronbundasumadegasequencedesu(ronbunronbun)
  pmid_list = []
  ronbunronbun.each do |pair|
    # Skip failure placeholders explicitly instead of indexing into a String.
    next unless pair.is_a?(Array)

    db, ids = pair
    ids = ids.to_s
    pmid =
      if db =~ /^pubmed/
        ids.slice(0, 8)
      elsif db =~ /pubmed$/
        ids.scan(/.*(........)$/).join("")
      end

    # An empty PMID would only produce a bare, useless URL.
    pmid_list.push(pmid) unless pmid.nil? || pmid.empty?
  end
  #pp pmid_list.uniq
  pmid_list.uniq.map { |id| "http://www.ncbi.nlm.nih.gov/pubmed/#{id}" }
end
# Script entry point: gathers the sample links for one organism / study type,
# resolves each to its Entrez link lookup, stores the de-duplicated lookups as
# JSON and prints the PubMed URLs derived from them.
if __FILE__ == $0
  # organism = "Homo+sapiens"
  organism = "Mus+musculus"
  study_type = "Transcriptome+Analysis"

  # "+" is swapped for "_" to build the output file name.
  organism_tag = organism.gsub("+", "_")
  study_tag = study_type.gsub("+", "_")

  # One lookup per sample, duplicates removed.
  entrez_links = sratohananndattanoka(organism, study_type).map { |sample_id|
    mouNCBInantesiruka(sample_id)
  }.uniq

  File.open("#{organism_tag}_#{study_tag}_db_id.json", "w") do |out|
    JSON.dump(entrez_links, out)
  end

  puts ronbundasumadegasequencedesu(entrez_links)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment