Skip to content

Instantly share code, notes, and snippets.

@tfuji
Last active August 29, 2015 14:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tfuji/f357c77e63c43092b55f to your computer and use it in GitHub Desktop.
Save tfuji/f357c77e63c43092b55f to your computer and use it in GitHub Desktop.
genome_reports2ttl_v2.rb
#!/usr/bin/env ruby
#
# Convert genome_reports to RDF (Turtle)
# * ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt
# * ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt
#
require 'date'
# Escape sequences for the characters that are escaped inside a double-quoted
# Turtle string literal.
TURTLE_STRING_ESCAPES = {
  '\\' => '\\\\',
  "\t" => '\\t',
  "\n" => '\\n',
  "\r" => '\\r',
  '"' => '\\"'
}.freeze

# Formats a value as a double-quoted Turtle string literal.
#
# Backslash, tab, newline, carriage return and double quote are each escaped
# exactly once; every other character is passed through unchanged. (Escaping
# by hand and then calling +inspect+ escaped those characters twice and added
# Ruby-only escapes such as "\#{".)
#
# @param str [#to_s] value to quote; nil becomes the empty literal ""
# @return [String] the literal, including its surrounding double quotes
def quote(str)
  # Hash-form gsub inserts the mapped text verbatim, so the backslashes in the
  # replacements are not re-read as back-reference syntax.
  escaped = str.to_s.gsub(/[\\\t\n\r"]/, TURTLE_STRING_ESCAPES)
  %("#{escaped}")
end
# Renders a report date column as a Turtle object.
#
# The value is handed to +quote+ unchanged, so it is emitted as a plain string
# literal. An earlier variant, left disabled, mapped '-' to '0001/01/01' and
# produced a typed literal instead:
#   quote(Date.parse(str).strftime("%Y-%m-%d")) + "^^xsd:date"
def resource_date(str)
  quote(str)
end
# Prints the Turtle header: the obo: prefix declaration followed by two blank
# lines.
#
# NOTE(review): output_pv writes predicates with the empty prefix (for example
# ":organism_name"), but no "@prefix :" declaration is printed here -- confirm
# where that prefix is meant to be declared.
def output_prefix
  puts '@prefix obo: <http://purl.obolibrary.org/obo/> .', '', ''
end
# Assembly "Status" column value => obo:SO_* term emitted for it.
#
# NOTE(review): 'Complete' shares its term with 'Scaffold' rather than with the
# chromosome-level statuses -- confirm that this is intended.
STATUS_TO_SO = {
  'Contig' => 'obo:SO_0000149',
  'Gapless Chromosome' => 'obo:SO_0000340',
  'Complete' => 'obo:SO_0000148',
  'Scaffold' => 'obo:SO_0000148',
  'Chromosome' => 'obo:SO_0000340',
  'Chromosome with gaps' => 'obo:SO_0000340'
}.freeze

# Maps a "Status" column value to its obo:SO_* term.
#
# @param str [String] status text exactly as it appears in the report
# @return [String] the prefixed name of the matching term
# @raise [ArgumentError] when the status is not in the table; the message names
#   the offending value (the fallback used to reference undefined names and
#   died with a NameError before reporting anything)
def status2so(str)
  STATUS_TO_SO.fetch(str) { raise ArgumentError, "undefined status: #{str.inspect}" }
end
# Prints the Turtle predicate/object line(s) for one column of a report row.
#
# Every known column is written as a string literal via +quote+ (dates via
# +resource_date+). Identifier columns additionally get an identifiers.org IRI
# link, but only when the column actually holds a value.
#
# @param k [String] column header, e.g. 'TaxID'
# @param v [String, nil] cell value; '-' is the reports' "no value" marker and
#   nil occurs when a row is shorter than the header
# @return [nil]
# @raise [ArgumentError] when +k+ is not a known column
def output_pv(k, v)
  # Links are skipped for nil, blank and '-' so that no IRI ends in an empty or
  # placeholder identifier.
  text = v.to_s.strip
  has_value = !(text.empty? || text == '-')

  case k
  when 'Organism/Name'
    puts "\t:organism_name\t#{quote(v)} ;"
  when 'TaxID'
    puts "\t:tax_id\t#{quote(v)} ;"
    puts "\t:taxon\t<http://identifiers.org/taxonomy/#{v}> ;" if has_value
  when 'BioProject Accession'
    puts "\t:bioproject_accession\t#{quote(v)} ;"
    puts "\t:bioproject\t<http://identifiers.org/bioproject/#{v}> ;" if has_value
  when 'BioProject ID'
    puts "\t:bioproject_id\t#{quote(v)} ;"
  when 'Group'
    puts "\t:group\t#{quote(v)} ;"
  when 'SubGroup'
    puts "\t:subgroup\t#{quote(v)} ;"
  when 'Size (Mb)'
    puts "\t:size\t#{quote(v)} ;"
  when 'GC%'
    puts "\t:gc\t#{quote(v)} ;"
  when 'Assembly Accession'
    puts "\t:assembly_accession\t#{quote(v)} ;"
  when 'Chromosomes'
    puts "\t:chromosomes\t#{quote(v)} ;"
  when 'Organelles'
    puts "\t:organelles\t#{quote(v)} ;"
  when 'Plasmids'
    puts "\t:plasmids\t#{quote(v)} ;"
  when 'WGS'
    puts "\t:wgs\t#{quote(v)} ;"
  when 'Scaffolds'
    puts "\t:scaffolds\t#{quote(v)} ;"
  when 'Genes'
    puts "\t:genes\t#{quote(v)} ;"
  when 'Proteins'
    puts "\t:proteins\t#{quote(v)} ;"
  when 'Release Date'
    puts "\t:release_date\t#{resource_date(v)} ;"
  when 'Modify Date'
    puts "\t:modify_date\t#{resource_date(v)} ;"
  when 'Status'
    puts "\t:status\t#{quote(v)} ;"
    puts "\t:status2so\t#{status2so(v)} ;"
  when 'Center'
    puts "\t:center\t#{quote(v)} ;"
  when 'BioSample Accession'
    puts "\t:biosample_accession\t#{quote(v)} ;"
    puts "\t:biosample\t<http://identifiers.org/biosample/#{v}> ;" if has_value
  when 'Chromosomes/RefSeq'
    puts "\t:chromosomes_refseq\t#{quote(v)} ; #only prokaryotes"
    # The cell is a comma-separated accession list; emit one link per accession.
    ids = has_value ? text.split(',').map(&:strip).reject(&:empty?) : []
    ids.each { |id| puts "\t:chromosome\t<http://identifiers.org/refseq/#{id}> ;" }
  when 'Chromosomes/INSDC'
    puts "\t:chromosomes_insdc\t#{quote(v)} ; #only prokaryotes"
  when 'Plasmids/RefSeq'
    puts "\t:plasmids_refseq\t#{quote(v)} ; #only prokaryotes"
    ids = has_value ? text.split(',').map(&:strip).reject(&:empty?) : []
    ids.each { |id| puts "\t:plasmid\t<http://identifiers.org/refseq/#{id}> ;" }
  when 'Plasmids/INSDC'
    puts "\t:plasmids_insdc\t#{quote(v)} ; #only prokaryotes"
  when 'Reference'
    puts "\t:reference\t#{quote(v)}; #only prokaryotes"
  when 'FTP Path'
    puts "\t:ftp_path\t#{quote(v)}; #only prokaryotes"
  when 'Pubmed ID'
    puts "\t:pubmed_id\t#{quote(v)} ; #only prokaryotes"
  else
    # Fail loudly: an unknown header means the report layout changed and a new
    # `when` branch is needed above.
    raise ArgumentError, "undefined key: #{k.inspect}"
  end
end
# Load every data row of both genome reports into `ary`, one Hash per row keyed
# by column header.
ary = []
%w[GENOME_REPORTS/prokaryotes.txt GENOME_REPORTS/eukaryotes.txt].each do |input_file|
  head = []
  # Stream the file line by line instead of reading it all into memory first.
  File.foreach(input_file).with_index do |line, i|
    if i.zero?
      # The first line is the header; drop its leading '#' and any stray CR to
      # get the bare column names.
      head = line.strip.delete("\r").gsub(/^#/, '').split("\t")
      next
    end

    fields = line.strip.split("\t")
    # A blank line has no fields; turning it into a record would emit a subject
    # with an empty accession and break the per-column output.
    next if fields.empty?

    # Rows shorter than the header get nil for the missing trailing columns.
    ary << head.zip(fields).to_h
  end
end
# {"Contig"=>12234, "Gapless Chromosome"=>2935, "Complete"=>25, "Scaffold"=>11279, "Chromosome"=>533, "Chromosome with gaps"=>339}
# Write the Turtle document to stdout: the prefix header first, then one
# subject block per loaded report row, while counting rows per Status value.
output_prefix
status = Hash.new(0)
ary.each do |record|
  accession = record['BioProject Accession']
  # Subject line; the predicate/object lines printed next each end with ';'
  # and the lone '.' closes the block.
  puts "<http://identifiers.org/bioproject/#{accession}>"
  record.each { |column, value| output_pv(column, value) }
  puts '.'
  status[record['Status']] += 1
end
# Report the per-status row counts on stderr so they stay out of the Turtle.
warn status
# TSV format errors in prokaryotes.txt
#< #Organism/Name TaxID BioProject Accession BioProject ID Group SubGroup Size (Mb) GC% Chromosomes/RefSeq Chromosomes/INSDC Plasmids/RefSeq Plasmids/INSDC WGS Scaffolds Genes Proteins Release Date Modify Date Status Center BioSample Accession Assembly Accession Reference FTP Path Pubmed ID
#> #Organism/Name TaxID BioProject Accession BioProject ID Group SubGroup Size (Mb) GC% Chromosomes/RefSeq Chromosomes/INSDC Plasmids/RefSeq Plasmids/INSDC WGS Scaffolds Genes Proteins Release Date Modify Date Status Center BioSample Accession Assembly Accession Reference FTP Path Pubmed ID
#< "The Federal Goverment Health Institution ""Stavropol Plague Control Reseach Institute"" of the Federal Service for Supervision in the Sphere of Consumer Rights Protection and Human Welfare"
#> The Federal Goverment Health Institution "Stavropol Plague Control Reseach Institute" of the Federal Service for Supervision in the Sphere of Consumer Rights Protection and Human Welfare
#< """National Center for Biotechnology"" RSE"
#> "National Center for Biotechnology" RSE
#< "1Centre ""Bioengineering"" of Russian Academy of Sciences"
#> 1Centre "Bioengineering" of Russian Academy of Sciences
## prokaryotes
#["Organism/Name", "Campylobacter jejuni subsp. jejuni CG8421"]
#["TaxID", "478547"]
#["BioProject Accession", "PRJNA21037"]
#["BioProject ID", "21037"]
#["Group", "Proteobacteria"]
#["SubGroup", "delta/epsilon subdivisions"]
#["Size (Mb)", "1.60894"]
#["GC%", "30.3"]
#["Chromosomes/RefSeq", "-"]
#["Chromosomes/INSDC", "-"]
#["Plasmids/RefSeq", "-"]
#["Plasmids/INSDC", "-"]
#["WGS", "ABGQ01"]
#["Scaffolds", "20"]
#["Genes", "1590"]
#["Proteins", "1512"]
#["Release Date", "2008/09/19"]
#["Modify Date", "2014/01/08"]
#["Status", "Contig"]
#["Center", "Naval Medical Research Center"]
#["BioSample Accession", "SAMN02470701"]
#["Assembly Accession", "GCA_000171795.1"]
#["Reference", "-"]
#["FTP Path", "Campylobacter_jejuni/GCF_000171795"]
#["Pubmed ID", "18809665"]
#
#eukaryotes
#["Organism/Name", "Emiliania huxleyi CCMP1516"]
#["TaxID", "280463"]
#["BioProject Accession", "PRJNA77753"]
#["BioProject ID", "77753"]
#["Group", "Protists"]
#["SubGroup", "Other Protists"]
#["Size (Mb)", "167.676"]
#["GC%", "64.5"]
#["Assembly Accession", "GCA_000372725.1"]
#["Chromosomes", "-"]
#["Organelles", "-"]
#["Plasmids", "-"]
#["WGS", "AHAL01"]
#["Scaffolds", "7795"]
#["Genes", "38549"]
#["Proteins", "38554"]
#["Release Date", "2013/04/19"]
#["Modify Date", "2013/07/08"]
#["Status", "Scaffold"]
#["Center", "JGI"]
#["BioSample Accession", "-"]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment