Last active
August 29, 2015 14:09
-
-
Save ktnyt/a0053d5699d194edd3a9 to your computer and use it in GitHub Desktop.
mzTab Annotator - picks up "opt_global_kegg" column to automatically annotate SML lines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
############################################################################### | |
# | |
# Modules | |
# | |
use strict; | |
use warnings; | |
use utf8; | |
use POSIX; | |
use Encode; | |
use Storable; | |
use XML::Simple; | |
use JSON; | |
use LWP::Simple; | |
use LWP::UserAgent; | |
use URI::Escape; | |
use HTTP::Request::Common qw(POST); | |
use Mozilla::CA; | |
############################################################################### | |
############################################################################### | |
# | |
# Parse input | |
# | |
my %opts = ( | |
force => 0, | |
help => 0, | |
input => "", | |
output => "", | |
verbose => 0 | |
); | |
for(my $i = 0; $i < scalar(@ARGV); ++$i) { | |
if((my $name = $ARGV[$i]) =~ s/^--?//) { | |
if(exists($opts{$name})) { | |
my @value = (); | |
for(my $j = $i + 1; $j < scalar(@ARGV); ++$i, ++$j) { | |
if($ARGV[$j] =~ /^--?/) { | |
last; | |
} | |
push(@value, $ARGV[$j]); | |
} | |
if(@value) { | |
$opts{$name} = join(" ", @value); | |
} else { | |
$opts{$name} = 1; | |
} | |
} | |
} | |
} | |
if($opts{help} || !length($opts{input})) { | |
print_help(1); | |
} | |
############################################################################### | |
############################################################################### | |
# | |
# Read and parse input mzTab file | |
# | |
open my $ifh, "<", $opts{input} or die $!; | |
my $mztab; | |
while(my $line = <$ifh>) { | |
chomp($line); | |
if($line =~ /^(\w{3})/) { | |
push(@{$mztab->{$1}}, [split(/\t/, $line)]); | |
} | |
} | |
close $ifh; | |
############################################################################### | |
############################################################################### | |
# | |
# Annotation | |
# | |
my $keggcol = get_keggcol(@{$mztab->{SMH}->[0]}); | |
my $inchicol = get_inchicol(@{$mztab->{SMH}->[0]}); | |
my $smilescol = get_smilescol(@{$mztab->{SMH}->[0]}); | |
my $formulacol = get_formulacol(@{$mztab->{SMH}->[0]}); | |
my @kegg = map{$_->[$keggcol]}@{$mztab->{SML}}; | |
my @map = kegg2map_array(@kegg); | |
my @brite = kegg2brite_array(@kegg); | |
my @drug = kegg2drug_array(@kegg); | |
my @disease = kegg2disease_array(@kegg); | |
my @chebi = kegg2chebi_array(@kegg); | |
my @chembl = kegg2chembl_array(@kegg); | |
my @metacyc = kegg2metacyc_array(@kegg); | |
my @uniquec = keggCompoundTraverse("B", @kegg); | |
my @uniquep = keggPathwayTraverse("B", @kegg); | |
my @inchi = chembl2inchi_array(@chembl); | |
my @smiles = chembl2smiles_array(@chembl); | |
my @formula = chembl2formula_array(@chembl); | |
push(@{$mztab->{SMH}->[0]}, "opt_global_chembl"); | |
push(@{$mztab->{SMH}->[0]}, "opt_global_chebi"); | |
push(@{$mztab->{SMH}->[0]}, "opt_global_metacyc"); | |
push(@{$mztab->{SMH}->[0]}, "opt_global_kegg_pathway"); | |
push(@{$mztab->{SMH}->[0]}, "opt_global_kegg_brite"); | |
push(@{$mztab->{SMH}->[0]}, "opt_global_kegg_drug"); | |
push(@{$mztab->{SMH}->[0]}, "opt_global_kegg_disease"); | |
push(@{$mztab->{SMH}->[0]}, "opt_global_kegg_B"); | |
push(@{$mztab->{SMH}->[0]}, "opt_global_kegg_pathway_B"); | |
for(my $i = 0; $i < scalar(@{$mztab->{SML}}); ++$i) { | |
$mztab->{SML}->[$i]->[$inchicol] = $inchi[$i] if $inchicol; | |
$mztab->{SML}->[$i]->[$smilescol] = $smiles[$i] if $smilescol; | |
$mztab->{SML}->[$i]->[$formulacol] = $formula[$i] if $formulacol; | |
push(@{$mztab->{SML}->[$i]}, $chembl[$i]); | |
push(@{$mztab->{SML}->[$i]}, $chebi[$i]); | |
push(@{$mztab->{SML}->[$i]}, $metacyc[$i]); | |
push(@{$mztab->{SML}->[$i]}, $map[$i]); | |
push(@{$mztab->{SML}->[$i]}, $brite[$i]); | |
push(@{$mztab->{SML}->[$i]}, $drug[$i]); | |
push(@{$mztab->{SML}->[$i]}, $disease[$i]); | |
push(@{$mztab->{SML}->[$i]}, $uniquec[$i]); | |
push(@{$mztab->{SML}->[$i]}, $uniquep[$i]); | |
} | |
############################################################################### | |
############################################################################### | |
# | |
# Output in mzTab format | |
# | |
my $ofh = *STDOUT; | |
if(length($opts{output})) { | |
open($ofh, ">", $opts{output}); | |
} | |
if(-e $opts{output}) { | |
if(!$opts{force}) { | |
printf(STDERR "Output file exists. Overwrite? [Y/n]: "); | |
if(<STDIN> !~ /y/i) { | |
printf(STDERR "Exiting...\n"); | |
} | |
} | |
} | |
foreach my $section (qw(MTD SMH SML)) { | |
foreach my $line (@{$mztab->{$section}}) { | |
printf($ofh "%s\n", join("\t", @{$line})); | |
} | |
} | |
if(length($opts{output})) { | |
close $ofh; | |
} | |
############################################################################### | |
############################################################################### | |
# | |
# Subroutines | |
# | |
#################### | |
# Parsing routines # | |
#################### | |
sub get_keggcol { | |
my (@header) = (@_); | |
for(my $i = 0; $i < scalar(@header); ++$i) { | |
if($header[$i] eq "opt_global_kegg") { | |
return $i; | |
} | |
} | |
return 0; | |
} | |
sub get_inchicol { | |
my (@header) = (@_); | |
for(my $i = 0; $i < scalar(@header); ++$i) { | |
if($header[$i] eq "inchi_key") { | |
return $i; | |
} | |
} | |
return 0; | |
} | |
sub get_smilescol { | |
my (@header) = (@_); | |
for(my $i = 0; $i < scalar(@header); ++$i) { | |
if($header[$i] eq "smiles") { | |
return $i; | |
} | |
} | |
return 0; | |
} | |
sub get_formulacol { | |
my (@header) = (@_); | |
for(my $i = 0; $i < scalar(@header); ++$i) { | |
if($header[$i] eq "chemical_formula") { | |
return $i; | |
} | |
} | |
return 0; | |
} | |
####################### | |
# Annotation routines # | |
####################### | |
sub kegg2chembl_table { | |
my %table; | |
printf(STDERR "Creating kegg2chembl conversion table.\n") if $opts{verbose}; | |
my $response = get("http://www.genome.jp/sparql/linkdb?default-graph-uri=http%3A%2F%2Flinkdb&query=BASE%20%3Chttp%3A%2F%2Fwww.genome.jp%2Flinkdb%2F%3E%0ASELECT%20%3Ffromlabel%20%3Ftolabel%20WHERE%20%7B%0A%3Ffrom%20%3Cequivalent%3E%20%3Fto%20.%0A%0A%3Ffrom%20%3Ccore%2Fdatabase%3E%20%3Ffromdb%20.%0A%3Fto%20%20%20%3Ccore%2Fdatabase%3E%20%3Ftodb%20.%0A%0A%3Ffromdb%20%3Ccore%2Fdblabel%3E%20%22compound%22%20.%0A%3Ftodb%20%20%20%3Ccore%2Fdblabel%3E%20%22chembl%22%20.%0A%0A%3Ffrom%20rdfs%3Alabel%20%3Ffromlabel%20.%0A%3Fto%20%20%20rdfs%3Alabel%20%3Ftolabel%20.%0A%7D%20ORDER%20BY%20%3Ffromlabel%20%3Ftolabel%0A&format=text/tab-separated-valuescs&timeout=0&debug=on"); | |
foreach my $line (split(/\n/, $response)) { | |
my ($kegg, $chembl) = split(/\t/, $line); | |
if($kegg =~ s/".+:(.+)"/$1/g && $chembl =~ s/".+:(.+)"/$1/g) { | |
$table{$kegg} = $chembl; | |
} | |
} | |
printf(STDERR "Done.\n") if $opts{verbose}; | |
return %table; | |
} | |
sub kegg2chembl_array { | |
my (@kegg) = (@_); | |
my %table = kegg2chembl_table(); | |
return map{exists($table{$_}) ? $table{$_} : "null"}@kegg; | |
} | |
sub kegg2chebi_table { | |
my %table; | |
printf(STDERR "Creating kegg2chebi conversion table.\n") if $opts{verbose}; | |
my $response = get("http://www.genome.jp/sparql/linkdb?default-graph-uri=http%3A%2F%2Flinkdb&query=BASE%20%3Chttp%3A%2F%2Fwww.genome.jp%2Flinkdb%2F%3E%0ASELECT%20%3Ffromlabel%20%3Ftolabel%20WHERE%20%7B%0A%3Ffrom%20%3Cequivalent%3E%20%3Fto%20.%0A%0A%3Ffrom%20%3Ccore%2Fdatabase%3E%20%3Ffromdb%20.%0A%3Fto%20%20%20%3Ccore%2Fdatabase%3E%20%3Ftodb%20.%0A%0A%3Ffromdb%20%3Ccore%2Fdblabel%3E%20%22compound%22%20.%0A%3Ftodb%20%20%20%3Ccore%2Fdblabel%3E%20%22chebi%22%20.%0A%0A%3Ffrom%20rdfs%3Alabel%20%3Ffromlabel%20.%0A%3Fto%20%20%20rdfs%3Alabel%20%3Ftolabel%20.%0A%7D%20ORDER%20BY%20%3Ffromlabel%20%3Ftolabel%0A&format=text/tab-separated-valuescs&timeout=0&debug=on"); | |
foreach my $line (split(/\n/, $response)) { | |
my ($kegg, $chebi) = split(/\t/, $line); | |
if($kegg =~ s/".+:(.+)"/$1/g && $chebi =~ s/".+:(.+)"/$1/g) { | |
$table{$kegg} = $chebi; | |
} | |
} | |
printf(STDERR "Done.\n") if $opts{verbose}; | |
return %table; | |
} | |
sub kegg2chebi_array { | |
my (@kegg) = (@_); | |
my %table = kegg2chebi_table(); | |
return map{exists($table{$_}) ? $table{$_} : "null"}@kegg; | |
} | |
sub kegg2metacyc_table { | |
my (@kegg) = (@_); | |
my %table; | |
printf(STDERR "Creating kegg2metacyc conversion table.\n") if $opts{verbose}; | |
my $url = sprintf("http://websvc.biocyc.org/ECOLI/foreignid?ids=%s", join(",", map{sprintf("Kegg:%s", $_)}@kegg)); | |
my $response = get($url); | |
foreach my $line (split(/\n/, $response)) { | |
my ($kegg, $count, $metacyc) = split(/\t/, $line, 3); | |
$kegg =~ s/^Kegg://g; | |
if($count) { | |
$table{$kegg} = $metacyc; | |
} | |
} | |
printf(STDERR "Done.\n") if $opts{verbose}; | |
return %table; | |
} | |
sub kegg2metacyc_array { | |
my (@kegg) = (@_); | |
my %table = kegg2metacyc_table(@kegg); | |
return map{exists($table{$_}) ? $table{$_} : "null"}@kegg; | |
} | |
sub kegg2map_table { | |
my %table; | |
printf(STDERR "Creating kegg2map conversion table.\n") if $opts{verbose}; | |
my $response = get("http://rest.kegg.jp/link/pathway/compound"); | |
foreach my $line (split(/\n/, $response)) { | |
if($line =~ /cpd:(C\d+)\tpath:(map\d+)/) { | |
my ($kegg, $map) = ($1, $2); | |
push(@{$table{$kegg}}, $map); | |
} | |
} | |
foreach my $kegg (keys %table) { | |
$table{$kegg} = sprintf("%s", join(",", @{$table{$kegg}})); | |
} | |
printf(STDERR "Done.\n") if $opts{verbose}; | |
return %table; | |
} | |
sub kegg2map_array { | |
my (@kegg) = (@_); | |
my %table = kegg2map_table(); | |
return map{exists($table{$_}) ? $table{$_} : "null"}@kegg; | |
} | |
sub kegg2drug_table { | |
my %table; | |
printf(STDERR "Creating kegg2drug conversion table.\n") if $opts{verbose}; | |
my $response = get("http://rest.kegg.jp/link/drug/compound"); | |
foreach my $line (split(/\n/, $response)) { | |
if($line =~ /cpd:(C\d+)\tdr:(D\d+)/) { | |
my ($kegg, $drug) = ($1, $2); | |
push(@{$table{$kegg}}, $drug); | |
} | |
} | |
foreach my $kegg (keys %table) { | |
$table{$kegg} = sprintf("%s", join(",", @{$table{$kegg}})); | |
} | |
printf(STDERR "Done.\n") if $opts{verbose}; | |
return %table; | |
} | |
sub kegg2drug_array { | |
my (@kegg) = (@_); | |
my %table = kegg2drug_table(); | |
return map{exists($table{$_}) ? $table{$_} : "null"}@kegg; | |
} | |
sub kegg2disease_table { | |
my %table; | |
printf(STDERR "Creating kegg2disease conversion table.\n") if $opts{verbose}; | |
my $response = get("http://rest.kegg.jp/link/disease/compound"); | |
foreach my $line (split(/\n/, $response)) { | |
if($line =~ /cpd:(C\d+)\tds:(H\d+)/) { | |
my ($kegg, $disease) = ($1, $2); | |
push(@{$table{$kegg}}, $disease); | |
} | |
} | |
foreach my $kegg (keys %table) { | |
$table{$kegg} = sprintf("%s", join(",", @{$table{$kegg}})); | |
} | |
printf(STDERR "Done.\n") if $opts{verbose}; | |
return %table; | |
} | |
sub kegg2disease_array { | |
my (@kegg) = (@_); | |
my %table = kegg2disease_table(); | |
return map{exists($table{$_}) ? $table{$_} : "null"}@kegg; | |
} | |
sub kegg2brite_table { | |
my %table; | |
printf(STDERR "Creating kegg2brite conversion table.\n") if $opts{verbose}; | |
my $response = get("http://rest.kegg.jp/link/brite/compound"); | |
foreach my $line (split(/\n/, $response)) { | |
if($line =~ /cpd:(C\d+)\tbr:(br\d+)/) { | |
my ($kegg, $brite) = ($1, $2); | |
push(@{$table{$kegg}}, $brite); | |
} | |
} | |
foreach my $kegg (keys %table) { | |
$table{$kegg} = sprintf("%s", join(",", @{$table{$kegg}})); | |
} | |
printf(STDERR "Done.\n") if $opts{verbose}; | |
return %table; | |
} | |
sub kegg2brite_array { | |
my (@kegg) = (@_); | |
my %table = kegg2brite_table(); | |
return map{exists($table{$_}) ? $table{$_} : "null"}@kegg; | |
} | |
sub keggCompoundTraverse_table { | |
my $text = get("http://rest.kegg.jp/get/br:br08001"); | |
sleep(2); | |
my $table; | |
my ($cA, $cB, $cC); | |
foreach my $line (split(/[\r\n]/, $text)) { | |
if(my ($level, $category) = ($line =~ /^([ABCD]) *(.+)$/)) { | |
if($level eq "A") { | |
$category =~ s/ *\[.+\]$//g; | |
$cA = $category; | |
} elsif($level eq "B") { | |
$category =~ s/ *\[.+\]$//g; | |
$cB = $category; | |
} elsif($level eq "C") { | |
$category =~ s/ *\[.+\]$//g; | |
$cC = $category; | |
} else { | |
my ($kegg, $name) = split(/\s/, $category); | |
$table->{$kegg}->{A} = $cA; | |
$table->{$kegg}->{B} = $cB; | |
$table->{$kegg}->{C} = $cC; | |
} | |
} | |
} | |
return $table; | |
} | |
sub keggCompoundTraverse { | |
my ($level, @kegg) = (@_); | |
my $table = keggCompoundTraverse_table(); | |
return map{exists($table->{$_}->{$level}) ? $table->{$_}->{$level}: "null"}@kegg; | |
} | |
sub keggPathwayTraverse_table { | |
my $text = get("http://rest.kegg.jp/get/br:br08901"); | |
sleep(2); | |
my $table; | |
my @Bterms; | |
my ($cA, $cB); | |
my %k2m = kegg2map_table(); | |
foreach my $line (split(/[\r\n]/, $text)) { | |
if(my ($level, $category) = ($line =~ /^([ABC]) *(.+)$/)) { | |
if($level eq "A") { | |
$cA = $category; | |
} elsif($level eq "B") { | |
push(@Bterms, $category); | |
$cB = $category; | |
} else { | |
my ($map, $name) = split(/\s/, $category); | |
$table->{"map$map"}->{A} = $cA; | |
$table->{"map$map"}->{B} = $cB; | |
} | |
} | |
} | |
my $ret; | |
foreach my $kegg (keys(%k2m)) { | |
foreach my $level (qw(A B)) { | |
my $tmp; | |
foreach my $map (split(/,/, $k2m{$kegg})) { | |
$tmp->{$table->{$map}->{$level}}++; | |
} | |
my @order = map{$_->[0]}sort{$b->[1] <=> $a->[1]}map{[$_, $tmp->{$_}]}keys(%{$tmp}); | |
$ret->{$kegg}->{$level} = $order[0]; | |
} | |
} | |
return $ret; | |
} | |
sub keggPathwayTraverse { | |
my ($level, @kegg) = (@_); | |
my $table = keggPathwayTraverse_table(); | |
return map{exists($table->{$_}->{$level}) ? $table->{$_}->{$level}: "null"}@kegg; | |
} | |
sub chembl2inchi_table { | |
my (@chembl) = grep{!/null/}(@_); | |
printf(STDERR "Creating chembl2inchi conversion table.\n") if $opts{verbose}; | |
my %table; | |
my $n = 200; | |
my $max = ceil(scalar(@chembl) / $n); | |
for (my $i = 0; $i * $n < scalar(@chembl); ++$i) { | |
sleep(2) if $i; | |
printf(STDERR "\r\e[KBatch %d out of %d (%.3f %%)", $i + 1, $max , (($i + 1) / $max) * 100) if $opts{verbose}; | |
my $query_template = <<EOQ; | |
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | |
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | |
PREFIX owl: <http://www.w3.org/2002/07/owl#> | |
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | |
PREFIX dc: <http://purl.org/dc/elements/1.1/> | |
PREFIX dcterms: <http://purl.org/dc/terms/> | |
PREFIX foaf: <http://xmlns.com/foaf/0.1/> | |
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> | |
PREFIX cco: <http://rdf.ebi.ac.uk/terms/chembl#> | |
SELECT ?id ?key | |
WHERE { | |
\%s | |
} | |
ORDER BY ?label | |
EOQ | |
my $union_template = <<EOT; | |
\{ | |
<http://rdf.ebi.ac.uk/resource/chembl/molecule/\%s> cco:chemblId ?id . | |
<http://rdf.ebi.ac.uk/resource/chembl/molecule/\%s#standard_inchi_key> <http://semanticscience.org/resource/SIO_000300> ?key . | |
\} | |
EOT | |
my $offset = $i * $n; | |
my $remain = scalar(@chembl) - $offset; | |
my $limit = $remain > $n ? $n : $remain; | |
my $union = join("UNION\n", map{sprintf($union_template, $_, $_)}@chembl[$offset..($offset+$limit-1)]); | |
my $query = sprintf($query_template, $union); | |
my $endpoint = "https://www.ebi.ac.uk/rdf/services/chembl/servlet/query"; | |
my %params = ( | |
query => $query, | |
offset => 0, | |
limit => $limit, | |
inference => "false" | |
); | |
my $request = POST($endpoint, [%params]); | |
my $ua = LWP::UserAgent->new(); | |
$ua->ssl_opts( SSL_ca_file => Mozilla::CA::SSL_ca_file() ); | |
my $response = $ua->request($request); | |
my $ref = XMLin($response->content()); | |
foreach my $result (map{$_->{binding}}@{$ref->{results}->{result}}) { | |
my $id = $result->{id}->{literal}; | |
my $key = $result->{key}->{literal}; | |
$table{$id} = $key; | |
} | |
} | |
printf(STDERR "\nDone.\n") if $opts{verbose}; | |
return %table; | |
} | |
sub chembl2inchi_array { | |
my (@chembl) = (@_); | |
my %table = chembl2inchi_table(@chembl); | |
return map{exists($table{$_}) ? $table{$_} : "null"}@chembl; | |
} | |
sub chembl2smiles_table { | |
my (@chembl) = grep{!/null/}(@_); | |
printf(STDERR "Creating chembl2smiles conversion table.\n") if $opts{verbose}; | |
my %table; | |
my $n = 200; | |
my $max = ceil(scalar(@chembl) / $n); | |
for (my $i = 0; $i * $n < scalar(@chembl); ++$i) { | |
sleep(2) if $i; | |
printf(STDERR "\r\e[KBatch %d out of %d (%.3f %%)", $i + 1, $max , (($i + 1) / $max) * 100) if $opts{verbose}; | |
my $query_template = <<EOQ; | |
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | |
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | |
PREFIX owl: <http://www.w3.org/2002/07/owl#> | |
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | |
PREFIX dc: <http://purl.org/dc/elements/1.1/> | |
PREFIX dcterms: <http://purl.org/dc/terms/> | |
PREFIX foaf: <http://xmlns.com/foaf/0.1/> | |
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> | |
PREFIX cco: <http://rdf.ebi.ac.uk/terms/chembl#> | |
SELECT ?id ?key | |
WHERE { | |
\%s | |
} | |
ORDER BY ?label | |
EOQ | |
my $union_template = <<EOT; | |
\{ | |
<http://rdf.ebi.ac.uk/resource/chembl/molecule/\%s> cco:chemblId ?id . | |
<http://rdf.ebi.ac.uk/resource/chembl/molecule/\%s#canonical_smiles> <http://semanticscience.org/resource/SIO_000300> ?key . | |
\} | |
EOT | |
my $offset = $i * $n; | |
my $remain = scalar(@chembl) - $offset; | |
my $limit = $remain > $n ? $n : $remain; | |
my $union = join("UNION\n", map{sprintf($union_template, $_, $_)}@chembl[$offset..($offset+$limit-1)]); | |
my $query = sprintf($query_template, $union); | |
my $endpoint = "https://www.ebi.ac.uk/rdf/services/chembl/servlet/query"; | |
my %params = ( | |
query => $query, | |
offset => 0, | |
limit => $limit, | |
inference => "false" | |
); | |
my $request = POST($endpoint, [%params]); | |
my $ua = LWP::UserAgent->new(); | |
$ua->ssl_opts( SSL_ca_file => Mozilla::CA::SSL_ca_file() ); | |
my $response = $ua->request($request); | |
my $ref = XMLin($response->content()); | |
foreach my $result (map{$_->{binding}}@{$ref->{results}->{result}}) { | |
my $id = $result->{id}->{literal}; | |
my $key = $result->{key}->{literal}; | |
$table{$id} = $key; | |
} | |
} | |
printf(STDERR "\nDone.\n") if $opts{verbose}; | |
return %table; | |
} | |
sub chembl2smiles_array { | |
my (@chembl) = (@_); | |
my %table = chembl2smiles_table(@chembl); | |
return map{exists($table{$_}) ? $table{$_} : "null"}@chembl; | |
} | |
sub chembl2formula_table { | |
my (@chembl) = grep{!/null/}(@_); | |
printf(STDERR "Creating chembl2formula conversion table.\n") if $opts{verbose}; | |
my %table; | |
my $n = 200; | |
my $max = ceil(scalar(@chembl) / $n); | |
for (my $i = 0; $i * $n < scalar(@chembl); ++$i) { | |
sleep(2) if $i; | |
printf(STDERR "\r\e[KBatch %d out of %d (%.3f %%)", $i + 1, $max , (($i + 1) / $max) * 100) if $opts{verbose}; | |
my $query_template = <<EOQ; | |
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> | |
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> | |
PREFIX owl: <http://www.w3.org/2002/07/owl#> | |
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> | |
PREFIX dc: <http://purl.org/dc/elements/1.1/> | |
PREFIX dcterms: <http://purl.org/dc/terms/> | |
PREFIX foaf: <http://xmlns.com/foaf/0.1/> | |
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> | |
PREFIX cco: <http://rdf.ebi.ac.uk/terms/chembl#> | |
SELECT ?id ?key | |
WHERE { | |
\%s | |
} | |
ORDER BY ?label | |
EOQ | |
my $union_template = <<EOT; | |
\{ | |
<http://rdf.ebi.ac.uk/resource/chembl/molecule/\%s> cco:chemblId ?id . | |
<http://rdf.ebi.ac.uk/resource/chembl/molecule/\%s#full_molformula> <http://semanticscience.org/resource/SIO_000300> ?key . | |
\} | |
EOT | |
my $offset = $i * $n; | |
my $remain = scalar(@chembl) - $offset; | |
my $limit = $remain > $n ? $n : $remain; | |
my $union = join("UNION\n", map{sprintf($union_template, $_, $_)}@chembl[$offset..($offset+$limit-1)]); | |
my $query = sprintf($query_template, $union); | |
my $endpoint = "https://www.ebi.ac.uk/rdf/services/chembl/servlet/query"; | |
my %params = ( | |
query => $query, | |
offset => 0, | |
limit => $limit, | |
inference => "false" | |
); | |
my $request = POST($endpoint, [%params]); | |
my $ua = LWP::UserAgent->new(); | |
$ua->ssl_opts( SSL_ca_file => Mozilla::CA::SSL_ca_file() ); | |
my $response = $ua->request($request); | |
my $ref = XMLin($response->content()); | |
foreach my $result (map{$_->{binding}}@{$ref->{results}->{result}}) { | |
my $id = $result->{id}->{literal}; | |
my $key = $result->{key}->{literal}; | |
$table{$id} = $key; | |
} | |
} | |
printf(STDERR "\nDone.\n") if $opts{verbose}; | |
return %table; | |
} | |
sub chembl2formula_array { | |
my (@chembl) = (@_); | |
my %table = chembl2formula_table(@chembl); | |
return map{exists($table{$_}) ? $table{$_} : "null"}@chembl; | |
} | |
########################## | |
# Miscellaneous routines # | |
########################## | |
sub print_help { | |
print <<EOS; | |
mztab_annotate.pl - Annotate mztab file according to provided metadata | |
Usage: | |
perl mztab_annotate.pl [options] [--input] <input> [--output] <output> | |
Options: | |
--force Force overwrite the input file. | |
--input Input mzTab file. | |
--output Name of mzTab output file. Defaults to STDOUT. | |
--verbose Execute in verbose mode. | |
EOS | |
my $status = shift || 0; | |
exit($status); | |
} | |
############################################################################### |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment