Skip to content

Instantly share code, notes, and snippets.

@slowkow
Created March 24, 2011 06:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save slowkow/884661 to your computer and use it in GitHub Desktop.
Save slowkow/884661 to your computer and use it in GitHub Desktop.
Query any NCBI database (nucleotide, protein, nucgss, etc.) and retrieve resulting records.
#!/usr/bin/perl
# Author : Kamil Slowikowski <kslowikowski@gmail.com>
# Date : March 23, 2011
# Version : 0.1
# Description : Search an NCBI database with ESearch and download the matching
# records with EFetch. Modified code from
# http://www.bioperl.org/wiki/HOWTO:EUtilities_Cookbook
use strict;
use warnings;
use Bio::DB::EUtilities;
use Getopt::Long::Descriptive;
# Declare and parse the command-line interface.
#   $opt   - parsed options; each option is read through an accessor named
#            after it (e.g. $opt->query, $opt->retmax).
#   $usage - usage object; its text is printed when help is requested or the
#            query is missing.
# Every option except --listdatabases and --help carries a default, so the
# accessors for those options never return undef.
my ($opt, $usage) = describe_options(
    '%c [options...]',
    [ 'output|o=s',
      "write returned data to this file, default is stdout",
      { default => 'stdout' }
    ],
    [ 'query|q=s',
      "query sent to NCBI",
      { default => '' }    # empty means "no query given"
    ],
    [ 'db|d=s',
      "NCBI database to query, default is nucgss",
      { default => 'nucgss' }    # another common choice: nucleotide
    ],
    [ 'listdatabases|l',
      "list all available databases and exit",
    ],
    [ 'email|e=s',
      "your email, so NCBI can track your project "
      . "and contact you if there is a problem",
      { default => 'your@email.com' }
    ],
    [ 'retstart|s=i',
      "number of results to skip, default is 0",
      { default => 0 }
    ],
    [ 'retmax|m=i',
      "number of results to download at a time, default is 500",
      { default => 500 }
    ],
    [ 'rettype|t=s',
      "format of returned data, default is fasta",
      { default => 'fasta' }
    ],
    [ 'retries|r=i',
      "number of retries before giving up if the server doesn't respond",
      { default => 5 }
    ],
    [],    # spacer: blank line in the usage text
    [ 'help|h', "print usage message and exit" ],
);
# --listdatabases: query the EInfo utility for the database names, print them
# alphabetically on a single space-separated line, and stop.
if ($opt->listdatabases) {
    my $einfo = Bio::DB::EUtilities->new(
        -eutil => 'einfo',
        -email => $opt->email,
    );
    my @databases = $einfo->get_available_databases;
    @databases = sort { $a cmp $b } @databases;
    print "@databases\n";
    exit 0;
}
# Show the usage text and stop when --help was given or when no query was
# supplied (the query option defaults to the empty string).
if ($opt->help or $opt->query eq '') {
    print $usage->text;
    exit 0;
}
# Run the search.  -usehistory => 'y' asks for a history entry, which is
# picked up further down and handed to EFetch.
my $factory = Bio::DB::EUtilities->new(
    -eutil      => 'esearch',
    -db         => $opt->db,
    -term       => $opt->query,
    -email      => $opt->email,
    -usehistory => 'y',
);

# Stop with an explicit message if the query fails to be translated.
my $translation = $factory->get_query_translation
    or die "Query '" . $opt->query . "' could not be translated by NCBI\n";

# Check how many results we got back.
my $count = $factory->get_count;

# Status and the prompt are written with warn (STDERR) so they never mix with
# record data, which goes to STDOUT by default.
warn "Query translates to '$translation'\n";
warn "Found $count results\n";
warn "Download and print all data? [no] \n";

# Anything that does not start with "y" counts as "no" -- including end of
# input, where <STDIN> returns undef (e.g. stdin closed or redirected from an
# empty source).
my $answer = <STDIN>;
exit unless defined $answer && $answer =~ /^ye?s?/i;
# Pull the history entry created by the search; without it there is nothing
# to fetch.
my $hist = $factory->next_History
    or die 'No history data returned';

# Re-purpose the same factory for EFetch.  Only the parameters set here are
# changed; the database chosen for the search carries over.
$factory->set_parameters(
    -eutil   => 'efetch',
    -rettype => $opt->rettype,
    -history => $hist,
);

# Retry counter used by the download loop.
my $retry = 0;

# Batch size and index of the first result to download.
my $retmax   = $opt->retmax;
my $retstart = $opt->retstart;

# Destination handle: a named file, or STDOUT when the output option is left
# at its 'stdout' default.
my $out;
if ($opt->output ne 'stdout') {
    open($out, '>', $opt->output)
        or die "Can't open file: $!";
}
else {
    $out = \*STDOUT;
}
# Download the results in batches of $retmax records, beginning at index
# $retstart, and write each chunk of returned data to $out.

# A batch size below 1 would never advance $retstart, so the loop below would
# never terminate.
die "retmax must be at least 1\n" if $retmax < 1;

RETRIEVE_RESULTS:
while ($retstart < $count) {
    $factory->set_parameters(
        -retmax   => $retmax,
        -retstart => $retstart
    );

    # get_Response hands the returned data to the callback, which writes it
    # to the output handle.
    eval {
        $factory->get_Response(
            -cb => sub {
                my ($data) = @_;
                print {$out} $data;
            }
        );
    };

    # Copy $@ immediately: it is a global that any later eval can clobber.
    my $error = $@;
    if ($error) {
        # Give up once the configured number of retries is used up.
        if ($retry >= $opt->retries) {
            die "Server error: $error. Try again later";
        }

        # Otherwise run the same batch again; redo re-enters the loop body
        # without advancing $retstart.
        warn "Server error, retry #", ++$retry, "\n";
        redo RETRIEVE_RESULTS;
    }

    # The batch succeeded, so later batches get a fresh retry budget instead
    # of inheriting failures from earlier in a long download.
    $retry = 0;

    # Never report an upper bound beyond the number of results that exist.
    my $batch_end = $retstart + $retmax;
    $batch_end = $count if $batch_end > $count;
    warn "Retrieved $retstart-$batch_end\n";

    $retstart += $retmax;
}

# Buffered write errors (e.g. a full disk) only surface at close.
close $out or die "Can't close output: $!";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment