Skip to content

Instantly share code, notes, and snippets.

@cjfields
Created March 12, 2012 17:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cjfields/2023358 to your computer and use it in GitHub Desktop.
Save cjfields/2023358 to your computer and use it in GitHub Desktop.
Updated script to grab gene locations, up and downstream
#!/usr/bin/perl
use strict;
use warnings;
use Bio::DB::EUtilities;
use Bio::SeqIO;
# this needs to be a list of EntrezGene unique IDs
my @gene_ids = @ARGV;
my $eutil = Bio::DB::EUtilities->new(-eutil => 'esummary',
-email => 'cjfields@bioperl.org',
-db => 'gene',
-id => \@gene_ids);
my $fetcher = Bio::DB::EUtilities->new(-eutil => 'efetch',
-email => 'cjfields@bioperl.org',
-db => 'nucleotide',
-rettype => 'gb');
while (my $docsum = $eutil->next_DocSum) {
# to ensure we grab the right ChrStart information, we grab the Item above
# it in the Item hierarchy (visible via print_all from the eutil instance)
my ($item) = $docsum->get_Items_by_name('GenomicInfoType');
my $gene_id = $docsum->get_id;
my %item_data = map {$_ => 0} qw(ChrAccVer ChrStart ChrStop);
while (my $sub_item = $item->next_subItem) {
if (exists $item_data{$sub_item->get_name}) {
$item_data{$sub_item->get_name} = $sub_item->get_content;
}
}
# adjust to 1-based coords (gene seems to assume 0-based closed)
$item_data{ChrStart}++;
$item_data{ChrStop}++;
my $strand = $item_data{ChrStart} > $item_data{ChrStop} ? 2 : 1;
print STDERR sprintf("Original %s, from %d-%d, strand %d\n", $item_data{ChrAccVer},
$item_data{ChrStart},
$item_data{ChrStop},
$strand
);
# check to make sure everything is set
for my $check (qw(ChrAccVer ChrStart ChrStop)) {
die "$check not set" unless $item_data{$check};
}
my $start_offset = 1;
my $end_offset = 1;
if ($strand == 2) {
($item_data{ChrStart}, $item_data{ChrStop}) = ($item_data{ChrStop}, $item_data{ChrStart});
$item_data{ChrStart} -= $end_offset;
$item_data{ChrStop} += $start_offset;
} else {
$item_data{ChrStart} -= $start_offset;
$item_data{ChrStop} += $end_offset;
}
print STDERR sprintf("Retrieving %s, from %d-%d, strand %d\n", $item_data{ChrAccVer},
$item_data{ChrStart},
$item_data{ChrStop},
$strand
);
$fetcher->set_parameters(-id => $item_data{ChrAccVer},
-seq_start => $item_data{ChrStart},
-seq_stop => $item_data{ChrStop},
-strand => $strand,
-rettype => 'fasta'
);
my $seq = $fetcher->get_Response->content;
open (my $fh, '<', \$seq);
my $in = Bio::SeqIO->new(-fh => $fh, -format => 'fasta');
# output files
my $out = Bio::SeqIO->new(-file => '>'.$gene_id."_all.fn", -format => 'fasta');
my $out_gene = Bio::SeqIO->new(-file => '>'.$gene_id."_gene.fn", -format => 'fasta');
my $out_up = Bio::SeqIO->new(-file => '>'.$gene_id."_up.fn", -format => 'fasta');
my $out_down = Bio::SeqIO->new(-file => '>'.$gene_id."_down.fn", -format => 'fasta');
my $seqobj = $in->next_seq;
# add Gene ID
$seqobj->display_id("GeneID:$gene_id:".$seqobj->display_id);
my ($trunc_start, $trunc_end) = (1 + $start_offset, $seqobj->length - $end_offset);
# write out entire sequence
$out->write_seq($seqobj);
# gene alone
$out_gene->write_seq($seqobj->trunc($trunc_start, $trunc_end));
# ups and downs
$out_up->write_seq($seqobj->trunc(1,$trunc_start - 1)) if $start_offset;
$out_down->write_seq($seqobj->trunc($trunc_end + 1, $seqobj->length))
if $end_offset;
}
@cjfields
Copy link
Author

Should work now

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment