cjfields/seq.pl

## seq.pl
#!/usr/bin/perl
use strict;
use warnings;
use Bio::DB::EUtilities;
use Bio::SeqIO;

# this needs to be a list of EntrezGene unique IDs
my @gene_ids = @ARGV;

my $eutil   = Bio::DB::EUtilities->new(-eutil => 'esummary',
                                       -email => 'cjfields@bioperl.org',
                                       -db    => 'gene',
                                       -id    => \@gene_ids);

my $fetcher = Bio::DB::EUtilities->new(-eutil   => 'efetch',
                                       -email => 'cjfields@bioperl.org',
                                       -db      => 'nucleotide',
                                       -rettype => 'gb');

while (my $docsum = $eutil->next_DocSum) {
    # to ensure we grab the right ChrStart information, we grab the Item above
    # it in the Item hierarchy (visible via print_all from the eutil instance)
    my ($item) = $docsum->get_Items_by_name('GenomicInfoType');

    my $gene_id = $docsum->get_id;

    my %item_data = map {$_ => 0} qw(ChrAccVer ChrStart ChrStop);


    while (my $sub_item = $item->next_subItem) {
        if (exists $item_data{$sub_item->get_name}) {
            $item_data{$sub_item->get_name} = $sub_item->get_content;
        }
    }

    # adjust to 1-based coords (gene seems to assume 0-based closed)
    $item_data{ChrStart}++;
    $item_data{ChrStop}++;

    my $strand = $item_data{ChrStart} > $item_data{ChrStop} ? 2 : 1;

    print STDERR sprintf("Original %s, from %d-%d, strand %d\n", $item_data{ChrAccVer},
                                                    $item_data{ChrStart},
                                                    $item_data{ChrStop},
                                                    $strand
                                                    );

    # check to make sure everything is set
    for my $check (qw(ChrAccVer ChrStart ChrStop)) {
        die "$check not set" unless $item_data{$check};
    }

    my $start_offset = 1;
    my $end_offset = 1;

    if ($strand == 2) {
        ($item_data{ChrStart}, $item_data{ChrStop}) = ($item_data{ChrStop}, $item_data{ChrStart});
        $item_data{ChrStart} -= $end_offset;
        $item_data{ChrStop} += $start_offset;
    } else {
        $item_data{ChrStart} -= $start_offset;
        $item_data{ChrStop} += $end_offset;
    }

    print STDERR sprintf("Retrieving %s, from %d-%d, strand %d\n", $item_data{ChrAccVer},
                                                    $item_data{ChrStart},
                                                    $item_data{ChrStop},
                                                    $strand
                                                    );

    $fetcher->set_parameters(-id => $item_data{ChrAccVer},
                             -seq_start => $item_data{ChrStart},
                             -seq_stop  => $item_data{ChrStop},
                             -strand    => $strand,
                             -rettype   => 'fasta'
                             );
    my $seq = $fetcher->get_Response->content;


    open (my $fh, '<', \$seq);
    my $in = Bio::SeqIO->new(-fh => $fh, -format => 'fasta');

    # output files
    my $out = Bio::SeqIO->new(-file => '>'.$gene_id."_all.fn", -format => 'fasta');
    my $out_gene = Bio::SeqIO->new(-file => '>'.$gene_id."_gene.fn", -format => 'fasta');
    my $out_up = Bio::SeqIO->new(-file => '>'.$gene_id."_up.fn", -format => 'fasta');
    my $out_down = Bio::SeqIO->new(-file => '>'.$gene_id."_down.fn", -format => 'fasta');

    my $seqobj = $in->next_seq;

    # add Gene ID
    $seqobj->display_id("GeneID:$gene_id:".$seqobj->display_id);

    my ($trunc_start, $trunc_end) = (1 + $start_offset, $seqobj->length - $end_offset);

    # write out entire sequence
    $out->write_seq($seqobj);

    # gene alone
    $out_gene->write_seq($seqobj->trunc($trunc_start, $trunc_end));

    # ups and downs
    $out_up->write_seq($seqobj->trunc(1,$trunc_start - 1)) if $start_offset;
    $out_down->write_seq($seqobj->trunc($trunc_end + 1, $seqobj->length))
        if $end_offset;
}
	#!/usr/bin/perl
	use strict;
	use warnings;
	use Bio::DB::EUtilities;
	use Bio::SeqIO;

	# this needs to be a list of EntrezGene unique IDs
	my @gene_ids = @ARGV;

	my $eutil = Bio::DB::EUtilities->new(-eutil => 'esummary',
	-email => 'cjfields@bioperl.org',
	-db => 'gene',
	-id => \@gene_ids);

	my $fetcher = Bio::DB::EUtilities->new(-eutil => 'efetch',
	-email => 'cjfields@bioperl.org',
	-db => 'nucleotide',
	-rettype => 'gb');

	while (my $docsum = $eutil->next_DocSum) {
	# to ensure we grab the right ChrStart information, we grab the Item above
	# it in the Item hierarchy (visible via print_all from the eutil instance)
	my ($item) = $docsum->get_Items_by_name('GenomicInfoType');

	my $gene_id = $docsum->get_id;

	my %item_data = map {$_ => 0} qw(ChrAccVer ChrStart ChrStop);



	while (my $sub_item = $item->next_subItem) {
	if (exists $item_data{$sub_item->get_name}) {
	$item_data{$sub_item->get_name} = $sub_item->get_content;
	}
	}

	# adjust to 1-based coords (gene seems to assume 0-based closed)
	$item_data{ChrStart}++;
	$item_data{ChrStop}++;

	my $strand = $item_data{ChrStart} > $item_data{ChrStop} ? 2 : 1;

	print STDERR sprintf("Original %s, from %d-%d, strand %d\n", $item_data{ChrAccVer},
	$item_data{ChrStart},
	$item_data{ChrStop},
	$strand
	);

	# check to make sure everything is set
	for my $check (qw(ChrAccVer ChrStart ChrStop)) {
	die "$check not set" unless $item_data{$check};
	}

	my $start_offset = 1;
	my $end_offset = 1;

	if ($strand == 2) {
	($item_data{ChrStart}, $item_data{ChrStop}) = ($item_data{ChrStop}, $item_data{ChrStart});
	$item_data{ChrStart} -= $end_offset;
	$item_data{ChrStop} += $start_offset;
	} else {
	$item_data{ChrStart} -= $start_offset;
	$item_data{ChrStop} += $end_offset;
	}

	print STDERR sprintf("Retrieving %s, from %d-%d, strand %d\n", $item_data{ChrAccVer},
	$item_data{ChrStart},
	$item_data{ChrStop},
	$strand
	);

	$fetcher->set_parameters(-id => $item_data{ChrAccVer},
	-seq_start => $item_data{ChrStart},
	-seq_stop => $item_data{ChrStop},
	-strand => $strand,
	-rettype => 'fasta'
	);
	my $seq = $fetcher->get_Response->content;


	open (my $fh, '<', \$seq);
	my $in = Bio::SeqIO->new(-fh => $fh, -format => 'fasta');

	# output files
	my $out = Bio::SeqIO->new(-file => '>'.$gene_id."_all.fn", -format => 'fasta');
	my $out_gene = Bio::SeqIO->new(-file => '>'.$gene_id."_gene.fn", -format => 'fasta');
	my $out_up = Bio::SeqIO->new(-file => '>'.$gene_id."_up.fn", -format => 'fasta');
	my $out_down = Bio::SeqIO->new(-file => '>'.$gene_id."_down.fn", -format => 'fasta');

	my $seqobj = $in->next_seq;

	# add Gene ID
	$seqobj->display_id("GeneID:$gene_id:".$seqobj->display_id);

	my ($trunc_start, $trunc_end) = (1 + $start_offset, $seqobj->length - $end_offset);

	# write out entire sequence
	$out->write_seq($seqobj);

	# gene alone
	$out_gene->write_seq($seqobj->trunc($trunc_start, $trunc_end));

	# ups and downs
	$out_up->write_seq($seqobj->trunc(1,$trunc_start - 1)) if $start_offset;
	$out_down->write_seq($seqobj->trunc($trunc_end + 1, $seqobj->length))
	if $end_offset;
	}