shenwei356/data.gff

## data.gff
scaf001	glean	mRNA	20	30	0.4	-	.	somestring
scaf001	glean	CDS	30	50	.	+	0	asdfasd
scaf003	glean	CDS	10	25	0.4	-	.	somestring

## extract_cds_by_gff.pl
#!/usr/bin/env perl

use strict;
use File::Basename;
use Getopt::Long;
use BioUtil::Seq;
use BioUtil::Util;
use Data::Dumper;

$0 = basename($0);
my $usage = q(
Usage: $0 [options] gff_file fasta_file
Options:
    -t,   --type         gene type [CDS]
    -us,  --up-stream    up stream length [0]
    -ds,  --down-stream  down stream length [0]
    -h,   --help         show this usage

);

my $argv = {};
$$argv{type}        = 'CDS';
$$argv{up_stream}   = 0;
$$argv{down_stream} = 0;

GetOptions(
    'help|h'           => \$$argv{help},
    'type|t=s'         => \$$argv{type},
    'up-stream|us=s'   => \$$argv{up_stream},
    'down-stream|ds=s' => \$$argv{down_stream},
);

die $usage if $$argv{help};
die $usage if scalar(@ARGV) != 2;

check_positive_integer( $$argv{up_stream} + 1 );
check_positive_integer( $$argv{down_stream} + 1 );

my ( $gff_file, $fasta_file ) = @ARGV;

my $genes = read_gff_file($gff_file);

# print Dumper($genes);

my $next_seq = FastaReader($fasta_file);
while ( my $fa = &$next_seq() ) {
    my ( $name, $genome ) = @$fa;
    next if not exists $$genes{$name};

    for my $gene ( @{ $$genes{$name} } ) {
        next if lc $$gene{type} ne lc $$argv{type};    # specific type
        my $seq = '';

        if ( $$gene{strand} eq '+' ) {
            my $s = $$gene{start} - $$argv{up_stream} - 1;
            $s = 0 if $s < 0;
            $seq = substr(
                $genome, $s,
                $$gene{end}
                    - $$gene{start}
                    + $$argv{down_stream} + 1

            );
        }
        else {
            my $s = $$gene{start} - $$argv{down_stream} - 1;
            $s = 0 if $s < 0;
            $seq = revcom(
                substr(
                    $genome, $s,
                    $$gene{end} - $$gene{start} + $$argv{up_stream} + 1
                )
            );
        }
        printf( ">%s_%d..%d..%s\n%s",
            $name, $$gene{start}, $$gene{end}, $$gene{strand},
            format_seq($seq) );
    }

}

sub read_gff_file {
    my ($file) = @_;
    my $genes = {};
    open( my $fh, "<", $file ) or die "fail to open file: $file\n";
    while (<$fh>) {
        my @data = split( /\s+/, $_ );
        next unless scalar(@data) >= 9;
        my $name = $data[0];
        my $gene = {};
        ( $$gene{type}, $$gene{start}, $$gene{end}, $$gene{strand} )
            = ( $data[2], $data[3], $data[4], $data[6] );
        if ( not exists $$genes{$name} ) {
            $$genes{$name} = [];
        }
        push @{ $$genes{$name} }, $gene;

    }
    close($fh);
    return $genes;
}

## extract_cds_by_gff.py
#!/usr/bin/env python

from __future__ import print_function
import argparse
import sys
import gzip
from collections import defaultdict
from Bio import SeqIO

parser = argparse.ArgumentParser(description="extract_cds_by_gff")
parser.add_argument('-t', '--type', type=str,
					default='CDS', help='gene type [CDS]')
parser.add_argument('-us', '--up-stream', type=int,
					default=0, help='up stream length [0]')
parser.add_argument('-ds', '--down-stream', type=int,
					default=0, help='down stream length [0]')
parser.add_argument('gff_file', type=str, help='gff file')
parser.add_argument('fasta_file', type=str, help='fasta file')
args = parser.parse_args()
if not (args.up_stream >= 0 and args.down_stream >= 0):
    print('value of --up-stream and --down-stream should be >= 0', file=sys.stderr)
    sys.exit(1)


def read_gff_file(file):
    genes = defaultdict(list)
    with open(file, 'rt') as fh:
        for row in fh:
            data = row.strip().split('\t')
            if len(data) < 9:
                continue
            name = data[0]
            gene = dict()
            gene['type'], gene['start'], gene['end'], gene['strand']
				= data[2], int(data[3]), int(data[4]), data[6]
            genes[name].append(gene)
    return genes


genes = read_gff_file(args.gff_file)

fh = gzip.open(args.fasta_file, 'rt') if args.fasta_file.endswith('.gz') else open(args.fasta_file, 'r')
for record in SeqIO.parse(fh, 'fasta'):
    name, genome = record.id, record.seq
    if record.id not in genes:
        continue

    for gene in genes[name]:
        if gene['type'].lower() != args.type.lower():
            continue
        seq = ''
        if gene['strand'] == '+':
            s = gene['start'] - args.up_stream - 1
            s = 0 if s < 0 else s
            seq = genome[s:gene['end'] + args.down_stream]
        else:
            s = gene['start'] - args.down_stream - 1
            s = 0 if s < 0 else s
            seq = genome[s:gene['end'] + args.up_stream].reverse_complement()
        print('>{}_{}..{}..{}\n{}'.format(name, gene['start'], gene['end'], gene['strand'], seq))
fh.close()

## seq.fa
>scaf001
acgtatcgatgattactacgtatcgatgattactacgtatcgatgattactacgtatcgatgattact
>scaf002
aacgacattcatgagtgattactgattactgattac
>scaf003
acgtatcgatgattactacgtatcgatgattaacgacattcatgagtgattactgattactgatactac
	scaf001 glean mRNA 20 30 0.4 - . somestring
	scaf001 glean CDS 30 50 . + 0 asdfasd
	scaf003 glean CDS 10 25 0.4 - . somestring
	#!/usr/bin/env perl

	use strict;
	use File::Basename;
	use Getopt::Long;
	use BioUtil::Seq;
	use BioUtil::Util;
	use Data::Dumper;

	$0 = basename($0);
	my $usage = q(
	Usage: $0 [options] gff_file fasta_file
	Options:
	-t, --type gene type [CDS]
	-us, --up-stream up stream length [0]
	-ds, --down-stream down stream length [0]
	-h, --help show this usage

	);

	my $argv = {};
	$$argv{type} = 'CDS';
	$$argv{up_stream} = 0;
	$$argv{down_stream} = 0;

	GetOptions(
	'help\|h' => \$$argv{help},
	'type\|t=s' => \$$argv{type},
	'up-stream\|us=s' => \$$argv{up_stream},
	'down-stream\|ds=s' => \$$argv{down_stream},
	);

	die $usage if $$argv{help};
	die $usage if scalar(@ARGV) != 2;

	check_positive_integer( $$argv{up_stream} + 1 );
	check_positive_integer( $$argv{down_stream} + 1 );

	my ( $gff_file, $fasta_file ) = @ARGV;

	my $genes = read_gff_file($gff_file);

	# print Dumper($genes);

	my $next_seq = FastaReader($fasta_file);
	while ( my $fa = &$next_seq() ) {
	my ( $name, $genome ) = @$fa;
	next if not exists $$genes{$name};

	for my $gene ( @{ $$genes{$name} } ) {
	next if lc $$gene{type} ne lc $$argv{type}; # specific type
	my $seq = '';

	if ( $$gene{strand} eq '+' ) {
	my $s = $$gene{start} - $$argv{up_stream} - 1;
	$s = 0 if $s < 0;
	$seq = substr(
	$genome, $s,
	$$gene{end}
	- $$gene{start}
	+ $$argv{down_stream} + 1

	);
	}
	else {
	my $s = $$gene{start} - $$argv{down_stream} - 1;
	$s = 0 if $s < 0;
	$seq = revcom(
	substr(
	$genome, $s,
	$$gene{end} - $$gene{start} + $$argv{up_stream} + 1
	)
	);
	}
	printf( ">%s_%d..%d..%s\n%s",
	$name, $$gene{start}, $$gene{end}, $$gene{strand},
	format_seq($seq) );
	}

	}

	sub read_gff_file {
	my ($file) = @_;
	my $genes = {};
	open( my $fh, "<", $file ) or die "fail to open file: $file\n";
	while (<$fh>) {
	my @data = split( /\s+/, $_ );
	next unless scalar(@data) >= 9;
	my $name = $data[0];
	my $gene = {};
	( $$gene{type}, $$gene{start}, $$gene{end}, $$gene{strand} )
	= ( $data[2], $data[3], $data[4], $data[6] );
	if ( not exists $$genes{$name} ) {
	$$genes{$name} = [];
	}
	push @{ $$genes{$name} }, $gene;

	}
	close($fh);
	return $genes;
	}
	#!/usr/bin/env python

	from __future__ import print_function
	import argparse
	import sys
	import gzip
	from collections import defaultdict
	from Bio import SeqIO

	parser = argparse.ArgumentParser(description="extract_cds_by_gff")
	parser.add_argument('-t', '--type', type=str,
	default='CDS', help='gene type [CDS]')
	parser.add_argument('-us', '--up-stream', type=int,
	default=0, help='up stream length [0]')
	parser.add_argument('-ds', '--down-stream', type=int,
	default=0, help='down stream length [0]')
	parser.add_argument('gff_file', type=str, help='gff file')
	parser.add_argument('fasta_file', type=str, help='fasta file')
	args = parser.parse_args()
	if not (args.up_stream >= 0 and args.down_stream >= 0):
	print('value of --up-stream and --down-stream should be >= 0', file=sys.stderr)
	sys.exit(1)


	def read_gff_file(file):
	genes = defaultdict(list)
	with open(file, 'rt') as fh:
	for row in fh:
	data = row.strip().split('\t')
	if len(data) < 9:
	continue
	name = data[0]
	gene = dict()
	gene['type'], gene['start'], gene['end'], gene['strand']
	= data[2], int(data[3]), int(data[4]), data[6]
	genes[name].append(gene)
	return genes


	genes = read_gff_file(args.gff_file)

	fh = gzip.open(args.fasta_file, 'rt') if args.fasta_file.endswith('.gz') else open(args.fasta_file, 'r')
	for record in SeqIO.parse(fh, 'fasta'):
	name, genome = record.id, record.seq
	if record.id not in genes:
	continue

	for gene in genes[name]:
	if gene['type'].lower() != args.type.lower():
	continue
	seq = ''
	if gene['strand'] == '+':
	s = gene['start'] - args.up_stream - 1
	s = 0 if s < 0 else s
	seq = genome[s:gene['end'] + args.down_stream]
	else:
	s = gene['start'] - args.down_stream - 1
	s = 0 if s < 0 else s
	seq = genome[s:gene['end'] + args.up_stream].reverse_complement()
	print('>{}_{}..{}..{}\n{}'.format(name, gene['start'], gene['end'], gene['strand'], seq))
	fh.close()
	>scaf001
	acgtatcgatgattactacgtatcgatgattactacgtatcgatgattactacgtatcgatgattact
	>scaf002
	aacgacattcatgagtgattactgattactgattac
	>scaf003
	acgtatcgatgattactacgtatcgatgattaacgacattcatgagtgattactgattactgatactac