Last active
August 29, 2015 14:20
-
-
Save shenwei356/98a491c700173a0c229d to your computer and use it in GitHub Desktop.
extract_cds_by_gff, Perl and Python version
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scaf001 glean mRNA 20 30 0.4 - . somestring | |
scaf001 glean CDS 30 50 . + 0 asdfasd | |
scaf003 glean CDS 10 25 0.4 - . somestring |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use File::Basename; | |
use Getopt::Long; | |
use BioUtil::Seq; | |
use BioUtil::Util; | |
use Data::Dumper; | |
$0 = basename($0); | |
my $usage = q( | |
Usage: $0 [options] gff_file fasta_file | |
Options: | |
-t, --type gene type [CDS] | |
-us, --up-stream up stream length [0] | |
-ds, --down-stream down stream length [0] | |
-h, --help show this usage | |
); | |
my $argv = {}; | |
$$argv{type} = 'CDS'; | |
$$argv{up_stream} = 0; | |
$$argv{down_stream} = 0; | |
GetOptions( | |
'help|h' => \$$argv{help}, | |
'type|t=s' => \$$argv{type}, | |
'up-stream|us=s' => \$$argv{up_stream}, | |
'down-stream|ds=s' => \$$argv{down_stream}, | |
); | |
die $usage if $$argv{help}; | |
die $usage if scalar(@ARGV) != 2; | |
check_positive_integer( $$argv{up_stream} + 1 ); | |
check_positive_integer( $$argv{down_stream} + 1 ); | |
my ( $gff_file, $fasta_file ) = @ARGV; | |
my $genes = read_gff_file($gff_file); | |
# print Dumper($genes); | |
my $next_seq = FastaReader($fasta_file); | |
while ( my $fa = &$next_seq() ) { | |
my ( $name, $genome ) = @$fa; | |
next if not exists $$genes{$name}; | |
for my $gene ( @{ $$genes{$name} } ) { | |
next if lc $$gene{type} ne lc $$argv{type}; # specific type | |
my $seq = ''; | |
if ( $$gene{strand} eq '+' ) { | |
my $s = $$gene{start} - $$argv{up_stream} - 1; | |
$s = 0 if $s < 0; | |
$seq = substr( | |
$genome, $s, | |
$$gene{end} | |
- $$gene{start} | |
+ $$argv{down_stream} + 1 | |
); | |
} | |
else { | |
my $s = $$gene{start} - $$argv{down_stream} - 1; | |
$s = 0 if $s < 0; | |
$seq = revcom( | |
substr( | |
$genome, $s, | |
$$gene{end} - $$gene{start} + $$argv{up_stream} + 1 | |
) | |
); | |
} | |
printf( ">%s_%d..%d..%s\n%s", | |
$name, $$gene{start}, $$gene{end}, $$gene{strand}, | |
format_seq($seq) ); | |
} | |
} | |
sub read_gff_file { | |
my ($file) = @_; | |
my $genes = {}; | |
open( my $fh, "<", $file ) or die "fail to open file: $file\n"; | |
while (<$fh>) { | |
my @data = split( /\s+/, $_ ); | |
next unless scalar(@data) >= 9; | |
my $name = $data[0]; | |
my $gene = {}; | |
( $$gene{type}, $$gene{start}, $$gene{end}, $$gene{strand} ) | |
= ( $data[2], $data[3], $data[4], $data[6] ); | |
if ( not exists $$genes{$name} ) { | |
$$genes{$name} = []; | |
} | |
push @{ $$genes{$name} }, $gene; | |
} | |
close($fh); | |
return $genes; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function | |
import argparse | |
import sys | |
import gzip | |
from collections import defaultdict | |
from Bio import SeqIO | |
parser = argparse.ArgumentParser(description="extract_cds_by_gff") | |
parser.add_argument('-t', '--type', type=str, | |
default='CDS', help='gene type [CDS]') | |
parser.add_argument('-us', '--up-stream', type=int, | |
default=0, help='up stream length [0]') | |
parser.add_argument('-ds', '--down-stream', type=int, | |
default=0, help='down stream length [0]') | |
parser.add_argument('gff_file', type=str, help='gff file') | |
parser.add_argument('fasta_file', type=str, help='fasta file') | |
args = parser.parse_args() | |
if not (args.up_stream >= 0 and args.down_stream >= 0): | |
print('value of --up-stream and --down-stream should be >= 0', file=sys.stderr) | |
sys.exit(1) | |
def read_gff_file(file): | |
genes = defaultdict(list) | |
with open(file, 'rt') as fh: | |
for row in fh: | |
data = row.strip().split('\t') | |
if len(data) < 9: | |
continue | |
name = data[0] | |
gene = dict() | |
gene['type'], gene['start'], gene['end'], gene['strand'] | |
= data[2], int(data[3]), int(data[4]), data[6] | |
genes[name].append(gene) | |
return genes | |
genes = read_gff_file(args.gff_file) | |
fh = gzip.open(args.fasta_file, 'rt') if args.fasta_file.endswith('.gz') else open(args.fasta_file, 'r') | |
for record in SeqIO.parse(fh, 'fasta'): | |
name, genome = record.id, record.seq | |
if record.id not in genes: | |
continue | |
for gene in genes[name]: | |
if gene['type'].lower() != args.type.lower(): | |
continue | |
seq = '' | |
if gene['strand'] == '+': | |
s = gene['start'] - args.up_stream - 1 | |
s = 0 if s < 0 else s | |
seq = genome[s:gene['end'] + args.down_stream] | |
else: | |
s = gene['start'] - args.down_stream - 1 | |
s = 0 if s < 0 else s | |
seq = genome[s:gene['end'] + args.up_stream].reverse_complement() | |
print('>{}_{}..{}..{}\n{}'.format(name, gene['start'], gene['end'], gene['strand'], seq)) | |
fh.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>scaf001 | |
acgtatcgatgattactacgtatcgatgattactacgtatcgatgattactacgtatcgatgattact | |
>scaf002 | |
aacgacattcatgagtgattactgattactgattac | |
>scaf003 | |
acgtatcgatgattactacgtatcgatgattaacgacattcatgagtgattactgattactgatactac |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment