Skip to content

Instantly share code, notes, and snippets.

@shenwei356
Last active August 29, 2015 14:20
Show Gist options
  • Save shenwei356/98a491c700173a0c229d to your computer and use it in GitHub Desktop.
Save shenwei356/98a491c700173a0c229d to your computer and use it in GitHub Desktop.
extract_cds_by_gff, Perl and Python version
scaf001 glean mRNA 20 30 0.4 - . somestring
scaf001 glean CDS 30 50 . + 0 asdfasd
scaf003 glean CDS 10 25 0.4 - . somestring
#!/usr/bin/env perl
use strict;
use File::Basename;
use Getopt::Long;
use BioUtil::Seq;
use BioUtil::Util;
use Data::Dumper;
$0 = basename($0);
my $usage = q(
Usage: $0 [options] gff_file fasta_file
Options:
-t, --type gene type [CDS]
-us, --up-stream up stream length [0]
-ds, --down-stream down stream length [0]
-h, --help show this usage
);
my $argv = {};
$$argv{type} = 'CDS';
$$argv{up_stream} = 0;
$$argv{down_stream} = 0;
GetOptions(
'help|h' => \$$argv{help},
'type|t=s' => \$$argv{type},
'up-stream|us=s' => \$$argv{up_stream},
'down-stream|ds=s' => \$$argv{down_stream},
);
die $usage if $$argv{help};
die $usage if scalar(@ARGV) != 2;
check_positive_integer( $$argv{up_stream} + 1 );
check_positive_integer( $$argv{down_stream} + 1 );
my ( $gff_file, $fasta_file ) = @ARGV;
my $genes = read_gff_file($gff_file);
# print Dumper($genes);
my $next_seq = FastaReader($fasta_file);
while ( my $fa = &$next_seq() ) {
my ( $name, $genome ) = @$fa;
next if not exists $$genes{$name};
for my $gene ( @{ $$genes{$name} } ) {
next if lc $$gene{type} ne lc $$argv{type}; # specific type
my $seq = '';
if ( $$gene{strand} eq '+' ) {
my $s = $$gene{start} - $$argv{up_stream} - 1;
$s = 0 if $s < 0;
$seq = substr(
$genome, $s,
$$gene{end}
- $$gene{start}
+ $$argv{down_stream} + 1
);
}
else {
my $s = $$gene{start} - $$argv{down_stream} - 1;
$s = 0 if $s < 0;
$seq = revcom(
substr(
$genome, $s,
$$gene{end} - $$gene{start} + $$argv{up_stream} + 1
)
);
}
printf( ">%s_%d..%d..%s\n%s",
$name, $$gene{start}, $$gene{end}, $$gene{strand},
format_seq($seq) );
}
}
sub read_gff_file {
my ($file) = @_;
my $genes = {};
open( my $fh, "<", $file ) or die "fail to open file: $file\n";
while (<$fh>) {
my @data = split( /\s+/, $_ );
next unless scalar(@data) >= 9;
my $name = $data[0];
my $gene = {};
( $$gene{type}, $$gene{start}, $$gene{end}, $$gene{strand} )
= ( $data[2], $data[3], $data[4], $data[6] );
if ( not exists $$genes{$name} ) {
$$genes{$name} = [];
}
push @{ $$genes{$name} }, $gene;
}
close($fh);
return $genes;
}
#!/usr/bin/env python
from __future__ import print_function
import argparse
import sys
import gzip
from collections import defaultdict
from Bio import SeqIO
parser = argparse.ArgumentParser(description="extract_cds_by_gff")
parser.add_argument('-t', '--type', type=str,
default='CDS', help='gene type [CDS]')
parser.add_argument('-us', '--up-stream', type=int,
default=0, help='up stream length [0]')
parser.add_argument('-ds', '--down-stream', type=int,
default=0, help='down stream length [0]')
parser.add_argument('gff_file', type=str, help='gff file')
parser.add_argument('fasta_file', type=str, help='fasta file')
args = parser.parse_args()
if not (args.up_stream >= 0 and args.down_stream >= 0):
print('value of --up-stream and --down-stream should be >= 0', file=sys.stderr)
sys.exit(1)
def read_gff_file(file):
genes = defaultdict(list)
with open(file, 'rt') as fh:
for row in fh:
data = row.strip().split('\t')
if len(data) < 9:
continue
name = data[0]
gene = dict()
gene['type'], gene['start'], gene['end'], gene['strand']
= data[2], int(data[3]), int(data[4]), data[6]
genes[name].append(gene)
return genes
genes = read_gff_file(args.gff_file)
fh = gzip.open(args.fasta_file, 'rt') if args.fasta_file.endswith('.gz') else open(args.fasta_file, 'r')
for record in SeqIO.parse(fh, 'fasta'):
name, genome = record.id, record.seq
if record.id not in genes:
continue
for gene in genes[name]:
if gene['type'].lower() != args.type.lower():
continue
seq = ''
if gene['strand'] == '+':
s = gene['start'] - args.up_stream - 1
s = 0 if s < 0 else s
seq = genome[s:gene['end'] + args.down_stream]
else:
s = gene['start'] - args.down_stream - 1
s = 0 if s < 0 else s
seq = genome[s:gene['end'] + args.up_stream].reverse_complement()
print('>{}_{}..{}..{}\n{}'.format(name, gene['start'], gene['end'], gene['strand'], seq))
fh.close()
>scaf001
acgtatcgatgattactacgtatcgatgattactacgtatcgatgattactacgtatcgatgattact
>scaf002
aacgacattcatgagtgattactgattactgattac
>scaf003
acgtatcgatgattactacgtatcgatgattaacgacattcatgagtgattactgattactgatactac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment