Skip to content

Instantly share code, notes, and snippets.

@pmenzel
Created June 18, 2022 14:44
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pmenzel/50b5ae7aec8ac789274cdd159bc06cfa to your computer and use it in GitHub Desktop.
Save pmenzel/50b5ae7aec8ac789274cdd159bc06cfa to your computer and use it in GitHub Desktop.
Perl script for rotating a fasta sequence to start a specified gene search by BLAST
#!/usr/bin/env perl
#
# rotates and (if necessary reverse complements) an assembly of a circular genome
# so that it starts with the sequence having the best blast hit to a gene database,
# e.g. dnaA
#
# depends on blastn being installed
#
# Example usage:
# rotate_assembly.pl assembly.fasta dnaA.fa > rotated_assembly.fa
#
# 1. argument: input assembly with one circular chromosome
# 2. argument: blast database of genes
# The fasta file with rotated sequences is sent to STDOUT
#
use strict;
use warnings;
# Rotates a circular sequence in $_[0] so that it begins at $_[1]. If $_[2] is set, it also reverse complements the sequence
sub rotate_sequence {
die unless length($_[0]) && $_[1] > 0 && $_[1] <= length($_[0]);
my $rotated_seq = $_[2] ? substr($_[0],$_[1]) . substr($_[0],0,$_[1]) : substr($_[0],$_[1]-1) . substr($_[0],0,$_[1]-1);
if($_[2]) { #if revcompl
#replace all chars that do not belong to the set of complementable characters by N
$rotated_seq =~ s/[^ATGCatgcRYSWKMBVDHNryswkmbvdhn\.\-\?]/N/g;
#complement:
$rotated_seq =~ tr/ATGCatgcRYSWKMBVDHryswkmbvdh/TACGtacgYRSWMKVBHDyrswmkvbhd/;
#reverse:
$rotated_seq = reverse($rotated_seq);
}
return $rotated_seq;
}
#my $seq = "GGAACCTTCATGGCCAAT";
#print $seq,"\n";
#print rotate_sequence($seq,11,1),"\n";
#print "\n";
#$seq = "GGAACCTTATGGGCCAAT";
#print $seq,"\n";
#print rotate_sequence($seq,19,0),"\n";
#
#exit;
my $threshold_pid = 95.0;
if(@ARGV < 2) { die "Specify the 2 arguments: input_assembly.fa genes.fa.\ngenes.fa needs to be indexed with makeblastdb"; }
my %id2rotate;
my %id2revcompl;
my @blastout = `blastn -db $ARGV[1] -task blastn -query $ARGV[0] -outfmt '7 qseqid pident length qstart qend sstart send slen'`;
foreach my $l (@blastout) {
next if $l =~ /^\#/;
chomp $l;
my @F = split(/\t/,$l);
if(!defined($id2rotate{$F[0]}) && $F[1] >= $threshold_pid && $F[7] == $F[2]) {
my $id = $F[0];
print STDERR "Rotating sequence $id\n";
print STDERR $l,"\n";
if($F[6] < $F[5]) {
$id2revcompl{ $id } = 1;
$id2rotate{$id} = $F[4];
}
else {
$id2rotate{$id} = $F[3];
}
}
}
open(my $in_fh, $ARGV[0]) or die "Could not open file $ARGV[0].";
my $curr_seq = "";
my $curr_name = "";
while(<$in_fh>) {
chomp;
if(/^>(.*)/) {
if(length($curr_seq) == 0) {
$curr_name = $1;
next;
}
print ">$curr_name\n";
if(defined($id2rotate{$curr_name})) {
print rotate_sequence($curr_seq, $id2rotate{$curr_name}, $id2revcompl{$curr_name}), "\n";
}
else {
print "$curr_seq\n";
}
$curr_name = $1;
$curr_seq = "";
}
else {
s/\s//g;
$curr_seq .= $_;
}
}
print ">$curr_name\n";
if(defined($id2rotate{$curr_name})) {
print rotate_sequence($curr_seq, $id2rotate{$curr_name}, $id2revcompl{$curr_name}), "\n";
}
else {
print "$curr_seq\n";
}
close($in_fh);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment