Skip to content

Instantly share code, notes, and snippets.

@IdoBar
Last active June 18, 2019 09:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save IdoBar/8a324fff97c6a866ea7e0cbcec41d570 to your computer and use it in GitHub Desktop.
Save IdoBar/8a324fff97c6a866ea7e0cbcec41d570 to your computer and use it in GitHub Desktop.
Fasta2APG
#!/bin/env perl
### david.studholme@tsl.ac.uk
### Generates contigs (in FastA) and scaffolding information (in AGP) from Velvet 'contigs.fa' supercontigs file
### Use entirely at you own risk!! There may be bugs!
### modified by chienchi@lanl.gov 2010/09/13
### add flags -i -size -o
### change naming convention for HMP assembly purpose
### modified by mpop@umd.edu 2010/9/15
### added flag -name
### allowing to change names of sequences
### modified by mpop@umd.edu 2010/9/27
### removed 10bp limit for inter-contig gaps
### modified by annelies.haegeman@ilvo.vlaanderen.be 2014/11/27
### see http://seqanswers.com/forums/showpost.php?s=65a823bf03e0f9d596d5dbba36b61b82&p=155500&postcount=12
### fixed counter $i to restart from 1 at each scaffold + unique fragment number generation
### modified by i.bar@griffith.edu.au 2019/06/18
### remove N's in the beginning of scaffolds/contigs
### use /bin/env to determine perl to use
### see relevant issue on https://www.biostars.org/p/335834/
use strict;
use warnings ;
use Bio::SeqIO ;
use Getopt::Long;
use File::Basename;
my $file;
my $outDir;
my $scaf_size_cutoff=0;
my $name_prefix="AGP";
GetOptions(
'i=s' => \$file, # scaf seq in fasta
'size=i' => \$scaf_size_cutoff,
'o=s' => \$outDir,
'name=s' => \$name_prefix);
die "Usage: $0 -i <sequence file> -o <out_dir> [-size <scaf_size_cutoff>] [-name <name>]\n" unless $file;
my ($file_name, $path, $suffix)=fileparse("$file", qr/\.[^.]*/);
my ($sample_name,$center)=split /_/,$file_name;
### Output file for contigs in Fasta format
### The various output file names can be tuned here.
my $contig_outfile = "$outDir/$name_prefix.contigs.fa";
my $scaffolds_outfile = "$outDir/$name_prefix.scaffolds.fa";
my $agp_outfile = "$outDir/$name_prefix.agp";
#open (FILE, ">$outdir/$contig_outfile") and
# warn "Will write contigs to file '$contig_outfile' to $outDir Directory\n" or
# die "Failed to write to file '$fasta_outfile'\n";
my $contig_out = Bio::SeqIO->new('-file' => ">$contig_outfile",
'-format' => 'Fasta') and
warn "Will write contigs to file '$contig_outfile' to $outDir Directory\n" or
die "Failed to write to file '$contig_outfile'\n";
open (AGP_FILE, ">$agp_outfile") and
warn "Will write contigs to file $agp_outfile to $outDir Directory\n" or
die "Failed to write to file $agp_outfile\n";
#open (SCAFF_FILE, ">$outDir/$scaffolds_outfile") and
my $scaff_out = Bio::SeqIO->new('-file' => ">$scaffolds_outfile",
'-format' => 'Fasta') and
warn "Will write scaffolds to file $scaffolds_outfile to $outDir Directory\n" or
die "Failed to write to file $scaffolds_outfile\n";
print AGP_FILE "# Generated from SOAPdenovo assembly file $file using script $0\n";
warn "Scaffold Sequence cutoff = $scaf_size_cutoff nt\n" if ($scaf_size_cutoff);
my $i = 0;# a counter, used for generating unique contig names
my $inseq = Bio::SeqIO->new('-file' => "<$file",
'-format' => 'Fasta' ) ;
while (my $seq_obj = $inseq->next_seq ) {
my $supercontig_id = $seq_obj->id ;
my $supercontig_seq = $seq_obj->seq ;
$supercontig_seq =~ s/^[Nn]+//; # remove leading N's
my $supercontig_desc = $seq_obj->description ;
my $supercontig_length = length($supercontig_seq);
$i=0; #reset each time you have a new scaffold
# only process the long supercontigs
next if ($supercontig_length < $scaf_size_cutoff);
### NCBI do not allow coverage and length information in the FastA identifier
### e.g. NODE_1160_length_397673_cov_14.469489 is an illegal FastA ID
### So we will replace these with simple numbers
if ($supercontig_id =~ m/NODE_(\d+)_length_\d+_cov_\d+/ or
$supercontig_id =~ m/^(\d+)$/ or
$supercontig_id =~ m/^scaffold(\d+)$/) {
$supercontig_id = "${name_prefix}_scaffold_$1";
}
# print SCAFF_FILE ">$supercontig_id\n$supercontig_seq\n";
my $scf = Bio::PrimarySeq->new(-seq => "$supercontig_seq",
-id => "$supercontig_id");
$scaff_out->write_seq($scf);
my $start_pos = 1; # keep track of whereabouts in this supercontig we are
my %substring_sequences;
foreach my $substring_sequence ( split /(N+)/i, $supercontig_seq ) {
#warn "\n$substring_sequence\n" if $supercontig_id eq '1160'; for #debugging only
$i++; # a counter, used for generating unique contig names
### Define the AGP column contents
my $object1 = $supercontig_id;
my $object_beg2 = $start_pos;
my $object_end3 = $start_pos + length($substring_sequence) - 1;
my $part_number4 = $i;
my $component_type5;
my $component_id6a;
my $gap_length6b;
my $component_beg7a;
my $gap_type7b;
my $component_end8a;
my $linkage8b;
my $orientation9a;
my $filler9b;
if ( $substring_sequence =~ m/^N+$/i ) {
### This is poly-N gap between contigs
$component_type5 = 'N';
$gap_length6b = length($substring_sequence);
$gap_type7b = 'fragment';
$linkage8b = 'yes';
$filler9b = '';
} elsif ( $substring_sequence =~ m/^[ACGT]+$/i ) {
### This is a contig
$component_type5 = 'W';
$component_id6a = "$supercontig_id"."_"."$i";
$component_beg7a = 1;
$component_end8a = length($substring_sequence);
$orientation9a = '+';
### Print FastA formatted contig
# print FILE ">$component_id6a\n$substring_sequence\n";
my $ctg = Bio::PrimarySeq->new(-seq => "$substring_sequence",
-id => "$component_id6a");
$contig_out->write_seq($ctg);
} else {
die "Illegal characters in sequence\n$substring_sequence\n";
}
$start_pos += length ($substring_sequence);
if ($component_type5 eq 'N') {
### print AGP line for gap
print AGP_FILE "$object1\t$object_beg2\t$object_end3\t$part_number4\t$component_type5\t$gap_length6b\t$gap_type7b\t$linkage8b\t$filler9b\n";
} else {
### print AGP line for contig
print AGP_FILE "$object1\t$object_beg2\t$object_end3\t$part_number4\t$component_type5\t$component_id6a\t$component_beg7a\t$component_end8a\t$orientation9a\n";
}
}
}
$contig_out->close();
close AGP_FILE;
$scaff_out->close();
@IdoBar
Copy link
Author

IdoBar commented Jun 18, 2019

Originally part of the Velvet assembly tools, available at https://www.hmpdacc.org/doc/fasta2apg.pl
Modified slightly to fix counter (see seqanswers thread)
And by myself, to use /bin/env to determine which perl to use and to handle scaffolds starting with Ns.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment