Skip to content

Instantly share code, notes, and snippets.

@mfcovington
Created June 14, 2012 06:09
Show Gist options
  • Save mfcovington/2928252 to your computer and use it in GitHub Desktop.
Save mfcovington/2928252 to your computer and use it in GitHub Desktop.
fasta_flattener
#!/usr/bin/env perl
# fasta_flattener.pl
# Mike Covington
# created: 2012-06-14
#
# Description: flatten a FASTA file by joining each record's wrapped
#              sequence lines into a single line (one header line plus
#              one sequence line per record)
#
use strict;
use warnings;
use Bio::SeqIO;
use v5.10; #or later... or change 'say' to 'print' X_x
# Read every record from the input FASTA file and re-emit it in a
# "consolidated" form: a ">ID" header line followed by the whole sequence
# on one line. Each record is written both to STDOUT and to the output file.
# Only the display ID is written after '>'; nothing else from the record
# is printed.
my $fasta_in       = "input.fa";
my $fasta_out_path = "output.fa";

# Stop immediately if the output file cannot be created; otherwise every
# later 'say' to the handle would fail and the file would be silently lost.
open my $fasta_out, ">", $fasta_out_path
    or die "Cannot open '$fasta_out_path' for writing: $!";

my $seqio_in = Bio::SeqIO->new(
    -file   => $fasta_in,
    -format => 'Fasta',
);

# Records are streamed one at a time; nothing is accumulated, so memory use
# does not grow with the number of records in the input.
while ( my $seq_obj = $seqio_in->next_seq() ) {
    my $seq_id = $seq_obj->display_id();    # this is the sequence ID
    my $seq    = $seq_obj->seq();           # this is the actual sequence

    # to print them to your screen in a "consolidated" FASTA format:
    say ">$seq_id";
    say $seq;

    # to save to a file in a "consolidated" FASTA format:
    say {$fasta_out} ">$seq_id";
    say {$fasta_out} $seq;
}

# Buffered write errors (e.g. a full disk) only surface at close time,
# so the result of close must be checked on a write handle.
close $fasta_out
    or die "Cannot close '$fasta_out_path': $!";

exit;
>AT4G00050
ATGAGCCAATGTGTTCCAAACTGTCACATCGATGATACTCCGGCAGCAGC
CACCACCACCGTCCGCTCCACCACAGCCGCAGACATCCCCATATTAGACT
ACGAGGTAGCCGAGCTGACGTGGGAGAACGGGCAACTAGGCTTGCACGGC
TTAGGTCCACCGCGAGTGACGGCTTCGTCGACCAAGTACTCCACAGGCGC
CGGTGGAACGTTGGAGTCGATAGTGGACCAAGCTACTCGCCTCCCTAACC
CTAAGCCCACGGATGAGCTCGTCCCGTGGTTCCATCATCGCTCCTCCAGG
GCCGCGATGGCAATGGACGCGCTTGTCCCTTGCTCCAACCTAGTACACGA
GCAGCAGAGCAAGCCTGGTGGCGTTGGCTCCACCCGGGTGGGGTCATGTA
GCGATGGTCGTACCATGGGCGGTGGAAAACGAGCAAGAGTGGCACCGGAG
TGGAGCGGCGGCGGGAGTCAGCGGCTGACCATGGACACTTACGACGTAGG
TTTCACCTCAACATCAATGGGCTCGCACGATAACACAATCGACGATCATG
ACTCCGTCTGCCACAGCCGCCCACAGATGGAGGACGAAGAAGAGAAGAAA
GCCGGAGGAAAATCATCAGTTTCAACCAAGAGAAGCAGAGCTGCTGCTAT
TCATAACCAATCCGAACGTAAGAGGAGAGATAAAATCAATCAAAGGATGA
AGACTTTGCAAAAACTGGTTCCCAATTCCAGCAAGACGGATAAAGCATCT
ATGTTGGATGAAGTGATAGAGTATTTGAAGCAACTTCAAGCACAAGTGAG
CATGATGAGCAGAATGAATATGCCTTCTATGATGCTTCCTATGGCCATGC
AGCAACAACAACAACTACAAATGTCTCTCATGTCCAATCCCATGGGTTTA
GGGATGGGCATGGGGATGCCCGGTCTCGGTCTCCTCGACCTTAATTCTAT
GAACCGAGCTGCTGCAAGCGCTCCTAATATCCATGCCAACATGATGCCAA
ACCCATTTTTGCCCATGAATTGTCCATCGTGGGATGCTTCTTCCAATGAC
TCTCGATTTCAGTCTCCTCTCATCCCCGATCCTATGTCTGCCTTTCTTGC
ATGCTCTACTCAGCCAACGACGATGGAAGCGTATAGCAGGATGGCTACAT
TATATCAGCAAATGCAACAACAACTTCCTCCTCCTTCGAATCCAAAATGA
>AT2G25930
ATGAAGAGAGGGAAAGATGAGGAGAAGATATTGGAACCTATGTTTCCTCG
GCTTCATGTGAATGATGCAGATAAAGGAGGGCCTAGAGCTCCTCCTAGAA
ACAAGATGGCTCTTTATGAGCAGCTTAGTATTCCTTCTCAGAGGTTTGGT
GATCATGGAACGATGAATTCTCGTAGTAACAACACAAGCACTTTGGTTCA
TCCTGGACCATCTAGTCAGCCTTGTGGTGTGGAAAGAAACTTATCTGTCC
AGCATCTTGATTCTTCAGCCGCAAACCAAGCAACTGAGAAGTTTGTCTCC
CAAATGTCCTTCATGGAAAATGTGAGATCTTCGGCACAGCATGATCAGAG
GAAAATGGTGAGAGAGGAAGAAGATTTTGCAGTTCCAGTATATATTAACT
CAAGAAGATCTCAGTCTCATGGCAGAACCAAGAGTGGTATTGAGAAGGAA
AAACACACCCCAATGGTGGCACCTAGCTCTCATCACTCCATTCGATTTCA
AGAAGTGAATCAGACAGGCTCAAAGCAAAACGTATGTTTGGCTACTTGTT
CAAAACCTGAAGTTAGGGATCAGGTCAAGGCGAATGCAAGGTCAGGTGGC
TTTGTAATCTCTTTAGATGTATCAGTCACAGAGGAGATTGATCTCGAAAA
ATCAGCATCAAGTCATGATAGAGTAAATGATTATAATGCTTCCTTGAGAC
AAGAGTCTAGAAATCGGTTATACCGAGATGGTGGCAAAACTCGTCTGAAG
GACACTGATAATGGAGCTGAATCTCACTTGGCAACGGAAAATCATTCACA
AGAGGGTCATGGCAGTCCTGAAGACATTGATAATGATCGTGAATACAGCA
AAAGCAGAGCATGCGCCTCTCTGCAGCAGATAAATGAAGAGGCAAGTGAT
GACGTTTCTGATGATTCGATGGTGGATTCTATATCCAGCATAGATGTCTC
TCCCGATGATGTTGTGGGTATATTAGGTCAAAAACGTTTCTGGAGAGCAA
GGAAAGCCATTGCCAATCAACAAAGAGTATTTGCTGTTCAACTATTTGAG
TTGCACAGACTGATTAAGGTTCAAAAACTTATTGCTGCATCACCGGATCT
CTTGCTCGATGAGATCAGTTTTCTTGGAAAAGTTTCTGCTAAAAGCTATC
CAGTGAAGAAGCTCCTTCCATCAGAATTTCTGGTAAAGCCTCCTCTACCA
CATGTTGTCGTCAAACAAAGGGGTGACTCGGAGAAGACTGACCAACATAA
AATGGAAAGCTCAGCTGAGAACGTAGTTGGGAGGTTGTCAAATCAAGGTC
ATCATCAACAATCCAACTACATGCCTTTTGCAAACAACCCACCGGCTTCA
CCGGCTCCAAATGGATATTGCTTTCCTCCTCAGCCTCCTCCTTCAGGAAA
TCATCAGCAATGGTTGATCCCTGTAATGTCTCCCTCGGAAGGACTGATAT
ACAAGCCTCACCCAGGTATGGCACACACGGGGCATTATGGAGGATATTAT
GGTCATTATATGCCTACACCAATGGTAATGCCTCAATATCACCCCGGCAT
GGGATTCCCACCTCCTGGTAATGGCTACTTCCCTCCATATGGAATGATGC
CCACCATAATGAACCCATATTGTTCAAGCCAACAACAACAACAACAACAA
CCCAATGAGCAAATGAACCAGTTTGGACATCCTGGAAATCTTCAGAACAC
CCAACAACAACAACAGAGATCTGATAATGAACCTGCTCCACAGCAACAGC
AACAGCCAACAAAGTCTTATCCGCGAGCAAGAAAGAGCAGGCAAGGGAGC
ACAGGAAGCAGTCCAAGTGGGCCACAGGGAATCTCTGGTAGCAAGTCCTT
TCGGCCATTCGCAGCCGTTGATGAGGACAGCAACATCAACAATGCACCTG
AGCAAACGATGACAACAACCACAACGACGACAAGAACAACTGTTACTCAG
ACAACAAGAGATGGGGGAGGAGTGACGAGAGTGATAAAGGTGGTACCTCA
CAACGCAAAGCTCGCGAGTGAGAATGCTGCCAGAATTTTCCAGTCAATAC
AAGAAGAACGTAAACGCTATGACTCCTCTAAGCCTTAA
>AT4G00050
ATGAGCCAATGTGTTCCAAACTGTCACATCGATGATACTCCGGCAGCAGCCACCACCACCGTCCGCTCCACCACAGCCGCAGACATCCCCATATTAGACTACGAGGTAGCCGAGCTGACGTGGGAGAACGGGCAACTAGGCTTGCACGGCTTAGGTCCACCGCGAGTGACGGCTTCGTCGACCAAGTACTCCACAGGCGCCGGTGGAACGTTGGAGTCGATAGTGGACCAAGCTACTCGCCTCCCTAACCCTAAGCCCACGGATGAGCTCGTCCCGTGGTTCCATCATCGCTCCTCCAGGGCCGCGATGGCAATGGACGCGCTTGTCCCTTGCTCCAACCTAGTACACGAGCAGCAGAGCAAGCCTGGTGGCGTTGGCTCCACCCGGGTGGGGTCATGTAGCGATGGTCGTACCATGGGCGGTGGAAAACGAGCAAGAGTGGCACCGGAGTGGAGCGGCGGCGGGAGTCAGCGGCTGACCATGGACACTTACGACGTAGGTTTCACCTCAACATCAATGGGCTCGCACGATAACACAATCGACGATCATGACTCCGTCTGCCACAGCCGCCCACAGATGGAGGACGAAGAAGAGAAGAAAGCCGGAGGAAAATCATCAGTTTCAACCAAGAGAAGCAGAGCTGCTGCTATTCATAACCAATCCGAACGTAAGAGGAGAGATAAAATCAATCAAAGGATGAAGACTTTGCAAAAACTGGTTCCCAATTCCAGCAAGACGGATAAAGCATCTATGTTGGATGAAGTGATAGAGTATTTGAAGCAACTTCAAGCACAAGTGAGCATGATGAGCAGAATGAATATGCCTTCTATGATGCTTCCTATGGCCATGCAGCAACAACAACAACTACAAATGTCTCTCATGTCCAATCCCATGGGTTTAGGGATGGGCATGGGGATGCCCGGTCTCGGTCTCCTCGACCTTAATTCTATGAACCGAGCTGCTGCAAGCGCTCCTAATATCCATGCCAACATGATGCCAAACCCATTTTTGCCCATGAATTGTCCATCGTGGGATGCTTCTTCCAATGACTCTCGATTTCAGTCTCCTCTCATCCCCGATCCTATGTCTGCCTTTCTTGCATGCTCTACTCAGCCAACGACGATGGAAGCGTATAGCAGGATGGCTACATTATATCAGCAAATGCAACAACAACTTCCTCCTCCTTCGAATCCAAAATGA
>AT2G25930
ATGAAGAGAGGGAAAGATGAGGAGAAGATATTGGAACCTATGTTTCCTCGGCTTCATGTGAATGATGCAGATAAAGGAGGGCCTAGAGCTCCTCCTAGAAACAAGATGGCTCTTTATGAGCAGCTTAGTATTCCTTCTCAGAGGTTTGGTGATCATGGAACGATGAATTCTCGTAGTAACAACACAAGCACTTTGGTTCATCCTGGACCATCTAGTCAGCCTTGTGGTGTGGAAAGAAACTTATCTGTCCAGCATCTTGATTCTTCAGCCGCAAACCAAGCAACTGAGAAGTTTGTCTCCCAAATGTCCTTCATGGAAAATGTGAGATCTTCGGCACAGCATGATCAGAGGAAAATGGTGAGAGAGGAAGAAGATTTTGCAGTTCCAGTATATATTAACTCAAGAAGATCTCAGTCTCATGGCAGAACCAAGAGTGGTATTGAGAAGGAAAAACACACCCCAATGGTGGCACCTAGCTCTCATCACTCCATTCGATTTCAAGAAGTGAATCAGACAGGCTCAAAGCAAAACGTATGTTTGGCTACTTGTTCAAAACCTGAAGTTAGGGATCAGGTCAAGGCGAATGCAAGGTCAGGTGGCTTTGTAATCTCTTTAGATGTATCAGTCACAGAGGAGATTGATCTCGAAAAATCAGCATCAAGTCATGATAGAGTAAATGATTATAATGCTTCCTTGAGACAAGAGTCTAGAAATCGGTTATACCGAGATGGTGGCAAAACTCGTCTGAAGGACACTGATAATGGAGCTGAATCTCACTTGGCAACGGAAAATCATTCACAAGAGGGTCATGGCAGTCCTGAAGACATTGATAATGATCGTGAATACAGCAAAAGCAGAGCATGCGCCTCTCTGCAGCAGATAAATGAAGAGGCAAGTGATGACGTTTCTGATGATTCGATGGTGGATTCTATATCCAGCATAGATGTCTCTCCCGATGATGTTGTGGGTATATTAGGTCAAAAACGTTTCTGGAGAGCAAGGAAAGCCATTGCCAATCAACAAAGAGTATTTGCTGTTCAACTATTTGAGTTGCACAGACTGATTAAGGTTCAAAAACTTATTGCTGCATCACCGGATCTCTTGCTCGATGAGATCAGTTTTCTTGGAAAAGTTTCTGCTAAAAGCTATCCAGTGAAGAAGCTCCTTCCATCAGAATTTCTGGTAAAGCCTCCTCTACCACATGTTGTCGTCAAACAAAGGGGTGACTCGGAGAAGACTGACCAACATAAAATGGAAAGCTCAGCTGAGAACGTAGTTGGGAGGTTGTCAAATCAAGGTCATCATCAACAATCCAACTACATGCCTTTTGCAAACAACCCACCGGCTTCACCGGCTCCAAATGGATATTGCTTTCCTCCTCAGCCTCCTCCTTCAGGAAATCATCAGCAATGGTTGATCCCTGTAATGTCTCCCTCGGAAGGACTGATATACAAGCCTCACCCAGGTATGGCACACACGGGGCATTATGGAGGATATTATGGTCATTATATGCCTACACCAATGGTAATGCCTCAATATCACCCCGGCATGGGATTCCCACCTCCTGGTAATGGCTACTTCCCTCCATATGGAATGATGCCCACCATAATGAACCCATATTGTTCAAGCCAACAACAACAACAACAACAACCCAATGAGCAAATGAACCAGTTTGGACATCCTGGAAATCTTCAGAACACCCAACAACAACAACAGAGATCTGATAATGAACCTGCTCCACAGCAACAGCAACAGCCAACAAAGTCTTATCCGCGAGCAAGAAAGAGCAGGCAAGGGAGCACAGGAAGCAGTCCAAGTGGGCCACAGGGAATCTCTGGTAGCAAGTCCTTTCGGCCATTCGCAGCCGTTGATGAGGACAGCAACATCAACAATGCACCTGAGCAAACGATGACAACAACCACAACGACGACAAGAACAACTGTTACTCAGACAACAAGAGATGGGGGAGGAGTGACGAGAGTGATAAAGGTGGTACCTCA
CAACGCAAAGCTCGCGAGTGAGAATGCTGCCAGAATTTTCCAGTCAATACAAGAAGAACGTAAACGCTATGACTCCTCTAAGCCTTAA
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment