hyphaltip/barcode.pl

## barcode.pl
#!/bin/perl -w
#sort sequences into two files according to 5' barcode

use strict;
use warnings;
use Bio::SeqIO;


my $file = 'trimmed_seq.fa';
my $in = Bio::SeqIO->new(-format => 'Fasta',
                         -file => $file);


open (my $reportAG => '>AG_barcode_file.fa') || die $!;
open (my $reportCT => '>CT_barcode_file.fa') || die $!;

while ( my $seq =  $in->next_seq() )
{

        my $str = $seq->seq; # get the sequence as a string
                $str =~ s/\*//g; # remove all '*' and space from sequence


        my $adapter = substr($str,0,2);

        #check to see if it the right adapter.

        if ($adapter eq 'AG')
        {
                        my $id = $seq->id;
                        my $str2 = substr($str,2);
                                print $reportAG ">$id\n";
                                print $reportAG "$str2\n";
        }

                elsif ($adapter eq 'CT')
{
                        my $id = $seq->id;
                        my $str3 = substr($str,2);
                                print $reportCT ">$id\n";
                                print $reportCT "$str3\n";
        }


}

## mirna_blast.pl
#!/bin/perl -w
#blast sequences to soybean miRNAs and report normalized reads per million

use warnings;
use strict;
use Bio::SeqIO;

my $seq;
my $seqn;
my $id;
my $id_strt;
my $rptm;

open OUT, '>normalized_AG-CT.txt' or die "Cannot write to a file\n";
# what is the expected format of this file
my $file = 'AG-CT_collapse.fa';
my $file2 = 'gma_miRNA.fa';
my $in = Bio::SeqIO->new(-format => 'fasta',
                         -file => $file);
my %unique = ();
while ( my $seq_obj = $in->next_seq) {
 $seq = $seq_obj->seq;
 # shouldn't you be adding up the total number from the collapsed, by parsing the sequence ID here
 # I changed the code so it just adds one - counting for you.
 $unique{$seq}++;
}
my $in2 = Bio::SeqIO->new(-format => 'fasta',
                         -file => $file2);
print OUT join("\t","ID" , "Seq" , "Norm_exp"),"\n";
while ( my $seq_objct = $in2->next_seq) {

 $seqn= $seq_objct->seq;
 $id= $seq_objct->display_id;
 $id_strt= substr ($id, 0, 3);
 for my $name (sort keys %unique) {
  $seqn =~ tr/U/T/;
  if (($name eq $seqn) && ($id_strt eq "gma")) {
   $rptm = (($unique{$name})*1000000)/404954;# or 366254;
   print OUT $id, "\t", $seqn , "\t", $rptm, "\n";
  }
 }
}

## nt_length.pl
#!/bin/perl -w
#nucleotide length distribution

use warnings;
use strict;
use Bio::SeqIO;
use Statistics::Descriptive;

my$stat = Statistics::Descriptive::Full->new();
my (%distrib);
my @bins = qw/18 19 20 21 22 23 24 25 26 27 28/;

my $fastaFile = '.fa';
open (FASTA, "<$fastaFile");
$/ = ">";

my $junkFirstOne = <FASTA>;

while (<FASTA>) {
        chomp;
        my ($def,@seqlines) = split /\n/, $_;
        my $seq = join '', @seqlines;
        $stat->add_data(length($seq));
}


%distrib = $stat->frequency_distribution(\@bins);

print "Total reads:\t" . $stat->count() . "\n";
print "Total nt:\t" . $stat->sum() . "\n";
print "Mean length:\t" . $stat->mean() . "\n";
print "Median length:\t" . $stat->median() . "\n";
print "Mode length:\t" . $stat->mode() . "\n";
print "Max length:\t" . $stat->max() . "\n";
print "Min length:\t" . $stat->min() . "\n";
print "Length\t# Seqs\n";
foreach (sort {$a <=> $b} keys %distrib) {
        print "$_\t$distrib{$_}\n";

## trim.pl
#!/bin/perl -w
#trim adaptor, filter >18nt and remove N sequences

use strict;
use Bio::SeqIO;

my $seqio = Bio::SeqIO->new(-format=>'fastq',-file=>'flowcell_lane8.fastq');
my $adapter='CACTCGGGCACCAAGGACGTATGCCGTCTTCTGCTTG';

open OUT,'>trimmed_seq.fa' or die "Cannot write to a file\n";

my ($i,$j) = 0;
while (my $seq = $seqio->next_seq)
{
my $adapter_6nt = substr($adapter,0,6);
if ($seq->seq =~ "$adapter_6nt")
{
my $pos = index($seq->seq, $adapter_6nt, -1);
my $str = substr($seq->seq, 0, $pos);
if($str =~ "N"||length($str)<18) #remove sequence less than 18nt and seqences with N
{
$i++; #count sequences greater than 18nt
}
else
{
print OUT ">",$seq->id,"\n";
print OUT "$str\n";
}
$j++; #count total number of sequences
}

print "The total number of sequences in the file is $j.\nThe number of trimmed sequences greater than 18nt is $i.\n";
	#!/bin/perl -w
	#sort sequences into two files according to 5' barcode

	use strict;
	use warnings;
	use Bio::SeqIO;


	my $file = 'trimmed_seq.fa';
	my $in = Bio::SeqIO->new(-format => 'Fasta',
	-file => $file);



	open (my $reportAG => '>AG_barcode_file.fa') \|\| die $!;
	open (my $reportCT => '>CT_barcode_file.fa') \|\| die $!;

	while ( my $seq = $in->next_seq() )
	{

	my $str = $seq->seq; # get the sequence as a string
	$str =~ s/\//g; # remove all '' and space from sequence


	my $adapter = substr($str,0,2);

	#check to see if it the right adapter.

	if ($adapter eq 'AG')
	{
	my $id = $seq->id;
	my $str2 = substr($str,2);
	print $reportAG ">$id\n";
	print $reportAG "$str2\n";
	}

	elsif ($adapter eq 'CT')
	{
	my $id = $seq->id;
	my $str3 = substr($str,2);
	print $reportCT ">$id\n";
	print $reportCT "$str3\n";
	}


	}
	#!/bin/perl -w
	#blast sequences to soybean miRNAs and report normalized reads per million

	use warnings;
	use strict;
	use Bio::SeqIO;

	my $seq;
	my $seqn;
	my $id;
	my $id_strt;
	my $rptm;

	open OUT, '>normalized_AG-CT.txt' or die "Cannot write to a file\n";
	# what is the expected format of this file
	my $file = 'AG-CT_collapse.fa';
	my $file2 = 'gma_miRNA.fa';
	my $in = Bio::SeqIO->new(-format => 'fasta',
	-file => $file);
	my %unique = ();
	while ( my $seq_obj = $in->next_seq) {
	$seq = $seq_obj->seq;
	# shouldn't you be adding up the total number from the collapsed, by parsing the sequence ID here
	# I changed the code so it just adds one - counting for you.
	$unique{$seq}++;
	}
	my $in2 = Bio::SeqIO->new(-format => 'fasta',
	-file => $file2);
	print OUT join("\t","ID" , "Seq" , "Norm_exp"),"\n";
	while ( my $seq_objct = $in2->next_seq) {

	$seqn= $seq_objct->seq;
	$id= $seq_objct->display_id;
	$id_strt= substr ($id, 0, 3);
	for my $name (sort keys %unique) {
	$seqn =~ tr/U/T/;
	if (($name eq $seqn) && ($id_strt eq "gma")) {
	$rptm = (($unique{$name})*1000000)/404954;# or 366254;
	print OUT $id, "\t", $seqn , "\t", $rptm, "\n";
	}
	}
	}
	#!/bin/perl -w
	#nucleotide length distribution

	use warnings;
	use strict;
	use Bio::SeqIO;
	use Statistics::Descriptive;

	my$stat = Statistics::Descriptive::Full->new();
	my (%distrib);
	my @bins = qw/18 19 20 21 22 23 24 25 26 27 28/;

	my $fastaFile = '.fa';
	open (FASTA, "<$fastaFile");
	$/ = ">";

	my $junkFirstOne = <FASTA>;

	while (<FASTA>) {
	chomp;
	my ($def,@seqlines) = split /\n/, $_;
	my $seq = join '', @seqlines;
	$stat->add_data(length($seq));
	}


	%distrib = $stat->frequency_distribution(\@bins);

	print "Total reads:\t" . $stat->count() . "\n";
	print "Total nt:\t" . $stat->sum() . "\n";
	print "Mean length:\t" . $stat->mean() . "\n";
	print "Median length:\t" . $stat->median() . "\n";
	print "Mode length:\t" . $stat->mode() . "\n";
	print "Max length:\t" . $stat->max() . "\n";
	print "Min length:\t" . $stat->min() . "\n";
	print "Length\t# Seqs\n";
	foreach (sort {$a <=> $b} keys %distrib) {
	print "$_\t$distrib{$_}\n";
	#!/bin/perl -w
	#trim adaptor, filter >18nt and remove N sequences

	use strict;
	use Bio::SeqIO;

	my $seqio = Bio::SeqIO->new(-format=>'fastq',-file=>'flowcell_lane8.fastq');
	my $adapter='CACTCGGGCACCAAGGACGTATGCCGTCTTCTGCTTG';

	open OUT,'>trimmed_seq.fa' or die "Cannot write to a file\n";

	my ($i,$j) = 0;
	while (my $seq = $seqio->next_seq)
	{
	my $adapter_6nt = substr($adapter,0,6);
	if ($seq->seq =~ "$adapter_6nt")
	{
	my $pos = index($seq->seq, $adapter_6nt, -1);
	my $str = substr($seq->seq, 0, $pos);
	if($str =~ "N"\|\|length($str)<18) #remove sequence less than 18nt and seqences with N
	{
	$i++; #count sequences greater than 18nt
	}
	else
	{
	print OUT ">",$seq->id,"\n";
	print OUT "$str\n";
	}
	$j++; #count total number of sequences
	}

	print "The total number of sequences in the file is $j.\nThe number of trimmed sequences greater than 18nt is $i.\n";