-
-
Save hyphaltip/4220482 to your computer and use it in GitHub Desktop.
Project Scripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/perl -w | |
#sort sequences into two files according to 5' barcode | |
use strict; | |
use warnings; | |
use Bio::SeqIO; | |
my $file = 'trimmed_seq.fa'; | |
my $in = Bio::SeqIO->new(-format => 'Fasta', | |
-file => $file); | |
open (my $reportAG => '>AG_barcode_file.fa') || die $!; | |
open (my $reportCT => '>CT_barcode_file.fa') || die $!; | |
while ( my $seq = $in->next_seq() ) | |
{ | |
my $str = $seq->seq; # get the sequence as a string | |
$str =~ s/\*//g; # remove all '*' and space from sequence | |
my $adapter = substr($str,0,2); | |
#check to see if it the right adapter. | |
if ($adapter eq 'AG') | |
{ | |
my $id = $seq->id; | |
my $str2 = substr($str,2); | |
print $reportAG ">$id\n"; | |
print $reportAG "$str2\n"; | |
} | |
elsif ($adapter eq 'CT') | |
{ | |
my $id = $seq->id; | |
my $str3 = substr($str,2); | |
print $reportCT ">$id\n"; | |
print $reportCT "$str3\n"; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/perl -w | |
#blast sequences to soybean miRNAs and report normalized reads per million | |
use warnings; | |
use strict; | |
use Bio::SeqIO; | |
my $seq; | |
my $seqn; | |
my $id; | |
my $id_strt; | |
my $rptm; | |
open OUT, '>normalized_AG-CT.txt' or die "Cannot write to a file\n"; | |
# what is the expected format of this file | |
my $file = 'AG-CT_collapse.fa'; | |
my $file2 = 'gma_miRNA.fa'; | |
my $in = Bio::SeqIO->new(-format => 'fasta', | |
-file => $file); | |
my %unique = (); | |
while ( my $seq_obj = $in->next_seq) { | |
$seq = $seq_obj->seq; | |
# shouldn't you be adding up the total number from the collapsed, by parsing the sequence ID here | |
# I changed the code so it just adds one - counting for you. | |
$unique{$seq}++; | |
} | |
my $in2 = Bio::SeqIO->new(-format => 'fasta', | |
-file => $file2); | |
print OUT join("\t","ID" , "Seq" , "Norm_exp"),"\n"; | |
while ( my $seq_objct = $in2->next_seq) { | |
$seqn= $seq_objct->seq; | |
$id= $seq_objct->display_id; | |
$id_strt= substr ($id, 0, 3); | |
for my $name (sort keys %unique) { | |
$seqn =~ tr/U/T/; | |
if (($name eq $seqn) && ($id_strt eq "gma")) { | |
$rptm = (($unique{$name})*1000000)/404954;# or 366254; | |
print OUT $id, "\t", $seqn , "\t", $rptm, "\n"; | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/perl -w | |
#nucleotide length distribution | |
use warnings; | |
use strict; | |
use Bio::SeqIO; | |
use Statistics::Descriptive; | |
my$stat = Statistics::Descriptive::Full->new(); | |
my (%distrib); | |
my @bins = qw/18 19 20 21 22 23 24 25 26 27 28/; | |
my $fastaFile = '.fa'; | |
open (FASTA, "<$fastaFile"); | |
$/ = ">"; | |
my $junkFirstOne = <FASTA>; | |
while (<FASTA>) { | |
chomp; | |
my ($def,@seqlines) = split /\n/, $_; | |
my $seq = join '', @seqlines; | |
$stat->add_data(length($seq)); | |
} | |
%distrib = $stat->frequency_distribution(\@bins); | |
print "Total reads:\t" . $stat->count() . "\n"; | |
print "Total nt:\t" . $stat->sum() . "\n"; | |
print "Mean length:\t" . $stat->mean() . "\n"; | |
print "Median length:\t" . $stat->median() . "\n"; | |
print "Mode length:\t" . $stat->mode() . "\n"; | |
print "Max length:\t" . $stat->max() . "\n"; | |
print "Min length:\t" . $stat->min() . "\n"; | |
print "Length\t# Seqs\n"; | |
foreach (sort {$a <=> $b} keys %distrib) { | |
print "$_\t$distrib{$_}\n"; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/perl -w | |
#trim adaptor, filter >18nt and remove N sequences | |
use strict; | |
use Bio::SeqIO; | |
my $seqio = Bio::SeqIO->new(-format=>'fastq',-file=>'flowcell_lane8.fastq'); | |
my $adapter='CACTCGGGCACCAAGGACGTATGCCGTCTTCTGCTTG'; | |
open OUT,'>trimmed_seq.fa' or die "Cannot write to a file\n"; | |
my ($i,$j) = 0; | |
while (my $seq = $seqio->next_seq) | |
{ | |
my $adapter_6nt = substr($adapter,0,6); | |
if ($seq->seq =~ "$adapter_6nt") | |
{ | |
my $pos = index($seq->seq, $adapter_6nt, -1); | |
my $str = substr($seq->seq, 0, $pos); | |
if($str =~ "N"||length($str)<18) #remove sequence less than 18nt and seqences with N | |
{ | |
$i++; #count sequences greater than 18nt | |
} | |
else | |
{ | |
print OUT ">",$seq->id,"\n"; | |
print OUT "$str\n"; | |
} | |
$j++; #count total number of sequences | |
} | |
print "The total number of sequences in the file is $j.\nThe number of trimmed sequences greater than 18nt is $i.\n"; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment