Skip to content

Instantly share code, notes, and snippets.

@ChemicalJames
Created December 5, 2012 05:46
Show Gist options
  • Save ChemicalJames/4212731 to your computer and use it in GitHub Desktop.
Save ChemicalJames/4212731 to your computer and use it in GitHub Desktop.
Project Scripts
#!/bin/perl -w
#sort sequences into two files according to 5' barcode
use strict;
use warnings;
use Bio::SeqIO;
my $file = 'trimmed_seq.fa';
my $in = Bio::SeqIO->new(-format => 'Fasta',
-file => $file);
open (my $reportAG => '>AG_barcode_file.fa') || die $!;
open (my $reportCT => '>CT_barcode_file.fa') || die $!;
while ( my $seq = $in->next_seq() )
{
my $str = $seq->seq; # get the sequence as a string
$str =~ s/\*//g; # remove all '*' and space from sequence
my $adapter = substr($str,0,2);
#check to see if it the right adapter.
if ($adapter eq 'AG')
{
my $id = $seq->id;
my $str2 = substr($str,2);
print $reportAG ">$id\n";
print $reportAG "$str2\n";
}
elsif ($adapter eq 'CT')
{
my $id = $seq->id;
my $str3 = substr($str,2);
print $reportCT ">$id\n";
print $reportCT "$str3\n";
}
}
#!/bin/perl -w
#match sequences to soybean miRNAs and report normalized reads per million
use warnings;
use strict;
use Bio::SeqIO;
my $seq;
my $seqn;
my $id;
my $id_strt;
my $rptm;
open OUT, '>normalized_AG-CT.txt' or die "Cannot write to a file\n";
# what is the expected format of this file
my $file = 'AG-CT_barcode_file.fa';
my $file2 = 'gma_miRNA.fa';
my $in = Bio::SeqIO->new(-format => 'fasta',
-file => $file);
my %unique = ();
while ( my $seq_obj = $in->next_seq) {
$seq = $seq_obj->seq;
$unique{$seq}++;
}
my $in2 = Bio::SeqIO->new(-format => 'fasta',
-file => $file2);
print OUT join("\t","ID" , "Seq" , "Norm_exp"),"\n";
while ( my $seq_objct = $in2->next_seq) {
$seqn= $seq_objct->seq;
$id= $seq_objct->display_id;
$id_strt= substr ($id, 0, 3);
for my $name (sort keys %unique) {
$seqn =~ tr/U/T/;
if (($name eq $seqn) && ($id_strt eq "gma")) {
$rptm = (($unique{$name})*1000000)/404954;# or 366254;
print OUT $id, "\t", $seqn , "\t", $rptm, "\n";
}
}
}
#!/bin/perl -w
#nucleotide length distribution
use warnings;
use strict;
use Bio::SeqIO;
use Statistics::Descriptive;
my$stat = Statistics::Descriptive::Full->new();
my (%distrib);
my @bins = qw/18 19 20 21 22 23 24 25 26 27 28/;
my $fastaFile = '.fa';
open (FASTA, "<$fastaFile");
$/ = ">";
my $junkFirstOne = <FASTA>;
while (<FASTA>) {
chomp;
my ($def,@seqlines) = split /\n/, $_;
my $seq = join '', @seqlines;
$stat->add_data(length($seq));
}
%distrib = $stat->frequency_distribution(\@bins);
print "Total reads:\t" . $stat->count() . "\n";
print "Total nt:\t" . $stat->sum() . "\n";
print "Mean length:\t" . $stat->mean() . "\n";
print "Median length:\t" . $stat->median() . "\n";
print "Mode length:\t" . $stat->mode() . "\n";
print "Max length:\t" . $stat->max() . "\n";
print "Min length:\t" . $stat->min() . "\n";
print "Length\t# Seqs\n";
foreach (sort {$a <=> $b} keys %distrib) {
print "$_\t$distrib{$_}\n";
#!/bin/perl -w
#trim adaptor, filter >18nt and remove N sequences
use strict;
use Bio::SeqIO;
my $seqio = Bio::SeqIO->new(-format=>'fastq',-file=>'flowcell_lane8.fastq');
my $adapter='CACTCGGGCACCAAGGACGTATGCCGTCTTCTGCTTG';
open OUT,'>trimmed_seq.fa' or die "Cannot write to a file\n";
my ($i,$j) = 0;
while (my $seq = $seqio->next_seq)
{
my $adapter_6nt = substr($adapter,0,6);
if ($seq->seq =~ "$adapter_6nt")
{
my $pos = index($seq->seq, $adapter_6nt, -1);
my $str = substr($seq->seq, 0, $pos);
if($str =~ "N"||length($str)<18) #remove sequence less than 18nt and seqences with N
{
$i++; #count sequences greater than 18nt
}
else
{
print OUT ">",$seq->id,"\n";
print OUT "$str\n";
}
$j++; #count total number of sequences
}
print "The total number of sequences in the file is $j.\nThe number of trimmed sequences greater than 18nt is $i.\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment