Skip to content

Instantly share code, notes, and snippets.

@slavailn
Created April 26, 2016 06:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save slavailn/379864243602a9864fba4dee58d8236f to your computer and use it in GitHub Desktop.
Save slavailn/379864243602a9864fba4dee58d8236f to your computer and use it in GitHub Desktop.
#! /usr/bin/perl
use strict; use warnings;
my $config = shift; # Specify configuration file
my $trimmed_reads; # Store file name with trimmed reads in fastq format
my @paths; # Store full path to each bowtie index
my @unaligned_files; # Store files with reads that could not be aligned by bowtie
my @aligned_files; # Store files with aligned sequences in fastq format, only needed to count reads
open ( my $config_in, "<", $config ) or die "Cannot open configuration file: $!\n";
while( my $line = <$config_in> )
{
chomp $line;
if ( $line !~ /^#/ && $line =~ /TRIM/ )
{
my @trimmed = split( ' ', $line );
$trimmed_reads = $trimmed[1];
}
elsif ( $line !~ /^#/ && $line =~ /^[A-Z]/ )
{
my ( $index, $path ) = split( ' ', $line );
my $unaligned = join( "_", $index, 'unaligned' );
my $aligned = join( "_", $index, 'aligned' );
push ( @paths, $path );
push ( @unaligned_files, $unaligned );
push ( @aligned_files, $aligned );
}
}
# Create a new array storing fastq files with reads aligned by bowtie
# Basically its a modified @unaligned_files array
my @reads = @unaligned_files;
pop ( @reads );
$reads[0] = 'GENOME_aligned';
# Modify @unaligned_files, so the last one is called unclassified, and remove the genome_unaligned as a first one
$unaligned_files[-1] = 'unclassified';
shift( @unaligned_files );
# Remove first element 'GENOME_aligned' from @aligned_files, since we are doing genome alignment outside of the loop
shift( @aligned_files );
# First align trimmed reads to the genome in order to filter out those that are no match
my $path_to_genome = shift( @paths );
my $no_match = 'GENOME_unaligned';
`bowtie -v 1 --best -p 4 --un $no_match --al GENOME_aligned $path_to_genome $trimmed_reads hit`;
my $array_of_bowtie_files_ref = &create_array_of_arrays( \@reads, \@paths, \@unaligned_files, \@aligned_files );
foreach my $bowtie_files ( @{$array_of_bowtie_files_ref} )
{
&bowtie_align( @{$bowtie_files} );
}
&count_aligned_reads( \@aligned_files, 'unclassified', $trimmed_reads, $no_match );
sub create_array_of_arrays
# Creates array of arrays where each element is ( unaligned_file, path_to_bowtie_index, reads_to_align )
# all of those will be fed into bowtie in a step wise fasion, we require to build library composition
{
my $reads_ref = shift;
my $path_ref = shift;
my $unaligned_ref = shift;
my $aligned_ref = shift;
my @reads = @{$reads_ref};
my @paths = @{$path_ref};
my @unaligned_files = @{$unaligned_ref};
my @aligned_files = @{$aligned_ref};
my @array_of_bowtie_files;
for ( my $i = 0; $i < scalar( @reads ); $i++ )
{
$array_of_bowtie_files[$i] = [ $unaligned_files[$i], $paths[$i], $reads[$i], $aligned_files[$i] ];
}
return \@array_of_bowtie_files;
}
sub bowtie_align
{
my @bowtie_params = @_;
my $unaligned = $bowtie_params[0];
my $path = $bowtie_params[1];
my $read_file = $bowtie_params[2];
my $aligned_file = $bowtie_params[3];
`bowtie -v 1 --best -p 4 --al $aligned_file --un $unaligned $path $read_file hit`;
print "bowtie -v 1 --best -p 4 --al $aligned_file --un $unaligned $path $read_file hit\n";
}
sub count_aligned_reads
{
# Take in arguments
my $aligned_files_ref = shift;
my $unclassified_reads = shift;
my $trimmed_reads_file = shift;
my $no_match_file = shift;
# Output values
my $total_reads_count;
my $unclassified_count;
my $no_match_count;
# Deref @aligned_files
my @aligned_files = @{ $aligned_files_ref };
# open file to write out the results
open( my $out_file, ">", 'align_stats.txt' ) or die "Cannot open file for writing: $!\n";
# open fastq with total reads and count lines, divide number of lines by 4 to get number of sequences
{
my $count = 0;
open( my $total, "<", $trimmed_reads_file ) or die "Cannot open small RNA fastq: $!\n";
while( <$total> )
{
$count++;
}
$total_reads_count = $count/4;
}
print $out_file "total\t$total_reads_count\n";
# Count reads in unclassified
{
my $count = 0;
open( my $unclassified, "<", 'unclassified' ) or die "Cannot open file with unclassified sequences: $!\n";
while( <$unclassified> )
{
$count++;
}
$unclassified_count = $count/4;
}
{
my $count = 0;
open( my $no_match, "<", $no_match_file ) or die "Cannot open file with unclassified sequences: $!\n";
while( <$no_match> )
{
$count++;
}
$no_match_count = $count/4;
}
# Count reads in the rest of the files
{
foreach my $aligned_file ( @aligned_files )
{
my $count = 0;
open( my $aligned, "<", $aligned_file ) or die "Cannot open file with aligned reads: $!\n";
while( <$aligned> )
{
$count++;
}
my ( $category, $rest ) = split( "_", $aligned_file );
my $lowercase_category = lc $category;
$count /= 4;
print $out_file "$lowercase_category\t$count\n";
}
}
print $out_file "unclassified\t$unclassified_count\n";
print $out_file "no_match\t$no_match_count\n";
}
__END__
# Example of config file
# Config file for library_composition.pl script
# Specify full path to the genome, all of the datasets and trimmed reads file.
# Note! Order of the datasets is important, since the reads aligning to the
# dataset will be excluded from further alignments
TRIM trimmed_reads.fastq
GENOME /home/slava/GENOME/Homo_sapiens_UCSC_hg19/Homo_sapiens/UCSC/hg19/Sequence/BowtieIndex/genome
MIRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/mirna
PIRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/piRNA
SNORNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/snoRNA
SNRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/snRNA
RRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/rRNA
TRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/tRNA
CDS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/cds
PROMOTERS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/promoters
UTR5 /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/UTR5
UTR3 /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/UTR3
INTRONS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/introns
REPEATS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/repeats
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment