Created
April 26, 2016 06:26
-
-
Save slavailn/379864243602a9864fba4dee58d8236f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
use strict; use warnings; | |
my $config = shift; # Specify configuration file | |
my $trimmed_reads; # Store file name with trimmed reads in fastq format | |
my @paths; # Store full path to each bowtie index | |
my @unaligned_files; # Store files with reads that could not be aligned by bowtie | |
my @aligned_files; # Store files with aligned sequences in fastq format, only needed to count reads | |
open ( my $config_in, "<", $config ) or die "Cannot open configuration file: $!\n"; | |
while( my $line = <$config_in> ) | |
{ | |
chomp $line; | |
if ( $line !~ /^#/ && $line =~ /TRIM/ ) | |
{ | |
my @trimmed = split( ' ', $line ); | |
$trimmed_reads = $trimmed[1]; | |
} | |
elsif ( $line !~ /^#/ && $line =~ /^[A-Z]/ ) | |
{ | |
my ( $index, $path ) = split( ' ', $line ); | |
my $unaligned = join( "_", $index, 'unaligned' ); | |
my $aligned = join( "_", $index, 'aligned' ); | |
push ( @paths, $path ); | |
push ( @unaligned_files, $unaligned ); | |
push ( @aligned_files, $aligned ); | |
} | |
} | |
# Create a new array storing fastq files with reads aligned by bowtie | |
# Basically its a modified @unaligned_files array | |
my @reads = @unaligned_files; | |
pop ( @reads ); | |
$reads[0] = 'GENOME_aligned'; | |
# Modify @unaligned_files, so the last one is called unclassified, and remove the genome_unaligned as a first one | |
$unaligned_files[-1] = 'unclassified'; | |
shift( @unaligned_files ); | |
# Remove first element 'GENOME_aligned' from @aligned_files, since we are doing genome alignment outside of the loop | |
shift( @aligned_files ); | |
# First align trimmed reads to the genome in order to filter out those that are no match | |
my $path_to_genome = shift( @paths ); | |
my $no_match = 'GENOME_unaligned'; | |
`bowtie -v 1 --best -p 4 --un $no_match --al GENOME_aligned $path_to_genome $trimmed_reads hit`; | |
my $array_of_bowtie_files_ref = &create_array_of_arrays( \@reads, \@paths, \@unaligned_files, \@aligned_files ); | |
foreach my $bowtie_files ( @{$array_of_bowtie_files_ref} ) | |
{ | |
&bowtie_align( @{$bowtie_files} ); | |
} | |
&count_aligned_reads( \@aligned_files, 'unclassified', $trimmed_reads, $no_match ); | |
sub create_array_of_arrays | |
# Creates array of arrays where each element is ( unaligned_file, path_to_bowtie_index, reads_to_align ) | |
# all of those will be fed into bowtie in a step wise fasion, we require to build library composition | |
{ | |
my $reads_ref = shift; | |
my $path_ref = shift; | |
my $unaligned_ref = shift; | |
my $aligned_ref = shift; | |
my @reads = @{$reads_ref}; | |
my @paths = @{$path_ref}; | |
my @unaligned_files = @{$unaligned_ref}; | |
my @aligned_files = @{$aligned_ref}; | |
my @array_of_bowtie_files; | |
for ( my $i = 0; $i < scalar( @reads ); $i++ ) | |
{ | |
$array_of_bowtie_files[$i] = [ $unaligned_files[$i], $paths[$i], $reads[$i], $aligned_files[$i] ]; | |
} | |
return \@array_of_bowtie_files; | |
} | |
sub bowtie_align | |
{ | |
my @bowtie_params = @_; | |
my $unaligned = $bowtie_params[0]; | |
my $path = $bowtie_params[1]; | |
my $read_file = $bowtie_params[2]; | |
my $aligned_file = $bowtie_params[3]; | |
`bowtie -v 1 --best -p 4 --al $aligned_file --un $unaligned $path $read_file hit`; | |
print "bowtie -v 1 --best -p 4 --al $aligned_file --un $unaligned $path $read_file hit\n"; | |
} | |
sub count_aligned_reads | |
{ | |
# Take in arguments | |
my $aligned_files_ref = shift; | |
my $unclassified_reads = shift; | |
my $trimmed_reads_file = shift; | |
my $no_match_file = shift; | |
# Output values | |
my $total_reads_count; | |
my $unclassified_count; | |
my $no_match_count; | |
# Deref @aligned_files | |
my @aligned_files = @{ $aligned_files_ref }; | |
# open file to write out the results | |
open( my $out_file, ">", 'align_stats.txt' ) or die "Cannot open file for writing: $!\n"; | |
# open fastq with total reads and count lines, divide number of lines by 4 to get number of sequences | |
{ | |
my $count = 0; | |
open( my $total, "<", $trimmed_reads_file ) or die "Cannot open small RNA fastq: $!\n"; | |
while( <$total> ) | |
{ | |
$count++; | |
} | |
$total_reads_count = $count/4; | |
} | |
print $out_file "total\t$total_reads_count\n"; | |
# Count reads in unclassified | |
{ | |
my $count = 0; | |
open( my $unclassified, "<", 'unclassified' ) or die "Cannot open file with unclassified sequences: $!\n"; | |
while( <$unclassified> ) | |
{ | |
$count++; | |
} | |
$unclassified_count = $count/4; | |
} | |
{ | |
my $count = 0; | |
open( my $no_match, "<", $no_match_file ) or die "Cannot open file with unclassified sequences: $!\n"; | |
while( <$no_match> ) | |
{ | |
$count++; | |
} | |
$no_match_count = $count/4; | |
} | |
# Count reads in the rest of the files | |
{ | |
foreach my $aligned_file ( @aligned_files ) | |
{ | |
my $count = 0; | |
open( my $aligned, "<", $aligned_file ) or die "Cannot open file with aligned reads: $!\n"; | |
while( <$aligned> ) | |
{ | |
$count++; | |
} | |
my ( $category, $rest ) = split( "_", $aligned_file ); | |
my $lowercase_category = lc $category; | |
$count /= 4; | |
print $out_file "$lowercase_category\t$count\n"; | |
} | |
} | |
print $out_file "unclassified\t$unclassified_count\n"; | |
print $out_file "no_match\t$no_match_count\n"; | |
} | |
__END__ | |
# Example of config file | |
# Config file for library_composition.pl script | |
# Specify full path to the genome, all of the datasets and trimmed reads file. | |
# Note! Order of the datasets is important, since the reads aligning to the | |
# dataset will be excluded from further alignments | |
TRIM trimmed_reads.fastq | |
GENOME /home/slava/GENOME/Homo_sapiens_UCSC_hg19/Homo_sapiens/UCSC/hg19/Sequence/BowtieIndex/genome | |
MIRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/mirna | |
PIRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/piRNA | |
SNORNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/snoRNA | |
SNRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/snRNA | |
RRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/rRNA | |
TRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/tRNA | |
CDS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/cds | |
PROMOTERS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/promoters | |
UTR5 /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/UTR5 | |
UTR3 /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/UTR3 | |
INTRONS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/introns | |
REPEATS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/repeats | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment