slavailn/library_composition.pl

## library_composition.pl
#! /usr/bin/perl
use strict; use warnings;

my $config = shift; # Specify configuration file
my $trimmed_reads; # Store file name with trimmed reads in fastq format
my @paths; # Store full path to each bowtie index
my @unaligned_files; # Store files with reads that could not be aligned by bowtie
my @aligned_files; # Store files with aligned sequences in fastq format, only needed to count reads
open ( my $config_in, "<", $config ) or die "Cannot open configuration file: $!\n";

while( my $line = <$config_in> )
{
	chomp $line;
	if ( $line !~ /^#/ && $line =~ /TRIM/ )
	{
		my @trimmed = split( ' ', $line );
		$trimmed_reads = $trimmed[1];
	}
	elsif ( $line !~ /^#/ && $line =~ /^[A-Z]/ )
	{
		my ( $index, $path ) = split( ' ', $line );
		my $unaligned = join( "_", $index, 'unaligned' );
		my $aligned = join( "_", $index, 'aligned' );
		push ( @paths, $path );
		push ( @unaligned_files, $unaligned );
		push ( @aligned_files, $aligned );

	}

}

# Create a new array storing fastq files with reads aligned by bowtie
# Basically its a modified @unaligned_files array
my @reads = @unaligned_files;
pop ( @reads );
$reads[0] = 'GENOME_aligned';

# Modify @unaligned_files, so the last one is called unclassified, and remove the genome_unaligned as a first one
$unaligned_files[-1] = 'unclassified';
shift( @unaligned_files );

# Remove first element 'GENOME_aligned' from @aligned_files, since we are doing genome alignment outside of the loop
shift( @aligned_files );

# First align trimmed reads to the genome in order to filter out those that are no match
my $path_to_genome = shift( @paths );
my $no_match = 'GENOME_unaligned';
`bowtie -v 1 --best -p 4 --un $no_match --al GENOME_aligned $path_to_genome $trimmed_reads hit`;


my $array_of_bowtie_files_ref = &create_array_of_arrays( \@reads, \@paths, \@unaligned_files, \@aligned_files );
foreach my $bowtie_files ( @{$array_of_bowtie_files_ref} )
{
	&bowtie_align( @{$bowtie_files} );
}

&count_aligned_reads( \@aligned_files, 'unclassified', $trimmed_reads, $no_match );


sub create_array_of_arrays
# Creates array of arrays where each element is ( unaligned_file, path_to_bowtie_index, reads_to_align )
# all of those will be fed into bowtie in a step wise fasion, we require to build library composition
{
	my $reads_ref = shift;
	my $path_ref = shift;
	my $unaligned_ref = shift;
	my $aligned_ref = shift;

	my @reads = @{$reads_ref};
	my @paths = @{$path_ref};
	my @unaligned_files = @{$unaligned_ref};
	my @aligned_files = @{$aligned_ref};

	my @array_of_bowtie_files;

	for ( my $i = 0; $i < scalar( @reads ); $i++ )
	{
		$array_of_bowtie_files[$i] = [ $unaligned_files[$i], $paths[$i], $reads[$i], $aligned_files[$i] ];
	}
	return \@array_of_bowtie_files;
}


sub bowtie_align
{
	my @bowtie_params = @_;
	my $unaligned = $bowtie_params[0];
	my $path = $bowtie_params[1];
	my $read_file = $bowtie_params[2];
	my $aligned_file = $bowtie_params[3];

	`bowtie -v 1 --best -p 4 --al $aligned_file --un $unaligned $path $read_file hit`;
	print "bowtie -v 1 --best -p 4 --al $aligned_file --un $unaligned $path $read_file hit\n";
}

sub count_aligned_reads
{
	# Take in arguments
	my $aligned_files_ref = shift;
	my $unclassified_reads = shift;
	my $trimmed_reads_file = shift;
	my $no_match_file = shift;

	# Output values
	my $total_reads_count;
	my $unclassified_count;
	my $no_match_count;

	# Deref @aligned_files
	my @aligned_files = @{ $aligned_files_ref };

	# open file to write out the results
	open( my $out_file, ">", 'align_stats.txt' ) or die "Cannot open file for writing: $!\n";

	# open fastq with total reads and count lines, divide number of lines by 4 to get number of sequences
	{
		my $count = 0;
		open( my $total, "<", $trimmed_reads_file ) or die "Cannot open small RNA fastq: $!\n";
		while( <$total> )
		{
			$count++;
		}
		$total_reads_count = $count/4;


	}
	print $out_file "total\t$total_reads_count\n";

	# Count reads in unclassified
	{
		my $count = 0;
		open( my $unclassified, "<", 'unclassified' ) or die "Cannot open file with unclassified sequences: $!\n";
		while( <$unclassified> )
		{
			$count++;
		}
		$unclassified_count = $count/4;
	}

	{
		my $count = 0;
		open( my $no_match, "<", $no_match_file ) or die "Cannot open file with unclassified sequences: $!\n";
		while( <$no_match> )
		{
			$count++;
		}
		$no_match_count = $count/4;
	}

	# Count reads in the rest of the files
	{
		foreach my $aligned_file ( @aligned_files )
		{
			my $count = 0;
			open( my $aligned, "<", $aligned_file  ) or die "Cannot open file with aligned reads: $!\n";
			while( <$aligned> )
			{
				$count++;
			}
			my ( $category, $rest ) = split( "_", $aligned_file );
			my $lowercase_category = lc $category;
                        $count /= 4;
			print $out_file "$lowercase_category\t$count\n";

		}
	}


	print $out_file "unclassified\t$unclassified_count\n";
	print $out_file "no_match\t$no_match_count\n";
}

__END__
# Example of config file
# Config file for library_composition.pl script
# Specify full path to the genome, all of the datasets and trimmed reads file.
# Note! Order of the datasets is important, since the reads aligning to the
# dataset will be excluded from further alignments
TRIM trimmed_reads.fastq
GENOME /home/slava/GENOME/Homo_sapiens_UCSC_hg19/Homo_sapiens/UCSC/hg19/Sequence/BowtieIndex/genome
MIRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/mirna
PIRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/piRNA
SNORNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/snoRNA
SNRNA  /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/snRNA
RRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/rRNA
TRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/tRNA
CDS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/cds
PROMOTERS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/promoters
UTR5 /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/UTR5
UTR3 /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/UTR3
INTRONS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/introns
REPEATS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/repeats
	#! /usr/bin/perl
	use strict; use warnings;

	my $config = shift; # Specify configuration file
	my $trimmed_reads; # Store file name with trimmed reads in fastq format
	my @paths; # Store full path to each bowtie index
	my @unaligned_files; # Store files with reads that could not be aligned by bowtie
	my @aligned_files; # Store files with aligned sequences in fastq format, only needed to count reads
	open ( my $config_in, "<", $config ) or die "Cannot open configuration file: $!\n";

	while( my $line = <$config_in> )
	{
	chomp $line;
	if ( $line !~ /^#/ && $line =~ /TRIM/ )
	{
	my @trimmed = split( ' ', $line );
	$trimmed_reads = $trimmed[1];
	}
	elsif ( $line !~ /^#/ && $line =~ /^[A-Z]/ )
	{
	my ( $index, $path ) = split( ' ', $line );
	my $unaligned = join( "_", $index, 'unaligned' );
	my $aligned = join( "_", $index, 'aligned' );
	push ( @paths, $path );
	push ( @unaligned_files, $unaligned );
	push ( @aligned_files, $aligned );

	}

	}

	# Create a new array storing fastq files with reads aligned by bowtie
	# Basically its a modified @unaligned_files array
	my @reads = @unaligned_files;
	pop ( @reads );
	$reads[0] = 'GENOME_aligned';

	# Modify @unaligned_files, so the last one is called unclassified, and remove the genome_unaligned as a first one
	$unaligned_files[-1] = 'unclassified';
	shift( @unaligned_files );

	# Remove first element 'GENOME_aligned' from @aligned_files, since we are doing genome alignment outside of the loop
	shift( @aligned_files );

	# First align trimmed reads to the genome in order to filter out those that are no match
	my $path_to_genome = shift( @paths );
	my $no_match = 'GENOME_unaligned';
	`bowtie -v 1 --best -p 4 --un $no_match --al GENOME_aligned $path_to_genome $trimmed_reads hit`;


	my $array_of_bowtie_files_ref = &create_array_of_arrays( \@reads, \@paths, \@unaligned_files, \@aligned_files );
	foreach my $bowtie_files ( @{$array_of_bowtie_files_ref} )
	{
	&bowtie_align( @{$bowtie_files} );
	}

	&count_aligned_reads( \@aligned_files, 'unclassified', $trimmed_reads, $no_match );


	sub create_array_of_arrays
	# Creates array of arrays where each element is ( unaligned_file, path_to_bowtie_index, reads_to_align )
	# all of those will be fed into bowtie in a step wise fasion, we require to build library composition
	{
	my $reads_ref = shift;
	my $path_ref = shift;
	my $unaligned_ref = shift;
	my $aligned_ref = shift;

	my @reads = @{$reads_ref};
	my @paths = @{$path_ref};
	my @unaligned_files = @{$unaligned_ref};
	my @aligned_files = @{$aligned_ref};

	my @array_of_bowtie_files;

	for ( my $i = 0; $i < scalar( @reads ); $i++ )
	{
	$array_of_bowtie_files[$i] = [ $unaligned_files[$i], $paths[$i], $reads[$i], $aligned_files[$i] ];
	}
	return \@array_of_bowtie_files;
	}


	sub bowtie_align
	{
	my @bowtie_params = @_;
	my $unaligned = $bowtie_params[0];
	my $path = $bowtie_params[1];
	my $read_file = $bowtie_params[2];
	my $aligned_file = $bowtie_params[3];

	`bowtie -v 1 --best -p 4 --al $aligned_file --un $unaligned $path $read_file hit`;
	print "bowtie -v 1 --best -p 4 --al $aligned_file --un $unaligned $path $read_file hit\n";
	}

	sub count_aligned_reads
	{
	# Take in arguments
	my $aligned_files_ref = shift;
	my $unclassified_reads = shift;
	my $trimmed_reads_file = shift;
	my $no_match_file = shift;

	# Output values
	my $total_reads_count;
	my $unclassified_count;
	my $no_match_count;

	# Deref @aligned_files
	my @aligned_files = @{ $aligned_files_ref };

	# open file to write out the results
	open( my $out_file, ">", 'align_stats.txt' ) or die "Cannot open file for writing: $!\n";

	# open fastq with total reads and count lines, divide number of lines by 4 to get number of sequences
	{
	my $count = 0;
	open( my $total, "<", $trimmed_reads_file ) or die "Cannot open small RNA fastq: $!\n";
	while( <$total> )
	{
	$count++;
	}
	$total_reads_count = $count/4;


	}
	print $out_file "total\t$total_reads_count\n";

	# Count reads in unclassified
	{
	my $count = 0;
	open( my $unclassified, "<", 'unclassified' ) or die "Cannot open file with unclassified sequences: $!\n";
	while( <$unclassified> )
	{
	$count++;
	}
	$unclassified_count = $count/4;
	}

	{
	my $count = 0;
	open( my $no_match, "<", $no_match_file ) or die "Cannot open file with unclassified sequences: $!\n";
	while( <$no_match> )
	{
	$count++;
	}
	$no_match_count = $count/4;
	}

	# Count reads in the rest of the files
	{
	foreach my $aligned_file ( @aligned_files )
	{
	my $count = 0;
	open( my $aligned, "<", $aligned_file ) or die "Cannot open file with aligned reads: $!\n";
	while( <$aligned> )
	{
	$count++;
	}
	my ( $category, $rest ) = split( "_", $aligned_file );
	my $lowercase_category = lc $category;
	$count /= 4;
	print $out_file "$lowercase_category\t$count\n";

	}
	}



	print $out_file "unclassified\t$unclassified_count\n";
	print $out_file "no_match\t$no_match_count\n";
	}

	__END__
	# Example of config file
	# Config file for library_composition.pl script
	# Specify full path to the genome, all of the datasets and trimmed reads file.
	# Note! Order of the datasets is important, since the reads aligning to the
	# dataset will be excluded from further alignments
	TRIM trimmed_reads.fastq
	GENOME /home/slava/GENOME/Homo_sapiens_UCSC_hg19/Homo_sapiens/UCSC/hg19/Sequence/BowtieIndex/genome
	MIRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/mirna
	PIRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/piRNA
	SNORNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/snoRNA
	SNRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/snRNA
	RRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/rRNA
	TRNA /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/tRNA
	CDS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/cds
	PROMOTERS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/promoters
	UTR5 /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/UTR5
	UTR3 /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/UTR3
	INTRONS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/introns
	REPEATS /home/slava/Projects/Project_Sambu_smallRNA/other_smallRNA_analysis/datasets/repeats