radaniba/extract-oligos.pl

## extract-oligos.pl
#!/usr/bin/perl -w

# oligos.pl
# Create and analyze an overlapping series of oligos
# WI Bioinformatics course - Feb 2002 - Lecture 5
# WI Bioinformatics course - Revised - Sep 2003
# Example of input taken as multiple arguments

# Check input and give info if arguments are missing
if (! $ARGV[3])
{
   print "
This program creates a series of oligos from 'sequence'
of length 'oligoLength' starting at startPosition nt and
ending at endPosition nt
USAGE: oligos.pl sequence oligoLength startPosition endPosition\n\n";

   exit(0);
}
else
{
   $seq = $ARGV[0];   # sequence name
   $mer = $ARGV[1];   # length of oligos to extract
   $start = $ARGV[2];   # nt number for beginning of first oligo
   $end = $ARGV[3];   # nt number for beginning of last oligo
}

# Get continuous sequence from sequence file
open (SEQ, $seq) || die "cannot open $seq for reading: $!";
@seq = <SEQ>;            # make array of lines
$defline = $seq[0];         # get defline

# Split definition line into fields separated by spaces
@defline_fields = split(/\s/, $defline);

$seq[0] = "";            # delete defline
$seq = join ("", @seq);         # join($glue, @list) to get seq without defline
$seq =~ s/\s*//g;         # delete spaces and end of lines
$seqLength = length ($seq);       # get length of sequence (optional)

# $defline_fields[0] is the first part of defline (up to first space)
print "Oligos ($mer mers) for $defline_fields[0] ($seqLength nt) and % GC content\n";

# Beware of OBOB ==> for ($i = $start; $i < $end; $i++) would be one nucleotide off
# This "off-by-one bug" is a common problem, so always check your code

# Adjust counting since computers start at 0 but people think of sequences starting at 1
for ($i = $start - 1; $i < $end - 1; $i++)
{
   # Extract substring of sequence to make oligo ==> $oligo = substr(seq, start, length);
   $oligo = substr($seq, $i, $mer);

   # Adjust to conventional way of counting (e.g., first nt = 1, not 0)
   $nt = $i + 1;

   # Count the number of Gs and Cs in the oligo by counting
   # the number of times G or C are "translated" into the same G or C within $oligo
   $numGC = $oligo =~ tr/GC/GC/;

   # Calculate per cent GC
   $pcGC = 100 * $numGC / $mer;

   # Round off percent GC to 1 decimal places (%0.2f ==> 2 decimal places)
   $pcGCrounded = sprintf("%0.1f", $pcGC);

   # Print results delimited by tabs (for easy importing into spreadsheets, etc)
   print "$nt\t$oligo\t$pcGCrounded\n";
}
print "\n";
	#!/usr/bin/perl -w

	# oligos.pl
	# Create and analyze an overlapping series of oligos
	# WI Bioinformatics course - Feb 2002 - Lecture 5
	# WI Bioinformatics course - Revised - Sep 2003
	# Example of input taken as multiple arguments

	# Check input and give info if arguments are missing
	if (! $ARGV[3])
	{
	print "
	This program creates a series of oligos from 'sequence'
	of length 'oligoLength' starting at startPosition nt and
	ending at endPosition nt
	USAGE: oligos.pl sequence oligoLength startPosition endPosition\n\n";

	exit(0);
	}
	else
	{
	$seq = $ARGV[0]; # sequence name
	$mer = $ARGV[1]; # length of oligos to extract
	$start = $ARGV[2]; # nt number for beginning of first oligo
	$end = $ARGV[3]; # nt number for beginning of last oligo
	}

	# Get continuous sequence from sequence file
	open (SEQ, $seq) \|\| die "cannot open $seq for reading: $!";
	@seq = <SEQ>; # make array of lines
	$defline = $seq[0]; # get defline

	# Split definition line into fields separated by spaces
	@defline_fields = split(/\s/, $defline);

	$seq[0] = ""; # delete defline
	$seq = join ("", @seq); # join($glue, @list) to get seq without defline
	$seq =~ s/\s*//g; # delete spaces and end of lines
	$seqLength = length ($seq); # get length of sequence (optional)

	# $defline_fields[0] is the first part of defline (up to first space)
	print "Oligos ($mer mers) for $defline_fields[0] ($seqLength nt) and % GC content\n";

	# Beware of OBOB ==> for ($i = $start; $i < $end; $i++) would be one nucleotide off
	# This "off-by-one bug" is a common problem, so always check your code

	# Adjust counting since computers start at 0 but people think of sequences starting at 1
	for ($i = $start - 1; $i < $end - 1; $i++)
	{
	# Extract substring of sequence to make oligo ==> $oligo = substr(seq, start, length);
	$oligo = substr($seq, $i, $mer);

	# Adjust to conventional way of counting (e.g., first nt = 1, not 0)
	$nt = $i + 1;

	# Count the number of Gs and Cs in the oligo by counting
	# the number of times G or C are "translated" into the same G or C within $oligo
	$numGC = $oligo =~ tr/GC/GC/;

	# Calculate per cent GC
	$pcGC = 100 * $numGC / $mer;

	# Round off percent GC to 1 decimal places (%0.2f ==> 2 decimal places)
	$pcGCrounded = sprintf("%0.1f", $pcGC);

	# Print results delimited by tabs (for easy importing into spreadsheets, etc)
	print "$nt\t$oligo\t$pcGCrounded\n";
	}
	print "\n";