Created
November 29, 2012 16:48
-
-
Save radaniba/4170292 to your computer and use it in GitHub Desktop.
extract oligos from a sequence and analyze them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
# oligos.pl | |
# Create and analyze an overlapping series of oligos | |
# WI Bioinformatics course - Feb 2002 - Lecture 5 | |
# WI Bioinformatics course - Revised - Sep 2003 | |
# Example of input taken as multiple arguments | |
# Check input and give info if arguments are missing | |
if (! $ARGV[3]) | |
{ | |
print " | |
This program creates a series of oligos from 'sequence' | |
of length 'oligoLength' starting at startPosition nt and | |
ending at endPosition nt | |
USAGE: oligos.pl sequence oligoLength startPosition endPosition\n\n"; | |
exit(0); | |
} | |
else | |
{ | |
$seq = $ARGV[0]; # sequence name | |
$mer = $ARGV[1]; # length of oligos to extract | |
$start = $ARGV[2]; # nt number for beginning of first oligo | |
$end = $ARGV[3]; # nt number for beginning of last oligo | |
} | |
# Get continuous sequence from sequence file | |
open (SEQ, $seq) || die "cannot open $seq for reading: $!"; | |
@seq = <SEQ>; # make array of lines | |
$defline = $seq[0]; # get defline | |
# Split definition line into fields separated by spaces | |
@defline_fields = split(/\s/, $defline); | |
$seq[0] = ""; # delete defline | |
$seq = join ("", @seq); # join($glue, @list) to get seq without defline | |
$seq =~ s/\s*//g; # delete spaces and end of lines | |
$seqLength = length ($seq); # get length of sequence (optional) | |
# $defline_fields[0] is the first part of defline (up to first space) | |
print "Oligos ($mer mers) for $defline_fields[0] ($seqLength nt) and % GC content\n"; | |
# Beware of OBOB ==> for ($i = $start; $i < $end; $i++) would be one nucleotide off | |
# This "off-by-one bug" is a common problem, so always check your code | |
# Adjust counting since computers start at 0 but people think of sequences starting at 1 | |
for ($i = $start - 1; $i < $end - 1; $i++) | |
{ | |
# Extract substring of sequence to make oligo ==> $oligo = substr(seq, start, length); | |
$oligo = substr($seq, $i, $mer); | |
# Adjust to conventional way of counting (e.g., first nt = 1, not 0) | |
$nt = $i + 1; | |
# Count the number of Gs and Cs in the oligo by counting | |
# the number of times G or C are "translated" into the same G or C within $oligo | |
$numGC = $oligo =~ tr/GC/GC/; | |
# Calculate per cent GC | |
$pcGC = 100 * $numGC / $mer; | |
# Round off percent GC to 1 decimal places (%0.2f ==> 2 decimal places) | |
$pcGCrounded = sprintf("%0.1f", $pcGC); | |
# Print results delimited by tabs (for easy importing into spreadsheets, etc) | |
print "$nt\t$oligo\t$pcGCrounded\n"; | |
} | |
print "\n"; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment