jimhester/parseREBASE.pl

## parseREBASE.pl
my %sites;

while(<>){
  next if( 1 .. /Rich Roberts/);

  next unless /\S/;

  my(@names) = split /[\s\)\(]+/;
  my $site = IUB_to_regexp(uc($names[-1]));
  if(not exists $sites{$site} and not exists $sites{$site}){
    print join("\t",$names[0],$site)."\n";
    $sites{$site}++;
  }
}
# Translate IUB ambiguity codes to regular expressions # IUB_to_regexp
#
# A subroutine that, given a sequence with IUB ambiguity codes,
# outputs a translation with IUB codes changed to regular expressions
#
  # These are the IUB ambiguity codes
  # (Eur. J. Biochem. 150: 1-5, 1985):
  # R = G or A
  # Y = C or T
  # M = A or C
  # K = G or T
  # S = G or C
  # W = A or T
  # B = not A (C or G or T)
  # D = not C (A or G or T)
  # H = not G (A or C or T)
  # V = not T (A or C or G)
  # N = A or C or G or T

sub revcomIUB{
  my $seq = shift;
  $seq =~ tr/ACGTRYMKSWBDHVN/TGCAYRKMWSVHDBN/;
  reverse $seq;
}

sub IUB_to_regexp {

  my($iub) = @_;

  my $regular_expression = '';

  my %iub2character_class = (
    A => 'A', C => 'C', G => 'G', T => 'T', R => '[GA]', Y => '[CT]', M => '[AC]',
    K => '[GT]', S => '[GC]', W => '[AT]', B => '[CGT]', D => '[AGT]', H => '[ACT]',
    V => '[ACG]', N => '[ACGT]',
  );
  $iub =~ s/\^//g;

  for ( my $i = 0 ; $i < length($iub) ; ++$i ) {
    $regular_expression .= $iub2character_class{substr($iub, $i, 1)};
  }
  return $regular_expression;
}

## regex.pl
#!/usr/bin/perl

use strict;use warnings;
use Time::HiRes qw(gettimeofday tv_interval);

my $enzymeFile = shift;

my @enzymes;
open my $enzymeIn, $enzymeFile or die "$!:Could not open $enzymeFile\n";
while(<$enzymeIn>){
  my($name, $enzyme) = split;
  push @enzymes, [$name,$enzyme];
}
local $/ = ">";

my $first = <>;
while(my $record = <>){
  chomp $record;
  my $newline_loc = index($record,"\n");
  my $header = substr($record,0,$newline_loc);
  my $sequence = substr($record,$newline_loc+1);
  $sequence = uc $sequence;
  $sequence =~ tr/\n//d;
  my $time = [ gettimeofday ];
  for my $enzyme(@enzymes){
    my $count = 0;
    my $search = $enzyme->[1];
    while($sequence =~ /$search/g){
      $count++;
    }
  }
  printf "perl: %.4f secs per enzyme\n" ,tv_interval($time)/@enzymes;
  exit;
}

## regex.py
#!/usr/bin/env python
import os,sys,re
import time
def searchFasta(header,sequence):
  start = time.time()
  for enzyme in enzymes:
    count = 0
    count = len(enzyme[1].findall(sequence))
  end = time.time()
  return (end - start) / len(enzymes)

files = sys.argv
enzymeFile = files.pop(1)
enzymeIn = open(enzymeFile,'rU')
enzymes=[]
for line in enzymeIn:
  name,enzyme = line.split()
  enzymes.append((name,re.compile(enzyme)))

f = open(sys.argv[1],'rU')
header = f.readline()
header = header.rstrip(os.linesep)
sequence=''
for line in f:
  line = line.rstrip('\n')
  if(line[0] == '>'):
    header = header[1:]
    sequence = sequence.upper()
    searchTime = searchFasta(header,sequence)
    print "python: %.4f secs per enzyme" % (searchTime)
    sys.exit(0)
    header = line
    sequence = ''
  else:
    sequence += line

## regex.rb
#!/usr/bin/env ruby

enzymeFile = File.open(ARGV.shift)

enzymes = Array.new

enzymeFile.each do |line|
  name, enzyme = line.split
  enzymes << [ name, enzyme ]
end

$/ = ">"
ARGF.gets
while rec = ARGF.gets
  rec.chomp!
  nl = rec.index("\n")
  header = rec[0..nl-1]
  seq = rec[nl+1..-1]
  seq.gsub!(/\n/,'')
  seq.upcase!
  start = Time.new()
  for enzyme in enzymes
    count = 0
    seq.scan(/#{enzyme[1]}/){
      count+=1
    }
   end
  end_time = Time.new()
  printf "ruby:\t%.4f secs per enzyme\n",((end_time - start)/enzymes.length())
   exit
end
	my %sites;

	while(<>){
	next if( 1 .. /Rich Roberts/);

	next unless /\S/;

	my(@names) = split /[\s\)\(]+/;
	my $site = IUB_to_regexp(uc($names[-1]));
	if(not exists $sites{$site} and not exists $sites{$site}){
	print join("\t",$names[0],$site)."\n";
	$sites{$site}++;
	}
	}
	# Translate IUB ambiguity codes to regular expressions # IUB_to_regexp
	#
	# A subroutine that, given a sequence with IUB ambiguity codes,
	# outputs a translation with IUB codes changed to regular expressions
	#
	# These are the IUB ambiguity codes
	# (Eur. J. Biochem. 150: 1-5, 1985):
	# R = G or A
	# Y = C or T
	# M = A or C
	# K = G or T
	# S = G or C
	# W = A or T
	# B = not A (C or G or T)
	# D = not C (A or G or T)
	# H = not G (A or C or T)
	# V = not T (A or C or G)
	# N = A or C or G or T

	sub revcomIUB{
	my $seq = shift;
	$seq =~ tr/ACGTRYMKSWBDHVN/TGCAYRKMWSVHDBN/;
	reverse $seq;
	}

	sub IUB_to_regexp {

	my($iub) = @_;

	my $regular_expression = '';

	my %iub2character_class = (
	A => 'A', C => 'C', G => 'G', T => 'T', R => '[GA]', Y => '[CT]', M => '[AC]',
	K => '[GT]', S => '[GC]', W => '[AT]', B => '[CGT]', D => '[AGT]', H => '[ACT]',
	V => '[ACG]', N => '[ACGT]',
	);
	$iub =~ s/\^//g;

	for ( my $i = 0 ; $i < length($iub) ; ++$i ) {
	$regular_expression .= $iub2character_class{substr($iub, $i, 1)};
	}
	return $regular_expression;
	}
	#!/usr/bin/perl

	use strict;use warnings;
	use Time::HiRes qw(gettimeofday tv_interval);

	my $enzymeFile = shift;

	my @enzymes;
	open my $enzymeIn, $enzymeFile or die "$!:Could not open $enzymeFile\n";
	while(<$enzymeIn>){
	my($name, $enzyme) = split;
	push @enzymes, [$name,$enzyme];
	}
	local $/ = ">";

	my $first = <>;
	while(my $record = <>){
	chomp $record;
	my $newline_loc = index($record,"\n");
	my $header = substr($record,0,$newline_loc);
	my $sequence = substr($record,$newline_loc+1);
	$sequence = uc $sequence;
	$sequence =~ tr/\n//d;
	my $time = [ gettimeofday ];
	for my $enzyme(@enzymes){
	my $count = 0;
	my $search = $enzyme->[1];
	while($sequence =~ /$search/g){
	$count++;
	}
	}
	printf "perl: %.4f secs per enzyme\n" ,tv_interval($time)/@enzymes;
	exit;
	}
	#!/usr/bin/env python
	import os,sys,re
	import time
	def searchFasta(header,sequence):
	start = time.time()
	for enzyme in enzymes:
	count = 0
	count = len(enzyme[1].findall(sequence))
	end = time.time()
	return (end - start) / len(enzymes)

	files = sys.argv
	enzymeFile = files.pop(1)
	enzymeIn = open(enzymeFile,'rU')
	enzymes=[]
	for line in enzymeIn:
	name,enzyme = line.split()
	enzymes.append((name,re.compile(enzyme)))

	f = open(sys.argv[1],'rU')
	header = f.readline()
	header = header.rstrip(os.linesep)
	sequence=''
	for line in f:
	line = line.rstrip('\n')
	if(line[0] == '>'):
	header = header[1:]
	sequence = sequence.upper()
	searchTime = searchFasta(header,sequence)
	print "python: %.4f secs per enzyme" % (searchTime)
	sys.exit(0)
	header = line
	sequence = ''
	else:
	sequence += line
	#!/usr/bin/env ruby

	enzymeFile = File.open(ARGV.shift)

	enzymes = Array.new

	enzymeFile.each do \|line\|
	name, enzyme = line.split
	enzymes << [ name, enzyme ]
	end

	$/ = ">"
	ARGF.gets
	while rec = ARGF.gets
	rec.chomp!
	nl = rec.index("\n")
	header = rec[0..nl-1]
	seq = rec[nl+1..-1]
	seq.gsub!(/\n/,'')
	seq.upcase!
	start = Time.new()
	for enzyme in enzymes
	count = 0
	seq.scan(/#{enzyme[1]}/){
	count+=1
	}
	end
	end_time = Time.new()
	printf "ruby:\t%.4f secs per enzyme\n",((end_time - start)/enzymes.length())
	exit
	end