epaule/acediff.pl

## acediff.pl
#!/software/bin/perl -w
#
# acediff.pl : an improved acediff
#
# gw3
#
# Usage : acediff.pl [-options]
#
# Last edited by: $Author: gw3 $
# Last edited on: $Date: 2011-04-26 15:49:01 $

use strict;
use Getopt::Long;

my ($test, $debug, $store, $file1, $file2, $output);

GetOptions (
	    "test"     => \$test,
	    "debug:s"  => \$debug,
	    "reference:s"    => \$file1,
	    "new:s"    => \$file2,
	    "output:s" => \$output,
	    )
		    or die("invalid commandline option\n");
	    ;

# use the time and the process ID to make a unique file extension
my $time = time();
my $pid = "$$";
my $tmpdir = $ENV{TMPDIR} || '/tmp';
my $outfile1 = "$tmpdir/acediff1.$pid.$time.new";
my $outfile2 = "$tmpdir/acediff2.$pid.$time.new";
my $tmp = "$tmpdir/acediff.$pid.$time.tmp";


# Change input separator to paragraph mode, but store old mode in $oldlinesep
my $oldlinesep = $/;
$/ = "";

open (FILE, "<$file1") or die("cant open $file1 : $!\n");
open (PRE, ">$tmp") or die("cant open $tmp : $!\n");
while (my $record = <FILE>) {
  while ($record =~ /^\s*\/\//) {$record =~ s/^\s*\/\/.*?\n//} # strip out any comments at the start of the record
  $record =~ s/\t/ /g;
  $record =~ s/\n/\t/g;
  print PRE $record, "\n";
}
close (PRE);
close (FILE);

# sort the file - sometimes the sorted file doesn't appear - this is very odd - try a few times to make it
my $tries = 5;
while ($tries-- && ! -e "$outfile1") {
  system("sort -S 4G $tmp -o $outfile1");
  system('sleep 5'); # wait a few seconds for NFS to realise that there really is a file there
}

open (FILE, "<$file2") or die("cant open $file2 : $!\n");
open (PRE, ">$tmp") or die("cant open $tmp : $!\n");
while (my $record = <FILE>) {
  while ($record =~ /^\s*\/\//) {$record =~ s/^\s*\/\/.*?\n//} # strip out any comments at the start of the record
  $record =~ s/\t/ /g;
  $record =~ s/\n/\t/g;
  print PRE $record, "\n";
}
close (PRE);
close (FILE);

# sort the file - sometimes the sorted file doesn't appear - this is very odd - try a few times to make it
$tries = 5;
while ($tries-- && ! -e "$outfile2") {
  system("sort -S 4G $tmp -o $outfile2");
  system('sleep 5'); # wait a few seconds for NFS to realise that there really is a file there
}

# reset input line separator
$/= $oldlinesep;

# get diff output - use 'system' rather than $wormbase->run_command
# because diff returns '1' if there is a difference, 2 means error
my $status = system("/usr/bin/diff $outfile1 $outfile2 > $tmp");
if ( ( $status >> 8 ) == 2 ) {
  die("diff exited with an error\n");
}

# now read in $tmp
# when there is an object that is new (no previous instance): output object
# when there is an object that is deleted (no subsequent instance): output -D object
# when there is an object that is changed: read both instances in and out deleted tags as -D tags and add new tags

# '>' is a new object
# '<' is an old object

my %new;
my %reference;

open (ACE, "<$tmp") or die("cant read from $tmp : $!\n");
while (my $line = <ACE>) {

  # put the data into the hashes
  my ($origin, $first, $rest) = ($line =~ /^([\>\<])\s+(.+?)\t(.+)/);
  if (!defined $origin) {next}
  if ($origin eq '>') {
    # put tags in %new, keyed by class and ID line
    $new{$first} = $rest;
  } elsif ($origin eq '<') {
    # put tags in %reference, keyed by class and ID line
    $reference{$first} = $rest;

  } else { # number or --- lines
    # do nothing
  }

}
close ACE;


# now go through %new and check in %reference to see if this is new or changed
# delete the %reference ones we find that are in %new

open (OUT, ">$output") || die("cantopen $output : $!\n");
my $started;
foreach my $newkey (keys %new) {
  if (exists $reference{$newkey}) {
    # changed data - store the tags in hases for comparison
    my %n;
    my %r;
    my @new = split /\t/, $new{$newkey};
    my @ref = split /\t/, $reference{$newkey};
    foreach my $tag (@new) {$n{$tag}=1}
    foreach my $tag (@ref) {$r{$tag}=1}

    # removed tags
    $started = 0;
    foreach my $tag (keys %r) {
      if (! exists $n{$tag}) {
	if (!$started) {
	  print OUT "\n// Changed deleted data\n";
	  print OUT "\n$newkey\n";
	  $started = 1;
	}
	print OUT "-D $tag\n";
      }
    }

    # added tags
    $started = 0;
    foreach my $tag (keys %n) {
      if (! exists $r{$tag}) {
	if (!$started) {
	  print OUT "\n// Changed added data\n";
	  print OUT "\n$newkey\n";
	  $started = 1;
	}
	print OUT "$tag\n";
      }
    }

    delete $reference{$newkey};

  } else {
    # new data
    print OUT "\n// New data\n";
    print OUT "\n$newkey\n";
    my $rest = $new{$newkey};
    $rest =~ s/\t/\n/g;
    print OUT "$rest";
  }
}


# go through %reference to find the ones that are left - these are just deleted
foreach my $refkey (keys %reference) {
  if (exists $reference{$refkey}) {
    print OUT "\n// Deleted data\n";
    print OUT "\n-D $refkey\n";
  }
}
close(OUT);

# tidy up
unlink $outfile1;
unlink $outfile2;
unlink $tmp;


# Add perl documentation in POD format
# This should expand on your brief description above and
# add details of any options that can be used with the program.
# Such documentation can be viewed using the perldoc command.


__END__

=pod

=head2 NAME - acediff.pl

=head1 USAGE

=over 4

=item acediff.pl  -reference file1.ace -new file2.ace -out changed_file.ace

=back

This script is a rewrite of acediff in perl. It takes a reference ace fiekl and a new ace file and writes out an ace file that will patch the old database to reflect the new data.

script_template.pl MANDATORY arguments:

=over 4

=item -reference file, the old ace file

=back

=over 4

=item -new file, the new ace file

=back

=over 4

=item -output file, the difference ace file

=back

script_template.pl  OPTIONAL arguments:

=over 4

=item -h, Help

=back

=over 4

=item -debug, Debug mode, set this to the username who should receive the emailed log messages. The default is that everyone in the group receives them.

=back

=over 4

=item -test, Test mode, run the script, but don't change anything.

=back

=over 4

=item -verbose, output lots of chatty test messages

=back


=head1 REQUIREMENTS

=over 4

=item None at present.

=back

=head1 AUTHOR

=over 4

=item Gary Williams (gw3@sanger.ac.uk)

=back

=cut

## dump_cds.pl
#!/bin/env perl

use Ace;

my $db = Ace->connect(-path => shift()) || die(Ace->Error);

my $cds = $db->fetch(CDS => shift());
exit() unless $cds;
my $seq = $cds->Sequence;

my ($cds_child) = grep {/"$cds"/} split("\n",$seq->asAce);
$cds_child =~ s/(""\t)+(CDS_child)*/CDS_child\t/;

print "-D CDS : $cds\n\n";
print "Sequence : $seq\n$cds_child\n\n";
my @lastline;
my @clines = split("\n",$cds->asAce);
foreach my $line(@clines){
   my @l = split("\t",$line);
   for (my $i=0; $i < scalar @l; $i++){
       $lastline[$i] = $l[$i] unless $l[$i]=~/""/;
   }

   if ($line=~/""/){
       my @elements = split("\t",$line);
       for (my $i=0; $i < scalar @elements; $i++){
          if ($elements[$i]=~/""/){
               print $lastline[$i],"\t";
          }else{
               print $elements[$i],"\t";
          }
       }
       print "\n"
   } else {print "$line\n"}
}
print "\n"

## dump_gff.pl
#!/usr/bin/env perl
# take a reference database and a handful of curation ones and udpdate the reference_db

use strict;
use Getopt::Long;
use Digest::MD5 qw(md5_hex);
use Ace;
use Ace::Sequence;

my ($sourceDB,$targetDB);
my @curationDBs;
my @tmpfiles;

my $tmp = $ENV{TMPDIR} || '/tmp/';

GetOptions(
    'sourceDB=s'   => \$targetDB,
)||die($!);

die("need a target\n") unless ($targetDB);

# setup
# * log
# * wormbase
my $tace = '/scratch/bin.LINUX_64/tace';
# my $sace = '/homes/mh6/project/software/packages/acedb/4.9.54/giface';
my $sace = '/homes/mh6/project/software/packages/acedb/4.9.60/giface';
# my $sace = '/scratch/bin.LINUX_64/giface';
# my $sace = '/scratch/ed_test/giface';

# 5.1) proteins / genomes
my @seqs = dump_get_Sequences($targetDB);
# 5.2) GFF
dump_gff($targetDB,@seqs);

#################################################### Functions ###########################


# dumps the curated featuers on sequences as GFF2
# arguments:
#   (database, list of sequences to dump)
sub dump_gff{
        my ($database,@seqs)=@_;

        foreach my $s(@seqs){
           my $cmd = "gif seqget $s; seqfeatures -version 2 -file $database/$s.gff2";
           open (WRITEDB,"echo '$cmd' | $sace $database |")||die "$!\n";
           while (my $line = <WRITEDB>) {
	       print "ERROR detected while GFF dumping $s:\n\t$line\ngiface: $cmd\n" if $line=~/ERROR/;
	   }
	   close WRITEDB;
           system("cat $database/$s.gff2 >> $database/genome.gff2") && die($!);
           unlink "$database/$s.gff2"||die($!);
        }
}

# dumps genome, proteins and returns a list of genomic_canonical sequences
# arguments:
#  (target database path)
sub dump_get_Sequences {
	my $dbpath = shift;

        # sequence bit
        my @sequenceNames;
        my $acedb = Ace->connect(-path => $dbpath) ||die(Ace->Error);
        my @sequences = $acedb->fetch(-query => 'Find Sequence; Genomic_canonical');
        #open(OUTF, ">$dbpath/genome.fas") ||die($!);
        foreach my $s(@sequences){
        #    print OUTF $s->asDNA;
            push @sequenceNames,"$s";
        }
        #close OUTF;

        # protein bit
        open(OUTF,">$dbpath/proteins.fas")||die($!);
        my $cdses=$acedb->fetch_many(-query => 'Find CDS; Method=curated') ||die(Ace->Error);
        while (my $cds = $cdses->next){
            print OUTF $cds->asPeptide;
        }
        close OUTF;

        return @sequenceNames;
}

## fix_gene_spans.pl
#!/usr/bin/env perl
# fix the genespan based on it's connected CDSes
#
use Ace;
use Ace::Sequence;
use strict;

my $db = Ace->connect(-path => shift()) ||die(Ace->Error);
my $genes = $db->fetch_many(-query => 'Find Gene Corresponding_CDS');
while (my $gene = $genes->next){
    my ($start,$stop,$seq);
    next unless $gene->Species eq 'Brugia malayi';
    foreach my $cds ($gene->Corresponding_CDS) {
        my $c = Ace::Sequence->new($cds);
        die("cannot find parent of $cds\n") unless $c;
        $c->refseq($c->parent);
        $start||=$c->start;
        $stop||=$c->end;
        if ($c->start > $c->end){ #reverse
           $start = $c->start if $c->start > $start;
           $stop = $c->end if $c->end < $stop;
        }else{
           $start = $c->start if $c->start < $start;
           $stop = $c->end if $c->end > $stop;
        }
        $seq = $c->parent();
    }
    print<<HERE;
Sequence : $seq
Gene_child $gene $start $stop

HERE
}

## fix_genes.tace
query find CDS *;Species="Brugia malayi" AND method="curated"
Follow Gene
edit Laboratory ELG
edit Species "Brugia malayi"
edit Method Gene
edit Status Live
query !Version
edit Version 1
edit History Version_change 1 now WBPerson4055 Event Imported merge_split
save
quit


## merge_split_brugia.pl
#!/usr/bin/env perl
# take a reference database and a handful of curation ones and udpdate the reference_db

use strict;
use Getopt::Long;
use Digest::MD5 qw(md5_hex);
use Ace;
use Ace::Sequence;

my ($sourceDB,$targetDB);
my @curationDBs;
my @tmpfiles;

my $tmp = $ENV{TMPDIR} || '/tmp';

GetOptions(
    'curationDB=s' => \@curationDBs,
    'sourceDB=s'   => \$sourceDB,
    'targetDB=s'   => \$targetDB,
)||die($!);

die("need a source and target\n") unless ($sourceDB && $targetDB);

# setup
# * log
# * wormbase
my $tace = '/scratch/bin.LINUX_64/tace';
# my $sace = '/scratch/bin.LINUX_64/giface';
my $sace = '/scratch/ed_test/giface';

# grab the reference files
my $reference = dumpDB($sourceDB);

# foreach do
# 1.) dump CDS / Transcript / Sequence

my %curationFiles;
foreach my $d(@curationDBs){
   $curationFiles{$d} = dumpDB($d);
}

# 2.) acediff them
acediff($reference,\%curationFiles);

# 3.) which CDS / Transcript was changed?
die("has hash collisions\n") if checkCollisions(\%curationFiles);

# 4.) load them
# 4.1) copy the reference database
system("mkdir -p $targetDB") && die($!);
system("cp -r $sourceDB/* $targetDB/") && die($!);

# 4.2) reformat to speed it up
# 4.3) load(file,user,db)
foreach my $db(values %curationFiles){
    foreach my $file(values %$db){
        loadace($file,$targetDB);
    }
}

# fix gene tags
fix_gene_tags($targetDB);

# add new gene ids
add_gene_ids($targetDB);

# fix gene spans
fix_gene_spans($targetDB);

# 5.) dump them
# 5.1) proteins / genomes
my @seqs = dump_get_Sequences($targetDB);
# 5.2) GFF
dump_gff($targetDB,@seqs);

#################################################### Functions ###########################

# creates new tag gene_ids
# arguments:
#   (acedb database)
sub add_gene_ids{
	my ($db) = @_;
        my @letters = qw(a b c d e f g h i j k l m n o p q r s t u v w x y z);

        my $acedb = Ace->connect(-path => $db) ||die(Ace->Error);
        $acedb->auto_save(1);

        my $counter= 0;
        my ($ace,$ace2);

        my $it = $acedb->fetch_many(Gene => 'TAG*');
        while (my $g = $it->next){
            my $name = "$g";
            my ($num) = $name=~/TAG(\d+)/;
            $counter = $num if $num > $counter;
        }

        $it = $acedb->fetch_many(-query => 'Find Gene;Corresponding_CDS AND Species = "Brugia malayi"');
        while (my $g = $it->next){
            next if "$g"=~/^TAG\d+/;
            $counter++;
            print "renaming Gene : $g to TAG$counter\n";
            $ace.="-R Gene : $g TAG$counter\n";
            $ace2.="Gene : TAG$counter\nSequence_name Bmal$counter\nPublic_name Bm$counter\n\n";

            my @cdses = $g->Corresponding_CDS;
            if (scalar(@cdses) == 1){
               my $c = $cdses[0];
               $ace.="-R CDS : $c Bm$counter\n";
               print "renaming CDS : $c to Bm$counter\n"
            }else{
               for (my $n=0;$n<scalar(@cdses);$n++){
                 my $c = $cdses[$n];
                 $ace.="-R CDS : $c Bm$counter".$letters[$n]."\n";
                 print "renaming CDS : $c to Bm$counter".$letters[$n]."\n"
               }
            }
	}

        # for non-coding transcripts
        $it = $acedb->fetch_many(-query => 'Find Gene;Corresponding_Transcript AND Species = "Brugia malayi"');
        while (my $g = $it->next){
            next if "$g"=~/^TAG\d+/;
            $counter++;
            print "renaming Gene : $g to TAG$counter\n";
            $ace.="-R Gene : $g TAG$counter\n";
            $ace2.="Gene : TAG$counter\nSequence_name Bmal$counter\nPublic_name Bm$counter\n\n";
            my @transcripts = $g->Corresponding_Transcript;
            if (scalar(@transcripts) == 1){
               my $c = $transcripts[0];
               $ace.="-R transcript : $c Bm$counter\n";
               print "renaming transcript : $c to Bm$counter\n"
            }else{
               for (my $n=0;$n<scalar(@transcripts);$n++){
                 my $c = $transcripts[$n];
                 $ace.="-R Transcript : $c Bm$counter".$letters[$n]."\n";
                 print "renaming Transcript : $c to Bm$counter".$letters[$n]."\n"
               }
            }

        }
        $acedb->close;

        # that bit below is because the Ace->parse seems to not work

        my $outf1 =  "$tmp/".md5_hex($db) .':new_gene_rename';
        push @tmpfiles,$outf1;
        open OUTF,">$outf1";
        print OUTF "$ace\n";
        close OUTF;

        my $outf2 = "$tmp/".md5_hex($db) .':new_gene_tags';
        push @tmpfiles,$outf2;
        open OUTF,">$outf2";
        print OUTF "$ace2\n";
        close OUTF;

        loadace($outf1,$db);
        loadace($outf2,$db);

}

# populates a minimum set of tags (tace commands are in a flatfile)
# arguments:
#   (acedb database)
sub fix_gene_tags {
    my ($db)=@_;
    print `tace $db < fix_genes.tace`
}

# fixes the gene spans based on connected CDSes/transcripts
# arguments:
#  (acedb database)
sub fix_gene_spans {
  my $db = shift;

  my $outfile = "$tmp/".md5_hex($db) .':gene_spans';
  push @tmpfiles,$outfile;
  open (OUTF,">$outfile")||die($!);

  my $acedb = Ace->connect(-path => $db) ||die(Ace->Error);
  my $genes = $acedb->fetch_many(-query => 'Find Gene');
  while (my $gene = $genes->next){
    my ($start,$stop,$seq);
    next unless ($gene->Corresponding_CDS ||$gene->Corresponding_transcript);
    next unless $gene->Species eq 'Brugia malayi';
    foreach my $cds ($gene->Corresponding_CDS,$gene->Corresponding_Transcript) {
        my $c = Ace::Sequence->new($cds) ||die("something fishy with $cds\n");
        $c->refseq($c->parent);
        $start||=$c->start;
        $stop||=$c->end;
        if ($c->start > $c->end){ #reverse
           $start = $c->start if $c->start > $start;
           $stop = $c->end if $c->end < $stop;
        }else{
           $start = $c->start if $c->start < $start;
           $stop = $c->end if $c->end > $stop;
        }
        $seq = $c->parent();
    }
    print OUTF <<HERE;
Sequence : $seq
Gene_child $gene $start $stop

HERE
  }
  close OUTF;

  loadace($outfile,$db);
}


# dumps the curated featuers on sequences as GFF2
# arguments:
#   (database, list of sequences to dump)
sub dump_gff{
        my ($database,@seqs)=@_;

        foreach my $s(@seqs){
           my $cmd = "gif seqget $s; seqfeatures -version 2 -file $database/$s.gff2";
           open (WRITEDB,"echo '$cmd' | $sace $database |")||die "$!\n";
           while (my $line = <WRITEDB>) {
	       print "ERROR detected while GFF dumping $s:\n\t$line\ngiface: $cmd\n" if $line=~/ERROR/;
	   }
	   close WRITEDB;
           system("cat $database/$s.gff2 >> $database/genome.gff2") && die($!);
           unlink "$database/$s.gff2"||die($!);
        }
}

# dumps genome, proteins and returns a list of genomic_canonical sequences
# arguments:
#  (target database path)
sub dump_get_Sequences {
	my $dbpath = shift;

        # sequence bit
        my @sequenceNames;
        my $acedb = Ace->connect(-path => $dbpath) ||die(Ace->Error);
        my @sequences = $acedb->fetch(-query => 'Find Sequence; Genomic_canonical');
        open(OUTF, ">$dbpath/genome.fas") ||die($!);
        foreach my $s(@sequences){
            print OUTF $s->asDNA;
            push @sequenceNames,"$s";
        }
        close OUTF;

        # protein bit
        open(OUTF,">$dbpath/proteins.fas")||die($!);
        my $cdses=$acedb->fetch_many(-query => 'Find CDS; Method=curated') ||die(Ace->Error);
        while (my $cds = $cdses->next){
            print OUTF $cds->asPeptide;
        }
        close OUTF;

        return @sequenceNames;
}


# create the acediff files
# arguments:
#  (reference database, curation database)
sub acediff{
    my($r,$c)=@_;
    foreach my $classes (values %$c){
        while(my($class,$file) = each %$classes){
            system("perl /scratch/tmp/acedb/acediff.pl -reference $$r{$class} -new $file -output $file.tmp") && die($!);
            system("mv $file.tmp $file") && die($!);
        }
    }
    return 1;
}

# clean up behind the script
END{

  foreach my $file (@tmpfiles){
            # unlink $file;
      }
}

# check for hash collisions in the files
# arguments:
#     hasref of hashrefs of curation files
# returns:
#     number of hash collisions
sub checkCollisions{
    my %databases = %{shift()};
    my %tmpHash;
    my $collisions;
    while (my($directory,$classRef) = each %databases){
      while (my($class,$file) = each %{$classRef}){
        next if $class eq 'Sequence';
        print "checking  $class : $file\n" if $ENV{DEBUG};
        open INF, $file;
        while(my $line = <INF>){
          if($line =~ /$class : \"(\S+)\"/){
            print "$class $1\n" if $ENV{DEBUG};
            if ($tmpHash{$class}{$1} && $directory ne $tmpHash{$class}{$1}){
               print "ERROR: $directory $class : $1 conflicts with $tmpHash{$class}{$1}\n";
               $collisions++;
            }else{
               $tmpHash{$class}{$1} = $directory;
            }
          }
        }
        close INF;
      }
    }
    return $collisions;
}

# wrapper to dump a database CDS / Transcript / Sequence
# arguments:
#     (database path)
# returns:
#      hashref(Transcript => $file, CDS => $file, Sequence => $file)
sub dumpDB {
   my ($db)=@_;
   my %h;

   foreach my $c (qw(CDS Transcript Sequence Pseudogene)){
       my $file = "$tmp/". md5_hex($db) .":$c";
       $h{$c}=dumpace($c,$file,$db);
   }
   return \%h;
}

# functions lifted from Paul
###########################################################

# data dumping routine
# arguments:
#     (class to dump, file the dump ends up in, database)
sub dumpace {
  my ($class,$filepath,$ddb) = @_;

  my $command = "nosave\nquery find $class\nshow -a -f $filepath\nquit\n";

  print " dumping $class Filename: $filepath -> $ddb\n";

  open (TACE, "echo '$command' | $tace $ddb |" ) || die "Failed to open database connection to $ddb\n" ;
  while (<TACE>) {
    # keeps pipe open until command quit executes.
  }
  close TACE;

  # reformat it
  # system("perl reformat_acediff -file $filepath.tmp -fileout $filepath") && die($!);
  # unlink("$filepath.tmp");
  push @tmpfiles,$filepath; # cleanup hook

  return $filepath;
}

# Loading function
# arguments:
#     (file to load, database user, database)
sub loadace {
  my ($filepath,$ldb) = @_;

  my $command = "pparse $filepath\nsave\nquit\n";

  print "parsing Filename: $filepath -> ${ldb}\n";

  open (TACE, "| $tace $ldb -tsuser $ENV{USER}") || die "Couldn't open $ldb\n";
  print TACE $command;
  close TACE;
  print "SUCCESS! Loaded $filepath into ${ldb}\n";
  return 1;
}

=pod

=head2 NAME - merge_split_camaces

=head1 USAGE:

=over 4

=item merge_split_brugia.pl [-options]

=back

merge_split_brugia.pl is a wrapper to automate the process of merging brugia curation databases

merge_split_camaces mandatory arguments:

=over 4

=item reference

path to a reference database

=item curation
one or more curation databases to merge into the reference database

=item target
target path to not trample all over the reference database


=back


=head1 EXAMPLES:

=over 4

=item merge_split_brugia.pl -reference X -curation Y -curation Z -target W

=back

Dumps the relevant classes from camace_orig and the split databases, runs an
acediff to calculate the changes. This acediff file is then reformated to take
account of the acediff bug.

=head1 AUTHOR - Michael Paulini

Email michael.paulini@wormbase.org

=cut

## operon_miner.pl
#!/bin/env perl

use Getopt::Long;

my ($infile,$gff,$ace);
GetOptions(
    'infile=s' => \$infile,
    'gff'   => \$gff,
    'ace' => \$ace,
)||die($!);

open INF,$infile;

# $genes{seq}=@{%{start => 1, stop => 1, strand => '+', name => 'abc'},}
my %genes;
my %genes2operon;
my %operons;

# read genes
while(<INF>){
 if (/gene\tgene/){
   chomp;
   my @F=split;
   push(@{$genes{$F[0]}},{start => $F[3],stop => $F[4],strand => $F[6],name => $F[9]});
 }
}

seek(INF,0,0);

my $count = 1;
while(<INF>){
  if (/cufflinks_\S+\tCDS/){
    chomp;
    my @F=split;
    my $operon = "BMOP$count";
    my @genesHit = grep {$_->{start} <= $F[4] && $_->{stop} >= $F[3] && $_->{strand} eq $F[6] } @{$genes{$F[0]}};
    next unless scalar @genesHit > 1;
    my @genesHit = sort {$a->{start} <=> $b->{start}} @genesHit;
    map {$operon = $genes2operon{$_->{name}} if $genes2operon{$_->{name}}} @genesHit;
    map {$genes2operon{$_->{name}}=$operon} @genesHit;
    if ($operons{$operon}){
        ${$operons{$operon}}->{start} = $genesHit[0]->{start} if $genesHit[0]->{start} < ${$operons{$operon}}->{start};
        ${$operons{$operon}}->{stop} = $genesHit[-1]->{stop} if $genesHit[-1]->{stop} > ${$operons{$operon}}->{stop};
    }
    else {
        ${$operons{$operon}}->{start} = $genesHit[0]->{start};
        ${$operons{$operon}}->{stop} = $genesHit[-1]->{stop};
        ${$operons{$operon}}->{strand} = $genesHit[0]->{strand};
        ${$operons{$operon}}->{seq} = $F[0];
        $count++;
    }
    ${$operons{$operon}}->{members} = {} unless ${$operons{$operon}}->{members};
    map { ${$operons{$operon}}->{members}->{$_->{name}} = 1} @genesHit;
  }
}

while (my($k,$v)=each %operons){
   if ($gff){
    print "$$v->{seq}\toperon\toperon\t$$v->{start}\t$$v->{stop}\t.$vv->{strand}\t\.\tOperon \"$k\"\n";
   }
   elsif($ace){
    my ($start,$stop) = $$v->{strand} eq '+'?($$v->{start},$$v->{stop}):($$v->{stop},$$v->{start});
    print "Sequence : $$v->{seq}\nOperon $k $start $stop\n\n";
    print "Operon : $k\nCanonical_parent $$v->{seq}\nSpecies \"Brugia malayi\"\n";
    map {print "Contains_Gene $_\n"} keys %{$$v->{members}};
    print "Description \"Predicted operon from cufflinks transcripts\"\nMethod operon\n\n";
   }
   else{
    print "$k\t$$v->{seq}\t$$v->{start}\t$$v->{stop}\t$$v->{strand}\t";
    print join "\t",keys %{$$v->{members}};
    print "\n"
   }

}

## postflunk_description.pl
#!/usr/bin/env perl


use Ace;
my $db= Ace->connect(-path => shift());
while(<>){
 chomp;
 s/\t\t/\t/g;
 s/\tsimilarity\t/\tnucleotide_match\t/;

 print;
 if(/curated\sCDS/){
   my @F= split;
   $F[9]=~s/"//g;
   $c=$db->fetch(CDS =>$F[9]);
   unless($c){
     die "cannot find CDS:$F[9]\n";
   }

   # Evidence / Curation status
   if($c->Evidence){
       print" ; Note \"Curated\""
   }else{
       print " ; Note \"Predicted\""
   }

   my $bId;
   my $gene = $c->Gene;
   die "buggy $c -> $gene connection\n" unless $gene;

   # add the cgc name / sequence name
   print " ; Note \"gene:".$gene->Public_name."\" ; Alias \"${\$gene->Public_name}\"";

   my @remarks = $gene->at('Remark');
   foreach my $r(@remarks){
       print " ; Note \"remark:$r\""
   }


   # Brief Identification / UniProt
   if ($gene->Ortholog){
    my @orthologGenes= $gene->Ortholog;
    foreach my $g(@orthologGenes){
           next unless $g->Corresponding_CDS;
           foreach my $cd($g->Corresponding_CDS){
		   if($cd->Brief_identification){
                     $bId = $cd->Brief_identification
	   	   }
           }
    }
   }
   print " ; Note \"description:$bId\"" if $bId;

   }
 print"\n";
}
	#!/software/bin/perl -w
	#
	# acediff.pl : an improved acediff
	#
	# gw3
	#
	# Usage : acediff.pl [-options]
	#
	# Last edited by: $Author: gw3 $
	# Last edited on: $Date: 2011-04-26 15:49:01 $

	use strict;
	use Getopt::Long;

	my ($test, $debug, $store, $file1, $file2, $output);

	GetOptions (
	"test" => \$test,
	"debug:s" => \$debug,
	"reference:s" => \$file1,
	"new:s" => \$file2,
	"output:s" => \$output,
	)
	or die("invalid commandline option\n");
	;

	# use the time and the process ID to make a unique file extension
	my $time = time();
	my $pid = "$$";
	my $tmpdir = $ENV{TMPDIR} \|\| '/tmp';
	my $outfile1 = "$tmpdir/acediff1.$pid.$time.new";
	my $outfile2 = "$tmpdir/acediff2.$pid.$time.new";
	my $tmp = "$tmpdir/acediff.$pid.$time.tmp";


	# Change input separator to paragraph mode, but store old mode in $oldlinesep
	my $oldlinesep = $/;
	$/ = "";

	open (FILE, "<$file1") or die("cant open $file1 : $!\n");
	open (PRE, ">$tmp") or die("cant open $tmp : $!\n");
	while (my $record = <FILE>) {
	while ($record =~ /^\s\/\//) {$record =~ s/^\s\/\/.*?\n//} # strip out any comments at the start of the record
	$record =~ s/\t/ /g;
	$record =~ s/\n/\t/g;
	print PRE $record, "\n";
	}
	close (PRE);
	close (FILE);

	# sort the file - sometimes the sorted file doesn't appear - this is very odd - try a few times to make it
	my $tries = 5;
	while ($tries-- && ! -e "$outfile1") {
	system("sort -S 4G $tmp -o $outfile1");
	system('sleep 5'); # wait a few seconds for NFS to realise that there really is a file there
	}

	open (FILE, "<$file2") or die("cant open $file2 : $!\n");
	open (PRE, ">$tmp") or die("cant open $tmp : $!\n");
	while (my $record = <FILE>) {
	while ($record =~ /^\s\/\//) {$record =~ s/^\s\/\/.*?\n//} # strip out any comments at the start of the record
	$record =~ s/\t/ /g;
	$record =~ s/\n/\t/g;
	print PRE $record, "\n";
	}
	close (PRE);
	close (FILE);

	# sort the file - sometimes the sorted file doesn't appear - this is very odd - try a few times to make it
	$tries = 5;
	while ($tries-- && ! -e "$outfile2") {
	system("sort -S 4G $tmp -o $outfile2");
	system('sleep 5'); # wait a few seconds for NFS to realise that there really is a file there
	}

	# reset input line separator
	$/= $oldlinesep;

	# get diff output - use 'system' rather than $wormbase->run_command
	# because diff returns '1' if there is a difference, 2 means error
	my $status = system("/usr/bin/diff $outfile1 $outfile2 > $tmp");
	if ( ( $status >> 8 ) == 2 ) {
	die("diff exited with an error\n");
	}

	# now read in $tmp
	# when there is an object that is new (no previous instance): output object
	# when there is an object that is deleted (no subsequent instance): output -D object
	# when there is an object that is changed: read both instances in and out deleted tags as -D tags and add new tags

	# '>' is a new object
	# '<' is an old object

	my %new;
	my %reference;

	open (ACE, "<$tmp") or die("cant read from $tmp : $!\n");
	while (my $line = <ACE>) {

	# put the data into the hashes
	my ($origin, $first, $rest) = ($line =~ /^([\>\<])\s+(.+?)\t(.+)/);
	if (!defined $origin) {next}
	if ($origin eq '>') {
	# put tags in %new, keyed by class and ID line
	$new{$first} = $rest;
	} elsif ($origin eq '<') {
	# put tags in %reference, keyed by class and ID line
	$reference{$first} = $rest;

	} else { # number or --- lines
	# do nothing
	}

	}
	close ACE;



	# now go through %new and check in %reference to see if this is new or changed
	# delete the %reference ones we find that are in %new

	open (OUT, ">$output") \|\| die("cantopen $output : $!\n");
	my $started;
	foreach my $newkey (keys %new) {
	if (exists $reference{$newkey}) {
	# changed data - store the tags in hases for comparison
	my %n;
	my %r;
	my @new = split /\t/, $new{$newkey};
	my @ref = split /\t/, $reference{$newkey};
	foreach my $tag (@new) {$n{$tag}=1}
	foreach my $tag (@ref) {$r{$tag}=1}

	# removed tags
	$started = 0;
	foreach my $tag (keys %r) {
	if (! exists $n{$tag}) {
	if (!$started) {
	print OUT "\n// Changed deleted data\n";
	print OUT "\n$newkey\n";
	$started = 1;
	}
	print OUT "-D $tag\n";
	}
	}

	# added tags
	$started = 0;
	foreach my $tag (keys %n) {
	if (! exists $r{$tag}) {
	if (!$started) {
	print OUT "\n// Changed added data\n";
	print OUT "\n$newkey\n";
	$started = 1;
	}
	print OUT "$tag\n";
	}
	}

	delete $reference{$newkey};

	} else {
	# new data
	print OUT "\n// New data\n";
	print OUT "\n$newkey\n";
	my $rest = $new{$newkey};
	$rest =~ s/\t/\n/g;
	print OUT "$rest";
	}
	}


	# go through %reference to find the ones that are left - these are just deleted
	foreach my $refkey (keys %reference) {
	if (exists $reference{$refkey}) {
	print OUT "\n// Deleted data\n";
	print OUT "\n-D $refkey\n";
	}
	}
	close(OUT);

	# tidy up
	unlink $outfile1;
	unlink $outfile2;
	unlink $tmp;




	# Add perl documentation in POD format
	# This should expand on your brief description above and
	# add details of any options that can be used with the program.
	# Such documentation can be viewed using the perldoc command.


	__END__

	=pod

	=head2 NAME - acediff.pl

	=head1 USAGE

	=over 4

	=item acediff.pl -reference file1.ace -new file2.ace -out changed_file.ace

	=back

	This script is a rewrite of acediff in perl. It takes a reference ace fiekl and a new ace file and writes out an ace file that will patch the old database to reflect the new data.

	script_template.pl MANDATORY arguments:

	=over 4

	=item -reference file, the old ace file

	=back

	=over 4

	=item -new file, the new ace file

	=back

	=over 4

	=item -output file, the difference ace file

	=back

	script_template.pl OPTIONAL arguments:

	=over 4

	=item -h, Help

	=back

	=over 4

	=item -debug, Debug mode, set this to the username who should receive the emailed log messages. The default is that everyone in the group receives them.

	=back

	=over 4

	=item -test, Test mode, run the script, but don't change anything.

	=back

	=over 4

	=item -verbose, output lots of chatty test messages

	=back


	=head1 REQUIREMENTS

	=over 4

	=item None at present.

	=back

	=head1 AUTHOR

	=over 4

	=item Gary Williams (gw3@sanger.ac.uk)

	=back

	=cut
	#!/bin/env perl

	use Ace;

	my $db = Ace->connect(-path => shift()) \|\| die(Ace->Error);

	my $cds = $db->fetch(CDS => shift());
	exit() unless $cds;
	my $seq = $cds->Sequence;

	my ($cds_child) = grep {/"$cds"/} split("\n",$seq->asAce);
	$cds_child =~ s/(""\t)+(CDS_child)*/CDS_child\t/;

	print "-D CDS : $cds\n\n";
	print "Sequence : $seq\n$cds_child\n\n";
	my @lastline;
	my @clines = split("\n",$cds->asAce);
	foreach my $line(@clines){
	my @l = split("\t",$line);
	for (my $i=0; $i < scalar @l; $i++){
	$lastline[$i] = $l[$i] unless $l[$i]=~/""/;
	}

	if ($line=~/""/){
	my @elements = split("\t",$line);
	for (my $i=0; $i < scalar @elements; $i++){
	if ($elements[$i]=~/""/){
	print $lastline[$i],"\t";
	}else{
	print $elements[$i],"\t";
	}
	}
	print "\n"
	} else {print "$line\n"}
	}
	print "\n"
	#!/usr/bin/env perl
	# take a reference database and a handful of curation ones and udpdate the reference_db

	use strict;
	use Getopt::Long;
	use Digest::MD5 qw(md5_hex);
	use Ace;
	use Ace::Sequence;

	my ($sourceDB,$targetDB);
	my @curationDBs;
	my @tmpfiles;

	my $tmp = $ENV{TMPDIR} \|\| '/tmp/';

	GetOptions(
	'sourceDB=s' => \$targetDB,
	)\|\|die($!);

	die("need a target\n") unless ($targetDB);

	# setup
	# * log
	# * wormbase
	my $tace = '/scratch/bin.LINUX_64/tace';
	# my $sace = '/homes/mh6/project/software/packages/acedb/4.9.54/giface';
	my $sace = '/homes/mh6/project/software/packages/acedb/4.9.60/giface';
	# my $sace = '/scratch/bin.LINUX_64/giface';
	# my $sace = '/scratch/ed_test/giface';

	# 5.1) proteins / genomes
	my @seqs = dump_get_Sequences($targetDB);
	# 5.2) GFF
	dump_gff($targetDB,@seqs);

	#################################################### Functions ###########################


	# dumps the curated featuers on sequences as GFF2
	# arguments:
	# (database, list of sequences to dump)
	sub dump_gff{
	my ($database,@seqs)=@_;

	foreach my $s(@seqs){
	my $cmd = "gif seqget $s; seqfeatures -version 2 -file $database/$s.gff2";
	open (WRITEDB,"echo '$cmd' \| $sace $database \|")\|\|die "$!\n";
	while (my $line = <WRITEDB>) {
	print "ERROR detected while GFF dumping $s:\n\t$line\ngiface: $cmd\n" if $line=~/ERROR/;
	}
	close WRITEDB;
	system("cat $database/$s.gff2 >> $database/genome.gff2") && die($!);
	unlink "$database/$s.gff2"\|\|die($!);
	}
	}

	# dumps genome, proteins and returns a list of genomic_canonical sequences
	# arguments:
	# (target database path)
	sub dump_get_Sequences {
	my $dbpath = shift;

	# sequence bit
	my @sequenceNames;
	my $acedb = Ace->connect(-path => $dbpath) \|\|die(Ace->Error);
	my @sequences = $acedb->fetch(-query => 'Find Sequence; Genomic_canonical');
	#open(OUTF, ">$dbpath/genome.fas") \|\|die($!);
	foreach my $s(@sequences){
	# print OUTF $s->asDNA;
	push @sequenceNames,"$s";
	}
	#close OUTF;

	# protein bit
	open(OUTF,">$dbpath/proteins.fas")\|\|die($!);
	my $cdses=$acedb->fetch_many(-query => 'Find CDS; Method=curated') \|\|die(Ace->Error);
	while (my $cds = $cdses->next){
	print OUTF $cds->asPeptide;
	}
	close OUTF;

	return @sequenceNames;
	}
	#!/usr/bin/env perl
	# fix the genespan based on it's connected CDSes
	#
	use Ace;
	use Ace::Sequence;
	use strict;

	my $db = Ace->connect(-path => shift()) \|\|die(Ace->Error);
	my $genes = $db->fetch_many(-query => 'Find Gene Corresponding_CDS');
	while (my $gene = $genes->next){
	my ($start,$stop,$seq);
	next unless $gene->Species eq 'Brugia malayi';
	foreach my $cds ($gene->Corresponding_CDS) {
	my $c = Ace::Sequence->new($cds);
	die("cannot find parent of $cds\n") unless $c;
	$c->refseq($c->parent);
	$start\|\|=$c->start;
	$stop\|\|=$c->end;
	if ($c->start > $c->end){ #reverse
	$start = $c->start if $c->start > $start;
	$stop = $c->end if $c->end < $stop;
	}else{
	$start = $c->start if $c->start < $start;
	$stop = $c->end if $c->end > $stop;
	}
	$seq = $c->parent();
	}
	print<<HERE;
	Sequence : $seq
	Gene_child $gene $start $stop

	HERE
	}
	query find CDS *;Species="Brugia malayi" AND method="curated"
	Follow Gene
	edit Laboratory ELG
	edit Species "Brugia malayi"
	edit Method Gene
	edit Status Live
	query !Version
	edit Version 1
	edit History Version_change 1 now WBPerson4055 Event Imported merge_split
	save
	quit
	#!/bin/env perl

	use Getopt::Long;

	my ($infile,$gff,$ace);
	GetOptions(
	'infile=s' => \$infile,
	'gff' => \$gff,
	'ace' => \$ace,
	)\|\|die($!);

	open INF,$infile;

	# $genes{seq}=@{%{start => 1, stop => 1, strand => '+', name => 'abc'},}
	my %genes;
	my %genes2operon;
	my %operons;

	# read genes
	while(<INF>){
	if (/gene\tgene/){
	chomp;
	my @F=split;
	push(@{$genes{$F[0]}},{start => $F[3],stop => $F[4],strand => $F[6],name => $F[9]});
	}
	}

	seek(INF,0,0);

	my $count = 1;
	while(<INF>){
	if (/cufflinks_\S+\tCDS/){
	chomp;
	my @F=split;
	my $operon = "BMOP$count";
	my @genesHit = grep {$_->{start} <= $F[4] && $_->{stop} >= $F[3] && $_->{strand} eq $F[6] } @{$genes{$F[0]}};
	next unless scalar @genesHit > 1;
	my @genesHit = sort {$a->{start} <=> $b->{start}} @genesHit;
	map {$operon = $genes2operon{$_->{name}} if $genes2operon{$_->{name}}} @genesHit;
	map {$genes2operon{$_->{name}}=$operon} @genesHit;
	if ($operons{$operon}){
	${$operons{$operon}}->{start} = $genesHit[0]->{start} if $genesHit[0]->{start} < ${$operons{$operon}}->{start};
	${$operons{$operon}}->{stop} = $genesHit[-1]->{stop} if $genesHit[-1]->{stop} > ${$operons{$operon}}->{stop};
	}
	else {
	${$operons{$operon}}->{start} = $genesHit[0]->{start};
	${$operons{$operon}}->{stop} = $genesHit[-1]->{stop};
	${$operons{$operon}}->{strand} = $genesHit[0]->{strand};
	${$operons{$operon}}->{seq} = $F[0];
	$count++;
	}
	${$operons{$operon}}->{members} = {} unless ${$operons{$operon}}->{members};
	map { ${$operons{$operon}}->{members}->{$_->{name}} = 1} @genesHit;
	}
	}

	while (my($k,$v)=each %operons){
	if ($gff){
	print "$$v->{seq}\toperon\toperon\t$$v->{start}\t$$v->{stop}\t.$vv->{strand}\t\.\tOperon \"$k\"\n";
	}
	elsif($ace){
	my ($start,$stop) = $$v->{strand} eq '+'?($$v->{start},$$v->{stop}):($$v->{stop},$$v->{start});
	print "Sequence : $$v->{seq}\nOperon $k $start $stop\n\n";
	print "Operon : $k\nCanonical_parent $$v->{seq}\nSpecies \"Brugia malayi\"\n";
	map {print "Contains_Gene $_\n"} keys %{$$v->{members}};
	print "Description \"Predicted operon from cufflinks transcripts\"\nMethod operon\n\n";
	}
	else{
	print "$k\t$$v->{seq}\t$$v->{start}\t$$v->{stop}\t$$v->{strand}\t";
	print join "\t",keys %{$$v->{members}};
	print "\n"
	}

	}
	#!/usr/bin/env perl


	use Ace;
	my $db= Ace->connect(-path => shift());
	while(<>){
	chomp;
	s/\t\t/\t/g;
	s/\tsimilarity\t/\tnucleotide_match\t/;

	print;
	if(/curated\sCDS/){
	my @F= split;
	$F[9]=~s/"//g;
	$c=$db->fetch(CDS =>$F[9]);
	unless($c){
	die "cannot find CDS:$F[9]\n";
	}

	# Evidence / Curation status
	if($c->Evidence){
	print" ; Note \"Curated\""
	}else{
	print " ; Note \"Predicted\""
	}

	my $bId;
	my $gene = $c->Gene;
	die "buggy $c -> $gene connection\n" unless $gene;

	# add the cgc name / sequence name
	print " ; Note \"gene:".$gene->Public_name."\" ; Alias \"${\$gene->Public_name}\"";

	my @remarks = $gene->at('Remark');
	foreach my $r(@remarks){
	print " ; Note \"remark:$r\""
	}


	# Brief Identification / UniProt
	if ($gene->Ortholog){
	my @orthologGenes= $gene->Ortholog;
	foreach my $g(@orthologGenes){
	next unless $g->Corresponding_CDS;
	foreach my $cd($g->Corresponding_CDS){
	if($cd->Brief_identification){
	$bId = $cd->Brief_identification
	}
	}
	}
	}
	print " ; Note \"description:$bId\"" if $bId;

	}
	print"\n";
	}