epaule/add_tags.pl

## add_tags.pl
#!/usr/bin/env perl
#
# add missing tags to curation databases

use Ace;
use strict;

my $db= Ace->connect(-path => shift()) ||die (Ace->error);


# From Laboratory tag
my @cdses = $db->fetch(-query=>"find CDS; Gene; SMap");
foreach my $cds(@cdses){
    next if $cds->From_laboratory;
    print "CDS : $cds\n";
    print "From_laboratory ELG\n";
    print "\n";
}
undef @cdses;

my @transcripts = $db->fetch(-query=>"find Transcript; Gene; SMap");
foreach my $transcript (@transcripts){
    next if $transcript->From_laboratory;
    print "Transcript : $transcript\n";
    print "From_laboratory ELG\n";
    print "\n";
}
undef @transcripts;

# isoform tag
my @genes = $db->fetch(-query=>"find Gene; Corresponding_CDS; SMap");
foreach my $gene (@genes) {
   @cdses=$gene->Corresponding_CDS;
   if (scalar @cdses >1 ){
     foreach my $cds(@cdses) {
       next if $cds->Isoform;
       print "CDS : $cds\n";
       print "Isoform Inferred_automatically\n";
       print "\n"
     }
   }else{
      next unless $cdses[0]->Isoform;
      print "CDS : $cdses[0]\n";
      print "-D Isoform\n";
      print "\n";
   }
}

# clean dodgy cdses
print "//CDSes without SMaps\n";
@cdses = $db->fetch(-query => 'find CDS; ! SMap');
foreach my $cds(@cdses){
  next unless $cds->Species eq 'Brugia malayi';
  print "-D CDS $cds\n";
}

print "\n//CDSes with hand_built method\n";
@cdses = $db->fetch(-query => 'find CDS; Method=hand_built');
foreach my $cds(@cdses){
  print "-D CDS $cds\n";
}

print "\n//Transcripts with hand_built method\n";
@cdses = $db->fetch(-query => 'find Transcript; Method=hand_built');
foreach my $cds(@cdses){
  print "-D Transcript $cds\n";
}


## check_transcripts.pl
#!/bin/env perl
# that will trim away any low-scoring isoforms from uncurated genes

use Ace;
use Ace::Sequence;
use strict;

my $db = Ace->connect(-path => shift()) ||die (Ace->error);

my $genes= $db->fetch_many(-query => 'Find Gene SMap;COUNT Corresponding_CDS>1');
my $startGeneid=shift();

while (my $gene = $genes->next){
       my $flip=0;
       my @cdses = map {Ace::Sequence->new($_)} $gene->Corresponding_CDS;
       map {$_->refseq($_->parent)}@cdses;
       foreach my $cds(@cdses){
        my ($start,$stop)=sort {$a <=> $b} ($cds->start,$cds->stop);
        my @overlappingCDSes = grep{
             my($tmp_start,$tmp_stop)=sort{$a<=>$b}($_->start,$_->end);
             $tmp_start <= $stop && $tmp_stop >= $start
           } @cdses;
        if (scalar(@overlappingCDSes) == 1) {
             # print "can't find overlapping CDSes in gene $gene / ${\$cds->name}($cds)\n";
             $flip=$cds;
        }
       }
       if ($flip){
         renameCDS($flip);
         $startGeneid++;
       }
}

sub renameCDS {
   my ($f)=@_;
   my $from = $f->name;
   my $to = "Bm$startGeneid";
   my $gene = "TAG$startGeneid";
   my $seq = $f->parent->name;

print <<HERE;
-R CDS : $from $to

CDS : $to
Gene $gene

Gene : $gene
Version 1
Sequence $seq
Sequence_name $to
Public_name $to
Live
History Version_change 1 now WBPerson4055 Imported merge_split
Species "Brugia malayi"
Method Gene

HERE

}

## clean_brugia_genes.pl
#!/usr/bin/env perl
# go through all CDSes with Evidence, get their genes and delete connected CDSes without Evidence


use Ace;
use strict;

my $db= Ace->connect(-path => shift()) ||die (Ace->error);

my @cdses = $db->fetch(-query=>'find CDS Evidence ; follow gene ; follow Corresponding_CDS ; ! Evidence');
foreach my $cds(@cdses){
      print<<HERE;
CDS : $cds
Method history
Remark "removed automatically based on curation tags"

CDS : $cds
-D Gene

-R CDS $cds $cds:232

HERE


}

## clean_gene.pl
#!/usr/bin/env perl
# go through all CDSes with Evidence, get their genes and delete connected CDSes without Evidence


use Ace;
use strict;

my $db= Ace->connect(-path => shift()) ||die (Ace->error);
my $gene=shift();

my @cdses = $db->fetch(-query=>"find gene $gene ; follow Corresponding_CDS ; ! Evidence");
foreach my $cds(@cdses){
      print<<HERE;
CDS : $cds
Method history
Gene_history $gene
Remark "removed automatically based on curation tags"

CDS : $cds
-D Gene

-R CDS $cds $cds:bm7

HERE


}

## strip_isoforms.pl
#!/bin/env perl
# that will trim away any low-scoring isoforms from uncurated genes

use Ace;
use strict;

my $db = Ace->connect(-path => shift()) ||die (Ace->error);

my $genes= $db->fetch_many(-query => 'Find Gene SMap;Corresponding_CDS');

while (my $gene = $genes->next){
   my @a_;
   my @b_;
   my @cdses = sort {scalar(@a_=$b->CDS_predicted_by) <=> scalar(@b_=$a->CDS_predicted_by) } ($gene->Corresponding_CDS);
   next if $cdses[0]->Evidence;
   my @kill = grep {scalar(@a_= $_->CDS_predicted_by) < scalar(@b_=$cdses[0]->CDS_predicted_by) && scalar(@a_= $_->CDS_predicted_by)<7} @cdses;
   map {historify_cds($_)} @kill;
}

sub historify_cds {
  my ($c) = @_;
  my $g = $c->Gene;
  print<<HERE;
CDS : $c
Method history
Gene_history $g
Remark "removed automatically based on supporting predicitions count"

CDS : $c
-D Gene

-R CDS $c $c:bm11

HERE

}
	#!/usr/bin/env perl
	#
	# add missing tags to curation databases

	use Ace;
	use strict;

	my $db= Ace->connect(-path => shift()) \|\|die (Ace->error);


	# From Laboratory tag
	my @cdses = $db->fetch(-query=>"find CDS; Gene; SMap");
	foreach my $cds(@cdses){
	next if $cds->From_laboratory;
	print "CDS : $cds\n";
	print "From_laboratory ELG\n";
	print "\n";
	}
	undef @cdses;

	my @transcripts = $db->fetch(-query=>"find Transcript; Gene; SMap");
	foreach my $transcript (@transcripts){
	next if $transcript->From_laboratory;
	print "Transcript : $transcript\n";
	print "From_laboratory ELG\n";
	print "\n";
	}
	undef @transcripts;

	# isoform tag
	my @genes = $db->fetch(-query=>"find Gene; Corresponding_CDS; SMap");
	foreach my $gene (@genes) {
	@cdses=$gene->Corresponding_CDS;
	if (scalar @cdses >1 ){
	foreach my $cds(@cdses) {
	next if $cds->Isoform;
	print "CDS : $cds\n";
	print "Isoform Inferred_automatically\n";
	print "\n"
	}
	}else{
	next unless $cdses[0]->Isoform;
	print "CDS : $cdses[0]\n";
	print "-D Isoform\n";
	print "\n";
	}
	}

	# clean dodgy cdses
	print "//CDSes without SMaps\n";
	@cdses = $db->fetch(-query => 'find CDS; ! SMap');
	foreach my $cds(@cdses){
	next unless $cds->Species eq 'Brugia malayi';
	print "-D CDS $cds\n";
	}

	print "\n//CDSes with hand_built method\n";
	@cdses = $db->fetch(-query => 'find CDS; Method=hand_built');
	foreach my $cds(@cdses){
	print "-D CDS $cds\n";
	}

	print "\n//Transcripts with hand_built method\n";
	@cdses = $db->fetch(-query => 'find Transcript; Method=hand_built');
	foreach my $cds(@cdses){
	print "-D Transcript $cds\n";
	}
	#!/bin/env perl
	# that will trim away any low-scoring isoforms from uncurated genes

	use Ace;
	use Ace::Sequence;
	use strict;

	my $db = Ace->connect(-path => shift()) \|\|die (Ace->error);

	my $genes= $db->fetch_many(-query => 'Find Gene SMap;COUNT Corresponding_CDS>1');
	my $startGeneid=shift();

	while (my $gene = $genes->next){
	my $flip=0;
	my @cdses = map {Ace::Sequence->new($_)} $gene->Corresponding_CDS;
	map {$_->refseq($_->parent)}@cdses;
	foreach my $cds(@cdses){
	my ($start,$stop)=sort {$a <=> $b} ($cds->start,$cds->stop);
	my @overlappingCDSes = grep{
	my($tmp_start,$tmp_stop)=sort{$a<=>$b}($_->start,$_->end);
	$tmp_start <= $stop && $tmp_stop >= $start
	} @cdses;
	if (scalar(@overlappingCDSes) == 1) {
	# print "can't find overlapping CDSes in gene $gene / ${\$cds->name}($cds)\n";
	$flip=$cds;
	}
	}
	if ($flip){
	renameCDS($flip);
	$startGeneid++;
	}
	}

	sub renameCDS {
	my ($f)=@_;
	my $from = $f->name;
	my $to = "Bm$startGeneid";
	my $gene = "TAG$startGeneid";
	my $seq = $f->parent->name;

	print <<HERE;
	-R CDS : $from $to

	CDS : $to
	Gene $gene

	Gene : $gene
	Version 1
	Sequence $seq
	Sequence_name $to
	Public_name $to
	Live
	History Version_change 1 now WBPerson4055 Imported merge_split
	Species "Brugia malayi"
	Method Gene

	HERE

	}
	#!/usr/bin/env perl
	# go through all CDSes with Evidence, get their genes and delete connected CDSes without Evidence


	use Ace;
	use strict;

	my $db= Ace->connect(-path => shift()) \|\|die (Ace->error);

	my @cdses = $db->fetch(-query=>'find CDS Evidence ; follow gene ; follow Corresponding_CDS ; ! Evidence');
	foreach my $cds(@cdses){
	print<<HERE;
	CDS : $cds
	Method history
	Remark "removed automatically based on curation tags"

	CDS : $cds
	-D Gene

	-R CDS $cds $cds:232

	HERE


	}