Skip to content

Instantly share code, notes, and snippets.

@epaule
Created June 25, 2012 13:24
Show Gist options
  • Save epaule/2988549 to your computer and use it in GitHub Desktop.
Save epaule/2988549 to your computer and use it in GitHub Desktop.
wrapper to automatically merge additional acedb databases into a reference one
#!/software/bin/perl -w
#
# acediff.pl : an improved acediff
#
# gw3
#
# Usage : acediff.pl [-options]
#
# Last edited by: $Author: gw3 $
# Last edited on: $Date: 2011-04-26 15:49:01 $
use strict;
use Getopt::Long;
my ($test, $debug, $store, $file1, $file2, $output);
GetOptions (
"test" => \$test,
"debug:s" => \$debug,
"reference:s" => \$file1,
"new:s" => \$file2,
"output:s" => \$output,
)
or die("invalid commandline option\n");
;
# use the time and the process ID to make a unique file extension
my $time = time();
my $pid = "$$";
my $tmpdir = $ENV{TMPDIR} || '/tmp';
my $outfile1 = "$tmpdir/acediff1.$pid.$time.new";
my $outfile2 = "$tmpdir/acediff2.$pid.$time.new";
my $tmp = "$tmpdir/acediff.$pid.$time.tmp";
# Change input separator to paragraph mode, but store old mode in $oldlinesep
my $oldlinesep = $/;
$/ = "";
open (FILE, "<$file1") or die("cant open $file1 : $!\n");
open (PRE, ">$tmp") or die("cant open $tmp : $!\n");
while (my $record = <FILE>) {
while ($record =~ /^\s*\/\//) {$record =~ s/^\s*\/\/.*?\n//} # strip out any comments at the start of the record
$record =~ s/\t/ /g;
$record =~ s/\n/\t/g;
print PRE $record, "\n";
}
close (PRE);
close (FILE);
# sort the file - sometimes the sorted file doesn't appear - this is very odd - try a few times to make it
my $tries = 5;
while ($tries-- && ! -e "$outfile1") {
system("sort -S 4G $tmp -o $outfile1");
system('sleep 5'); # wait a few seconds for NFS to realise that there really is a file there
}
open (FILE, "<$file2") or die("cant open $file2 : $!\n");
open (PRE, ">$tmp") or die("cant open $tmp : $!\n");
while (my $record = <FILE>) {
while ($record =~ /^\s*\/\//) {$record =~ s/^\s*\/\/.*?\n//} # strip out any comments at the start of the record
$record =~ s/\t/ /g;
$record =~ s/\n/\t/g;
print PRE $record, "\n";
}
close (PRE);
close (FILE);
# sort the file - sometimes the sorted file doesn't appear - this is very odd - try a few times to make it
$tries = 5;
while ($tries-- && ! -e "$outfile2") {
system("sort -S 4G $tmp -o $outfile2");
system('sleep 5'); # wait a few seconds for NFS to realise that there really is a file there
}
# reset input line separator
$/= $oldlinesep;
# get diff output - use 'system' rather than $wormbase->run_command
# because diff returns '1' if there is a difference, 2 means error
my $status = system("/usr/bin/diff $outfile1 $outfile2 > $tmp");
if ( ( $status >> 8 ) == 2 ) {
die("diff exited with an error\n");
}
# now read in $tmp
# when there is an object that is new (no previous instance): output object
# when there is an object that is deleted (no subsequent instance): output -D object
# when there is an object that is changed: read both instances in and out deleted tags as -D tags and add new tags
# '>' is a new object
# '<' is an old object
my %new;
my %reference;
open (ACE, "<$tmp") or die("cant read from $tmp : $!\n");
while (my $line = <ACE>) {
# put the data into the hashes
my ($origin, $first, $rest) = ($line =~ /^([\>\<])\s+(.+?)\t(.+)/);
if (!defined $origin) {next}
if ($origin eq '>') {
# put tags in %new, keyed by class and ID line
$new{$first} = $rest;
} elsif ($origin eq '<') {
# put tags in %reference, keyed by class and ID line
$reference{$first} = $rest;
} else { # number or --- lines
# do nothing
}
}
close ACE;
# now go through %new and check in %reference to see if this is new or changed
# delete the %reference ones we find that are in %new
open (OUT, ">$output") || die("cantopen $output : $!\n");
my $started;
foreach my $newkey (keys %new) {
if (exists $reference{$newkey}) {
# changed data - store the tags in hases for comparison
my %n;
my %r;
my @new = split /\t/, $new{$newkey};
my @ref = split /\t/, $reference{$newkey};
foreach my $tag (@new) {$n{$tag}=1}
foreach my $tag (@ref) {$r{$tag}=1}
# removed tags
$started = 0;
foreach my $tag (keys %r) {
if (! exists $n{$tag}) {
if (!$started) {
print OUT "\n// Changed deleted data\n";
print OUT "\n$newkey\n";
$started = 1;
}
print OUT "-D $tag\n";
}
}
# added tags
$started = 0;
foreach my $tag (keys %n) {
if (! exists $r{$tag}) {
if (!$started) {
print OUT "\n// Changed added data\n";
print OUT "\n$newkey\n";
$started = 1;
}
print OUT "$tag\n";
}
}
delete $reference{$newkey};
} else {
# new data
print OUT "\n// New data\n";
print OUT "\n$newkey\n";
my $rest = $new{$newkey};
$rest =~ s/\t/\n/g;
print OUT "$rest";
}
}
# go through %reference to find the ones that are left - these are just deleted
foreach my $refkey (keys %reference) {
if (exists $reference{$refkey}) {
print OUT "\n// Deleted data\n";
print OUT "\n-D $refkey\n";
}
}
close(OUT);
# tidy up
unlink $outfile1;
unlink $outfile2;
unlink $tmp;
# Add perl documentation in POD format
# This should expand on your brief description above and
# add details of any options that can be used with the program.
# Such documentation can be viewed using the perldoc command.
__END__
=pod
=head2 NAME - acediff.pl
=head1 USAGE
=over 4
=item acediff.pl -reference file1.ace -new file2.ace -out changed_file.ace
=back
This script is a rewrite of acediff in perl. It takes a reference ace fiekl and a new ace file and writes out an ace file that will patch the old database to reflect the new data.
script_template.pl MANDATORY arguments:
=over 4
=item -reference file, the old ace file
=back
=over 4
=item -new file, the new ace file
=back
=over 4
=item -output file, the difference ace file
=back
script_template.pl OPTIONAL arguments:
=over 4
=item -h, Help
=back
=over 4
=item -debug, Debug mode, set this to the username who should receive the emailed log messages. The default is that everyone in the group receives them.
=back
=over 4
=item -test, Test mode, run the script, but don't change anything.
=back
=over 4
=item -verbose, output lots of chatty test messages
=back
=head1 REQUIREMENTS
=over 4
=item None at present.
=back
=head1 AUTHOR
=over 4
=item Gary Williams (gw3@sanger.ac.uk)
=back
=cut
#!/bin/env perl
use Ace;
my $db = Ace->connect(-path => shift()) || die(Ace->Error);
my $cds = $db->fetch(CDS => shift());
exit() unless $cds;
my $seq = $cds->Sequence;
my ($cds_child) = grep {/"$cds"/} split("\n",$seq->asAce);
$cds_child =~ s/(""\t)+(CDS_child)*/CDS_child\t/;
print "-D CDS : $cds\n\n";
print "Sequence : $seq\n$cds_child\n\n";
my @lastline;
my @clines = split("\n",$cds->asAce);
foreach my $line(@clines){
my @l = split("\t",$line);
for (my $i=0; $i < scalar @l; $i++){
$lastline[$i] = $l[$i] unless $l[$i]=~/""/;
}
if ($line=~/""/){
my @elements = split("\t",$line);
for (my $i=0; $i < scalar @elements; $i++){
if ($elements[$i]=~/""/){
print $lastline[$i],"\t";
}else{
print $elements[$i],"\t";
}
}
print "\n"
} else {print "$line\n"}
}
print "\n"
#!/usr/bin/env perl
# take a reference database and a handful of curation ones and udpdate the reference_db
use strict;
use Getopt::Long;
use Digest::MD5 qw(md5_hex);
use Ace;
use Ace::Sequence;
my ($sourceDB,$targetDB);
my @curationDBs;
my @tmpfiles;
my $tmp = $ENV{TMPDIR} || '/tmp/';
GetOptions(
'sourceDB=s' => \$targetDB,
)||die($!);
die("need a target\n") unless ($targetDB);
# setup
# * log
# * wormbase
my $tace = '/scratch/bin.LINUX_64/tace';
# my $sace = '/homes/mh6/project/software/packages/acedb/4.9.54/giface';
my $sace = '/homes/mh6/project/software/packages/acedb/4.9.60/giface';
# my $sace = '/scratch/bin.LINUX_64/giface';
# my $sace = '/scratch/ed_test/giface';
# 5.1) proteins / genomes
my @seqs = dump_get_Sequences($targetDB);
# 5.2) GFF
dump_gff($targetDB,@seqs);
#################################################### Functions ###########################
# dumps the curated featuers on sequences as GFF2
# arguments:
# (database, list of sequences to dump)
sub dump_gff{
my ($database,@seqs)=@_;
foreach my $s(@seqs){
my $cmd = "gif seqget $s; seqfeatures -version 2 -file $database/$s.gff2";
open (WRITEDB,"echo '$cmd' | $sace $database |")||die "$!\n";
while (my $line = <WRITEDB>) {
print "ERROR detected while GFF dumping $s:\n\t$line\ngiface: $cmd\n" if $line=~/ERROR/;
}
close WRITEDB;
system("cat $database/$s.gff2 >> $database/genome.gff2") && die($!);
unlink "$database/$s.gff2"||die($!);
}
}
# dumps genome, proteins and returns a list of genomic_canonical sequences
# arguments:
# (target database path)
sub dump_get_Sequences {
my $dbpath = shift;
# sequence bit
my @sequenceNames;
my $acedb = Ace->connect(-path => $dbpath) ||die(Ace->Error);
my @sequences = $acedb->fetch(-query => 'Find Sequence; Genomic_canonical');
#open(OUTF, ">$dbpath/genome.fas") ||die($!);
foreach my $s(@sequences){
# print OUTF $s->asDNA;
push @sequenceNames,"$s";
}
#close OUTF;
# protein bit
open(OUTF,">$dbpath/proteins.fas")||die($!);
my $cdses=$acedb->fetch_many(-query => 'Find CDS; Method=curated') ||die(Ace->Error);
while (my $cds = $cdses->next){
print OUTF $cds->asPeptide;
}
close OUTF;
return @sequenceNames;
}
#!/usr/bin/env perl
# fix the genespan based on it's connected CDSes
#
use Ace;
use Ace::Sequence;
use strict;
my $db = Ace->connect(-path => shift()) ||die(Ace->Error);
my $genes = $db->fetch_many(-query => 'Find Gene Corresponding_CDS');
while (my $gene = $genes->next){
my ($start,$stop,$seq);
next unless $gene->Species eq 'Brugia malayi';
foreach my $cds ($gene->Corresponding_CDS) {
my $c = Ace::Sequence->new($cds);
die("cannot find parent of $cds\n") unless $c;
$c->refseq($c->parent);
$start||=$c->start;
$stop||=$c->end;
if ($c->start > $c->end){ #reverse
$start = $c->start if $c->start > $start;
$stop = $c->end if $c->end < $stop;
}else{
$start = $c->start if $c->start < $start;
$stop = $c->end if $c->end > $stop;
}
$seq = $c->parent();
}
print<<HERE;
Sequence : $seq
Gene_child $gene $start $stop
HERE
}
query find CDS *;Species="Brugia malayi" AND method="curated"
Follow Gene
edit Laboratory ELG
edit Species "Brugia malayi"
edit Method Gene
edit Status Live
query !Version
edit Version 1
edit History Version_change 1 now WBPerson4055 Event Imported merge_split
save
quit
#!/usr/bin/env perl
# take a reference database and a handful of curation ones and udpdate the reference_db
use strict;
use Getopt::Long;
use Digest::MD5 qw(md5_hex);
use Ace;
use Ace::Sequence;
my ($sourceDB,$targetDB);
my @curationDBs;
my @tmpfiles;
my $tmp = $ENV{TMPDIR} || '/tmp';
GetOptions(
'curationDB=s' => \@curationDBs,
'sourceDB=s' => \$sourceDB,
'targetDB=s' => \$targetDB,
)||die($!);
die("need a source and target\n") unless ($sourceDB && $targetDB);
# setup
# * log
# * wormbase
my $tace = '/scratch/bin.LINUX_64/tace';
# my $sace = '/scratch/bin.LINUX_64/giface';
my $sace = '/scratch/ed_test/giface';
# grab the reference files
my $reference = dumpDB($sourceDB);
# foreach do
# 1.) dump CDS / Transcript / Sequence
my %curationFiles;
foreach my $d(@curationDBs){
$curationFiles{$d} = dumpDB($d);
}
# 2.) acediff them
acediff($reference,\%curationFiles);
# 3.) which CDS / Transcript was changed?
die("has hash collisions\n") if checkCollisions(\%curationFiles);
# 4.) load them
# 4.1) copy the reference database
system("mkdir -p $targetDB") && die($!);
system("cp -r $sourceDB/* $targetDB/") && die($!);
# 4.2) reformat to speed it up
# 4.3) load(file,user,db)
foreach my $db(values %curationFiles){
foreach my $file(values %$db){
loadace($file,$targetDB);
}
}
# fix gene tags
fix_gene_tags($targetDB);
# add new gene ids
add_gene_ids($targetDB);
# fix gene spans
fix_gene_spans($targetDB);
# 5.) dump them
# 5.1) proteins / genomes
my @seqs = dump_get_Sequences($targetDB);
# 5.2) GFF
dump_gff($targetDB,@seqs);
#################################################### Functions ###########################
# creates new tag gene_ids
# arguments:
# (acedb database)
sub add_gene_ids{
my ($db) = @_;
my @letters = qw(a b c d e f g h i j k l m n o p q r s t u v w x y z);
my $acedb = Ace->connect(-path => $db) ||die(Ace->Error);
$acedb->auto_save(1);
my $counter= 0;
my ($ace,$ace2);
my $it = $acedb->fetch_many(Gene => 'TAG*');
while (my $g = $it->next){
my $name = "$g";
my ($num) = $name=~/TAG(\d+)/;
$counter = $num if $num > $counter;
}
$it = $acedb->fetch_many(-query => 'Find Gene;Corresponding_CDS AND Species = "Brugia malayi"');
while (my $g = $it->next){
next if "$g"=~/^TAG\d+/;
$counter++;
print "renaming Gene : $g to TAG$counter\n";
$ace.="-R Gene : $g TAG$counter\n";
$ace2.="Gene : TAG$counter\nSequence_name Bmal$counter\nPublic_name Bm$counter\n\n";
my @cdses = $g->Corresponding_CDS;
if (scalar(@cdses) == 1){
my $c = $cdses[0];
$ace.="-R CDS : $c Bm$counter\n";
print "renaming CDS : $c to Bm$counter\n"
}else{
for (my $n=0;$n<scalar(@cdses);$n++){
my $c = $cdses[$n];
$ace.="-R CDS : $c Bm$counter".$letters[$n]."\n";
print "renaming CDS : $c to Bm$counter".$letters[$n]."\n"
}
}
}
# for non-coding transcripts
$it = $acedb->fetch_many(-query => 'Find Gene;Corresponding_Transcript AND Species = "Brugia malayi"');
while (my $g = $it->next){
next if "$g"=~/^TAG\d+/;
$counter++;
print "renaming Gene : $g to TAG$counter\n";
$ace.="-R Gene : $g TAG$counter\n";
$ace2.="Gene : TAG$counter\nSequence_name Bmal$counter\nPublic_name Bm$counter\n\n";
my @transcripts = $g->Corresponding_Transcript;
if (scalar(@transcripts) == 1){
my $c = $transcripts[0];
$ace.="-R transcript : $c Bm$counter\n";
print "renaming transcript : $c to Bm$counter\n"
}else{
for (my $n=0;$n<scalar(@transcripts);$n++){
my $c = $transcripts[$n];
$ace.="-R Transcript : $c Bm$counter".$letters[$n]."\n";
print "renaming Transcript : $c to Bm$counter".$letters[$n]."\n"
}
}
}
$acedb->close;
# that bit below is because the Ace->parse seems to not work
my $outf1 = "$tmp/".md5_hex($db) .':new_gene_rename';
push @tmpfiles,$outf1;
open OUTF,">$outf1";
print OUTF "$ace\n";
close OUTF;
my $outf2 = "$tmp/".md5_hex($db) .':new_gene_tags';
push @tmpfiles,$outf2;
open OUTF,">$outf2";
print OUTF "$ace2\n";
close OUTF;
loadace($outf1,$db);
loadace($outf2,$db);
}
# populates a minimum set of tags (tace commands are in a flatfile)
# arguments:
# (acedb database)
sub fix_gene_tags {
my ($db)=@_;
print `tace $db < fix_genes.tace`
}
# fixes the gene spans based on connected CDSes/transcripts
# arguments:
# (acedb database)
sub fix_gene_spans {
my $db = shift;
my $outfile = "$tmp/".md5_hex($db) .':gene_spans';
push @tmpfiles,$outfile;
open (OUTF,">$outfile")||die($!);
my $acedb = Ace->connect(-path => $db) ||die(Ace->Error);
my $genes = $acedb->fetch_many(-query => 'Find Gene');
while (my $gene = $genes->next){
my ($start,$stop,$seq);
next unless ($gene->Corresponding_CDS ||$gene->Corresponding_transcript);
next unless $gene->Species eq 'Brugia malayi';
foreach my $cds ($gene->Corresponding_CDS,$gene->Corresponding_Transcript) {
my $c = Ace::Sequence->new($cds) ||die("something fishy with $cds\n");
$c->refseq($c->parent);
$start||=$c->start;
$stop||=$c->end;
if ($c->start > $c->end){ #reverse
$start = $c->start if $c->start > $start;
$stop = $c->end if $c->end < $stop;
}else{
$start = $c->start if $c->start < $start;
$stop = $c->end if $c->end > $stop;
}
$seq = $c->parent();
}
print OUTF <<HERE;
Sequence : $seq
Gene_child $gene $start $stop
HERE
}
close OUTF;
loadace($outfile,$db);
}
# dumps the curated featuers on sequences as GFF2
# arguments:
# (database, list of sequences to dump)
sub dump_gff{
my ($database,@seqs)=@_;
foreach my $s(@seqs){
my $cmd = "gif seqget $s; seqfeatures -version 2 -file $database/$s.gff2";
open (WRITEDB,"echo '$cmd' | $sace $database |")||die "$!\n";
while (my $line = <WRITEDB>) {
print "ERROR detected while GFF dumping $s:\n\t$line\ngiface: $cmd\n" if $line=~/ERROR/;
}
close WRITEDB;
system("cat $database/$s.gff2 >> $database/genome.gff2") && die($!);
unlink "$database/$s.gff2"||die($!);
}
}
# dumps genome, proteins and returns a list of genomic_canonical sequences
# arguments:
# (target database path)
sub dump_get_Sequences {
my $dbpath = shift;
# sequence bit
my @sequenceNames;
my $acedb = Ace->connect(-path => $dbpath) ||die(Ace->Error);
my @sequences = $acedb->fetch(-query => 'Find Sequence; Genomic_canonical');
open(OUTF, ">$dbpath/genome.fas") ||die($!);
foreach my $s(@sequences){
print OUTF $s->asDNA;
push @sequenceNames,"$s";
}
close OUTF;
# protein bit
open(OUTF,">$dbpath/proteins.fas")||die($!);
my $cdses=$acedb->fetch_many(-query => 'Find CDS; Method=curated') ||die(Ace->Error);
while (my $cds = $cdses->next){
print OUTF $cds->asPeptide;
}
close OUTF;
return @sequenceNames;
}
# create the acediff files
# arguments:
# (reference database, curation database)
sub acediff{
my($r,$c)=@_;
foreach my $classes (values %$c){
while(my($class,$file) = each %$classes){
system("perl /scratch/tmp/acedb/acediff.pl -reference $$r{$class} -new $file -output $file.tmp") && die($!);
system("mv $file.tmp $file") && die($!);
}
}
return 1;
}
# clean up behind the script
END{
foreach my $file (@tmpfiles){
# unlink $file;
}
}
# check for hash collisions in the files
# arguments:
# hasref of hashrefs of curation files
# returns:
# number of hash collisions
sub checkCollisions{
my %databases = %{shift()};
my %tmpHash;
my $collisions;
while (my($directory,$classRef) = each %databases){
while (my($class,$file) = each %{$classRef}){
next if $class eq 'Sequence';
print "checking $class : $file\n" if $ENV{DEBUG};
open INF, $file;
while(my $line = <INF>){
if($line =~ /$class : \"(\S+)\"/){
print "$class $1\n" if $ENV{DEBUG};
if ($tmpHash{$class}{$1} && $directory ne $tmpHash{$class}{$1}){
print "ERROR: $directory $class : $1 conflicts with $tmpHash{$class}{$1}\n";
$collisions++;
}else{
$tmpHash{$class}{$1} = $directory;
}
}
}
close INF;
}
}
return $collisions;
}
# wrapper to dump a database CDS / Transcript / Sequence
# arguments:
# (database path)
# returns:
# hashref(Transcript => $file, CDS => $file, Sequence => $file)
sub dumpDB {
my ($db)=@_;
my %h;
foreach my $c (qw(CDS Transcript Sequence Pseudogene)){
my $file = "$tmp/". md5_hex($db) .":$c";
$h{$c}=dumpace($c,$file,$db);
}
return \%h;
}
# functions lifted from Paul
###########################################################
# data dumping routine
# arguments:
# (class to dump, file the dump ends up in, database)
sub dumpace {
my ($class,$filepath,$ddb) = @_;
my $command = "nosave\nquery find $class\nshow -a -f $filepath\nquit\n";
print " dumping $class Filename: $filepath -> $ddb\n";
open (TACE, "echo '$command' | $tace $ddb |" ) || die "Failed to open database connection to $ddb\n" ;
while (<TACE>) {
# keeps pipe open until command quit executes.
}
close TACE;
# reformat it
# system("perl reformat_acediff -file $filepath.tmp -fileout $filepath") && die($!);
# unlink("$filepath.tmp");
push @tmpfiles,$filepath; # cleanup hook
return $filepath;
}
# Loading function
# arguments:
# (file to load, database user, database)
sub loadace {
my ($filepath,$ldb) = @_;
my $command = "pparse $filepath\nsave\nquit\n";
print "parsing Filename: $filepath -> ${ldb}\n";
open (TACE, "| $tace $ldb -tsuser $ENV{USER}") || die "Couldn't open $ldb\n";
print TACE $command;
close TACE;
print "SUCCESS! Loaded $filepath into ${ldb}\n";
return 1;
}
=pod
=head2 NAME - merge_split_camaces
=head1 USAGE:
=over 4
=item merge_split_brugia.pl [-options]
=back
merge_split_brugia.pl is a wrapper to automate the process of merging brugia curation databases
merge_split_camaces mandatory arguments:
=over 4
=item reference
path to a reference database
=item curation
one or more curation databases to merge into the reference database
=item target
target path to not trample all over the reference database
=back
=head1 EXAMPLES:
=over 4
=item merge_split_brugia.pl -reference X -curation Y -curation Z -target W
=back
Dumps the relevant classes from camace_orig and the split databases, runs an
acediff to calculate the changes. This acediff file is then reformated to take
account of the acediff bug.
=head1 AUTHOR - Michael Paulini
Email michael.paulini@wormbase.org
=cut
#!/bin/env perl
use Getopt::Long;
my ($infile,$gff,$ace);
GetOptions(
'infile=s' => \$infile,
'gff' => \$gff,
'ace' => \$ace,
)||die($!);
open INF,$infile;
# $genes{seq}=@{%{start => 1, stop => 1, strand => '+', name => 'abc'},}
my %genes;
my %genes2operon;
my %operons;
# read genes
while(<INF>){
if (/gene\tgene/){
chomp;
my @F=split;
push(@{$genes{$F[0]}},{start => $F[3],stop => $F[4],strand => $F[6],name => $F[9]});
}
}
seek(INF,0,0);
my $count = 1;
while(<INF>){
if (/cufflinks_\S+\tCDS/){
chomp;
my @F=split;
my $operon = "BMOP$count";
my @genesHit = grep {$_->{start} <= $F[4] && $_->{stop} >= $F[3] && $_->{strand} eq $F[6] } @{$genes{$F[0]}};
next unless scalar @genesHit > 1;
my @genesHit = sort {$a->{start} <=> $b->{start}} @genesHit;
map {$operon = $genes2operon{$_->{name}} if $genes2operon{$_->{name}}} @genesHit;
map {$genes2operon{$_->{name}}=$operon} @genesHit;
if ($operons{$operon}){
${$operons{$operon}}->{start} = $genesHit[0]->{start} if $genesHit[0]->{start} < ${$operons{$operon}}->{start};
${$operons{$operon}}->{stop} = $genesHit[-1]->{stop} if $genesHit[-1]->{stop} > ${$operons{$operon}}->{stop};
}
else {
${$operons{$operon}}->{start} = $genesHit[0]->{start};
${$operons{$operon}}->{stop} = $genesHit[-1]->{stop};
${$operons{$operon}}->{strand} = $genesHit[0]->{strand};
${$operons{$operon}}->{seq} = $F[0];
$count++;
}
${$operons{$operon}}->{members} = {} unless ${$operons{$operon}}->{members};
map { ${$operons{$operon}}->{members}->{$_->{name}} = 1} @genesHit;
}
}
while (my($k,$v)=each %operons){
if ($gff){
print "$$v->{seq}\toperon\toperon\t$$v->{start}\t$$v->{stop}\t.$vv->{strand}\t\.\tOperon \"$k\"\n";
}
elsif($ace){
my ($start,$stop) = $$v->{strand} eq '+'?($$v->{start},$$v->{stop}):($$v->{stop},$$v->{start});
print "Sequence : $$v->{seq}\nOperon $k $start $stop\n\n";
print "Operon : $k\nCanonical_parent $$v->{seq}\nSpecies \"Brugia malayi\"\n";
map {print "Contains_Gene $_\n"} keys %{$$v->{members}};
print "Description \"Predicted operon from cufflinks transcripts\"\nMethod operon\n\n";
}
else{
print "$k\t$$v->{seq}\t$$v->{start}\t$$v->{stop}\t$$v->{strand}\t";
print join "\t",keys %{$$v->{members}};
print "\n"
}
}
#!/usr/bin/env perl
use Ace;
my $db= Ace->connect(-path => shift());
while(<>){
chomp;
s/\t\t/\t/g;
s/\tsimilarity\t/\tnucleotide_match\t/;
print;
if(/curated\sCDS/){
my @F= split;
$F[9]=~s/"//g;
$c=$db->fetch(CDS =>$F[9]);
unless($c){
die "cannot find CDS:$F[9]\n";
}
# Evidence / Curation status
if($c->Evidence){
print" ; Note \"Curated\""
}else{
print " ; Note \"Predicted\""
}
my $bId;
my $gene = $c->Gene;
die "buggy $c -> $gene connection\n" unless $gene;
# add the cgc name / sequence name
print " ; Note \"gene:".$gene->Public_name."\" ; Alias \"${\$gene->Public_name}\"";
my @remarks = $gene->at('Remark');
foreach my $r(@remarks){
print " ; Note \"remark:$r\""
}
# Brief Identification / UniProt
if ($gene->Ortholog){
my @orthologGenes= $gene->Ortholog;
foreach my $g(@orthologGenes){
next unless $g->Corresponding_CDS;
foreach my $cd($g->Corresponding_CDS){
if($cd->Brief_identification){
$bId = $cd->Brief_identification
}
}
}
}
print " ; Note \"description:$bId\"" if $bId;
}
print"\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment