Created
June 25, 2012 13:24
-
-
Save epaule/2988549 to your computer and use it in GitHub Desktop.
wrapper to automatically merge additional acedb databases into a reference one
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/software/bin/perl -w | |
# | |
# acediff.pl : an improved acediff | |
# | |
# gw3 | |
# | |
# Usage : acediff.pl [-options] | |
# | |
# Last edited by: $Author: gw3 $ | |
# Last edited on: $Date: 2011-04-26 15:49:01 $ | |
use strict; | |
use Getopt::Long; | |
my ($test, $debug, $store, $file1, $file2, $output); | |
GetOptions ( | |
"test" => \$test, | |
"debug:s" => \$debug, | |
"reference:s" => \$file1, | |
"new:s" => \$file2, | |
"output:s" => \$output, | |
) | |
or die("invalid commandline option\n"); | |
; | |
# use the time and the process ID to make a unique file extension | |
my $time = time(); | |
my $pid = "$$"; | |
my $tmpdir = $ENV{TMPDIR} || '/tmp'; | |
my $outfile1 = "$tmpdir/acediff1.$pid.$time.new"; | |
my $outfile2 = "$tmpdir/acediff2.$pid.$time.new"; | |
my $tmp = "$tmpdir/acediff.$pid.$time.tmp"; | |
# Change input separator to paragraph mode, but store old mode in $oldlinesep | |
my $oldlinesep = $/; | |
$/ = ""; | |
open (FILE, "<$file1") or die("cant open $file1 : $!\n"); | |
open (PRE, ">$tmp") or die("cant open $tmp : $!\n"); | |
while (my $record = <FILE>) { | |
while ($record =~ /^\s*\/\//) {$record =~ s/^\s*\/\/.*?\n//} # strip out any comments at the start of the record | |
$record =~ s/\t/ /g; | |
$record =~ s/\n/\t/g; | |
print PRE $record, "\n"; | |
} | |
close (PRE); | |
close (FILE); | |
# sort the file - sometimes the sorted file doesn't appear - this is very odd - try a few times to make it | |
my $tries = 5; | |
while ($tries-- && ! -e "$outfile1") { | |
system("sort -S 4G $tmp -o $outfile1"); | |
system('sleep 5'); # wait a few seconds for NFS to realise that there really is a file there | |
} | |
open (FILE, "<$file2") or die("cant open $file2 : $!\n"); | |
open (PRE, ">$tmp") or die("cant open $tmp : $!\n"); | |
while (my $record = <FILE>) { | |
while ($record =~ /^\s*\/\//) {$record =~ s/^\s*\/\/.*?\n//} # strip out any comments at the start of the record | |
$record =~ s/\t/ /g; | |
$record =~ s/\n/\t/g; | |
print PRE $record, "\n"; | |
} | |
close (PRE); | |
close (FILE); | |
# sort the file - sometimes the sorted file doesn't appear - this is very odd - try a few times to make it | |
$tries = 5; | |
while ($tries-- && ! -e "$outfile2") { | |
system("sort -S 4G $tmp -o $outfile2"); | |
system('sleep 5'); # wait a few seconds for NFS to realise that there really is a file there | |
} | |
# reset input line separator | |
$/= $oldlinesep; | |
# get diff output - use 'system' rather than $wormbase->run_command | |
# because diff returns '1' if there is a difference, 2 means error | |
my $status = system("/usr/bin/diff $outfile1 $outfile2 > $tmp"); | |
if ( ( $status >> 8 ) == 2 ) { | |
die("diff exited with an error\n"); | |
} | |
# now read in $tmp | |
# when there is an object that is new (no previous instance): output object | |
# when there is an object that is deleted (no subsequent instance): output -D object | |
# when there is an object that is changed: read both instances in and out deleted tags as -D tags and add new tags | |
# '>' is a new object | |
# '<' is an old object | |
my %new; | |
my %reference; | |
open (ACE, "<$tmp") or die("cant read from $tmp : $!\n"); | |
while (my $line = <ACE>) { | |
# put the data into the hashes | |
my ($origin, $first, $rest) = ($line =~ /^([\>\<])\s+(.+?)\t(.+)/); | |
if (!defined $origin) {next} | |
if ($origin eq '>') { | |
# put tags in %new, keyed by class and ID line | |
$new{$first} = $rest; | |
} elsif ($origin eq '<') { | |
# put tags in %reference, keyed by class and ID line | |
$reference{$first} = $rest; | |
} else { # number or --- lines | |
# do nothing | |
} | |
} | |
close ACE; | |
# now go through %new and check in %reference to see if this is new or changed | |
# delete the %reference ones we find that are in %new | |
open (OUT, ">$output") || die("cantopen $output : $!\n"); | |
my $started; | |
foreach my $newkey (keys %new) { | |
if (exists $reference{$newkey}) { | |
# changed data - store the tags in hases for comparison | |
my %n; | |
my %r; | |
my @new = split /\t/, $new{$newkey}; | |
my @ref = split /\t/, $reference{$newkey}; | |
foreach my $tag (@new) {$n{$tag}=1} | |
foreach my $tag (@ref) {$r{$tag}=1} | |
# removed tags | |
$started = 0; | |
foreach my $tag (keys %r) { | |
if (! exists $n{$tag}) { | |
if (!$started) { | |
print OUT "\n// Changed deleted data\n"; | |
print OUT "\n$newkey\n"; | |
$started = 1; | |
} | |
print OUT "-D $tag\n"; | |
} | |
} | |
# added tags | |
$started = 0; | |
foreach my $tag (keys %n) { | |
if (! exists $r{$tag}) { | |
if (!$started) { | |
print OUT "\n// Changed added data\n"; | |
print OUT "\n$newkey\n"; | |
$started = 1; | |
} | |
print OUT "$tag\n"; | |
} | |
} | |
delete $reference{$newkey}; | |
} else { | |
# new data | |
print OUT "\n// New data\n"; | |
print OUT "\n$newkey\n"; | |
my $rest = $new{$newkey}; | |
$rest =~ s/\t/\n/g; | |
print OUT "$rest"; | |
} | |
} | |
# go through %reference to find the ones that are left - these are just deleted | |
foreach my $refkey (keys %reference) { | |
if (exists $reference{$refkey}) { | |
print OUT "\n// Deleted data\n"; | |
print OUT "\n-D $refkey\n"; | |
} | |
} | |
close(OUT); | |
# tidy up | |
unlink $outfile1; | |
unlink $outfile2; | |
unlink $tmp; | |
# Add perl documentation in POD format | |
# This should expand on your brief description above and | |
# add details of any options that can be used with the program. | |
# Such documentation can be viewed using the perldoc command. | |
__END__ | |
=pod | |
=head2 NAME - acediff.pl | |
=head1 USAGE | |
=over 4 | |
=item acediff.pl -reference file1.ace -new file2.ace -out changed_file.ace | |
=back | |
This script is a rewrite of acediff in perl. It takes a reference ace fiekl and a new ace file and writes out an ace file that will patch the old database to reflect the new data. | |
script_template.pl MANDATORY arguments: | |
=over 4 | |
=item -reference file, the old ace file | |
=back | |
=over 4 | |
=item -new file, the new ace file | |
=back | |
=over 4 | |
=item -output file, the difference ace file | |
=back | |
script_template.pl OPTIONAL arguments: | |
=over 4 | |
=item -h, Help | |
=back | |
=over 4 | |
=item -debug, Debug mode, set this to the username who should receive the emailed log messages. The default is that everyone in the group receives them. | |
=back | |
=over 4 | |
=item -test, Test mode, run the script, but don't change anything. | |
=back | |
=over 4 | |
=item -verbose, output lots of chatty test messages | |
=back | |
=head1 REQUIREMENTS | |
=over 4 | |
=item None at present. | |
=back | |
=head1 AUTHOR | |
=over 4 | |
=item Gary Williams (gw3@sanger.ac.uk) | |
=back | |
=cut |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env perl | |
use Ace; | |
my $db = Ace->connect(-path => shift()) || die(Ace->Error); | |
my $cds = $db->fetch(CDS => shift()); | |
exit() unless $cds; | |
my $seq = $cds->Sequence; | |
my ($cds_child) = grep {/"$cds"/} split("\n",$seq->asAce); | |
$cds_child =~ s/(""\t)+(CDS_child)*/CDS_child\t/; | |
print "-D CDS : $cds\n\n"; | |
print "Sequence : $seq\n$cds_child\n\n"; | |
my @lastline; | |
my @clines = split("\n",$cds->asAce); | |
foreach my $line(@clines){ | |
my @l = split("\t",$line); | |
for (my $i=0; $i < scalar @l; $i++){ | |
$lastline[$i] = $l[$i] unless $l[$i]=~/""/; | |
} | |
if ($line=~/""/){ | |
my @elements = split("\t",$line); | |
for (my $i=0; $i < scalar @elements; $i++){ | |
if ($elements[$i]=~/""/){ | |
print $lastline[$i],"\t"; | |
}else{ | |
print $elements[$i],"\t"; | |
} | |
} | |
print "\n" | |
} else {print "$line\n"} | |
} | |
print "\n" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# take a reference database and a handful of curation ones and udpdate the reference_db | |
use strict; | |
use Getopt::Long; | |
use Digest::MD5 qw(md5_hex); | |
use Ace; | |
use Ace::Sequence; | |
my ($sourceDB,$targetDB); | |
my @curationDBs; | |
my @tmpfiles; | |
my $tmp = $ENV{TMPDIR} || '/tmp/'; | |
GetOptions( | |
'sourceDB=s' => \$targetDB, | |
)||die($!); | |
die("need a target\n") unless ($targetDB); | |
# setup | |
# * log | |
# * wormbase | |
my $tace = '/scratch/bin.LINUX_64/tace'; | |
# my $sace = '/homes/mh6/project/software/packages/acedb/4.9.54/giface'; | |
my $sace = '/homes/mh6/project/software/packages/acedb/4.9.60/giface'; | |
# my $sace = '/scratch/bin.LINUX_64/giface'; | |
# my $sace = '/scratch/ed_test/giface'; | |
# 5.1) proteins / genomes | |
my @seqs = dump_get_Sequences($targetDB); | |
# 5.2) GFF | |
dump_gff($targetDB,@seqs); | |
#################################################### Functions ########################### | |
# dumps the curated featuers on sequences as GFF2 | |
# arguments: | |
# (database, list of sequences to dump) | |
sub dump_gff{ | |
my ($database,@seqs)=@_; | |
foreach my $s(@seqs){ | |
my $cmd = "gif seqget $s; seqfeatures -version 2 -file $database/$s.gff2"; | |
open (WRITEDB,"echo '$cmd' | $sace $database |")||die "$!\n"; | |
while (my $line = <WRITEDB>) { | |
print "ERROR detected while GFF dumping $s:\n\t$line\ngiface: $cmd\n" if $line=~/ERROR/; | |
} | |
close WRITEDB; | |
system("cat $database/$s.gff2 >> $database/genome.gff2") && die($!); | |
unlink "$database/$s.gff2"||die($!); | |
} | |
} | |
# dumps genome, proteins and returns a list of genomic_canonical sequences | |
# arguments: | |
# (target database path) | |
sub dump_get_Sequences { | |
my $dbpath = shift; | |
# sequence bit | |
my @sequenceNames; | |
my $acedb = Ace->connect(-path => $dbpath) ||die(Ace->Error); | |
my @sequences = $acedb->fetch(-query => 'Find Sequence; Genomic_canonical'); | |
#open(OUTF, ">$dbpath/genome.fas") ||die($!); | |
foreach my $s(@sequences){ | |
# print OUTF $s->asDNA; | |
push @sequenceNames,"$s"; | |
} | |
#close OUTF; | |
# protein bit | |
open(OUTF,">$dbpath/proteins.fas")||die($!); | |
my $cdses=$acedb->fetch_many(-query => 'Find CDS; Method=curated') ||die(Ace->Error); | |
while (my $cds = $cdses->next){ | |
print OUTF $cds->asPeptide; | |
} | |
close OUTF; | |
return @sequenceNames; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# fix the genespan based on it's connected CDSes | |
# | |
use Ace; | |
use Ace::Sequence; | |
use strict; | |
my $db = Ace->connect(-path => shift()) ||die(Ace->Error); | |
my $genes = $db->fetch_many(-query => 'Find Gene Corresponding_CDS'); | |
while (my $gene = $genes->next){ | |
my ($start,$stop,$seq); | |
next unless $gene->Species eq 'Brugia malayi'; | |
foreach my $cds ($gene->Corresponding_CDS) { | |
my $c = Ace::Sequence->new($cds); | |
die("cannot find parent of $cds\n") unless $c; | |
$c->refseq($c->parent); | |
$start||=$c->start; | |
$stop||=$c->end; | |
if ($c->start > $c->end){ #reverse | |
$start = $c->start if $c->start > $start; | |
$stop = $c->end if $c->end < $stop; | |
}else{ | |
$start = $c->start if $c->start < $start; | |
$stop = $c->end if $c->end > $stop; | |
} | |
$seq = $c->parent(); | |
} | |
print<<HERE; | |
Sequence : $seq | |
Gene_child $gene $start $stop | |
HERE | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
query find CDS *;Species="Brugia malayi" AND method="curated" | |
Follow Gene | |
edit Laboratory ELG | |
edit Species "Brugia malayi" | |
edit Method Gene | |
edit Status Live | |
query !Version | |
edit Version 1 | |
edit History Version_change 1 now WBPerson4055 Event Imported merge_split | |
save | |
quit | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# take a reference database and a handful of curation ones and udpdate the reference_db | |
use strict; | |
use Getopt::Long; | |
use Digest::MD5 qw(md5_hex); | |
use Ace; | |
use Ace::Sequence; | |
my ($sourceDB,$targetDB); | |
my @curationDBs; | |
my @tmpfiles; | |
my $tmp = $ENV{TMPDIR} || '/tmp'; | |
GetOptions( | |
'curationDB=s' => \@curationDBs, | |
'sourceDB=s' => \$sourceDB, | |
'targetDB=s' => \$targetDB, | |
)||die($!); | |
die("need a source and target\n") unless ($sourceDB && $targetDB); | |
# setup | |
# * log | |
# * wormbase | |
my $tace = '/scratch/bin.LINUX_64/tace'; | |
# my $sace = '/scratch/bin.LINUX_64/giface'; | |
my $sace = '/scratch/ed_test/giface'; | |
# grab the reference files | |
my $reference = dumpDB($sourceDB); | |
# foreach do | |
# 1.) dump CDS / Transcript / Sequence | |
my %curationFiles; | |
foreach my $d(@curationDBs){ | |
$curationFiles{$d} = dumpDB($d); | |
} | |
# 2.) acediff them | |
acediff($reference,\%curationFiles); | |
# 3.) which CDS / Transcript was changed? | |
die("has hash collisions\n") if checkCollisions(\%curationFiles); | |
# 4.) load them | |
# 4.1) copy the reference database | |
system("mkdir -p $targetDB") && die($!); | |
system("cp -r $sourceDB/* $targetDB/") && die($!); | |
# 4.2) reformat to speed it up | |
# 4.3) load(file,user,db) | |
foreach my $db(values %curationFiles){ | |
foreach my $file(values %$db){ | |
loadace($file,$targetDB); | |
} | |
} | |
# fix gene tags | |
fix_gene_tags($targetDB); | |
# add new gene ids | |
add_gene_ids($targetDB); | |
# fix gene spans | |
fix_gene_spans($targetDB); | |
# 5.) dump them | |
# 5.1) proteins / genomes | |
my @seqs = dump_get_Sequences($targetDB); | |
# 5.2) GFF | |
dump_gff($targetDB,@seqs); | |
#################################################### Functions ########################### | |
# creates new tag gene_ids | |
# arguments: | |
# (acedb database) | |
sub add_gene_ids{ | |
my ($db) = @_; | |
my @letters = qw(a b c d e f g h i j k l m n o p q r s t u v w x y z); | |
my $acedb = Ace->connect(-path => $db) ||die(Ace->Error); | |
$acedb->auto_save(1); | |
my $counter= 0; | |
my ($ace,$ace2); | |
my $it = $acedb->fetch_many(Gene => 'TAG*'); | |
while (my $g = $it->next){ | |
my $name = "$g"; | |
my ($num) = $name=~/TAG(\d+)/; | |
$counter = $num if $num > $counter; | |
} | |
$it = $acedb->fetch_many(-query => 'Find Gene;Corresponding_CDS AND Species = "Brugia malayi"'); | |
while (my $g = $it->next){ | |
next if "$g"=~/^TAG\d+/; | |
$counter++; | |
print "renaming Gene : $g to TAG$counter\n"; | |
$ace.="-R Gene : $g TAG$counter\n"; | |
$ace2.="Gene : TAG$counter\nSequence_name Bmal$counter\nPublic_name Bm$counter\n\n"; | |
my @cdses = $g->Corresponding_CDS; | |
if (scalar(@cdses) == 1){ | |
my $c = $cdses[0]; | |
$ace.="-R CDS : $c Bm$counter\n"; | |
print "renaming CDS : $c to Bm$counter\n" | |
}else{ | |
for (my $n=0;$n<scalar(@cdses);$n++){ | |
my $c = $cdses[$n]; | |
$ace.="-R CDS : $c Bm$counter".$letters[$n]."\n"; | |
print "renaming CDS : $c to Bm$counter".$letters[$n]."\n" | |
} | |
} | |
} | |
# for non-coding transcripts | |
$it = $acedb->fetch_many(-query => 'Find Gene;Corresponding_Transcript AND Species = "Brugia malayi"'); | |
while (my $g = $it->next){ | |
next if "$g"=~/^TAG\d+/; | |
$counter++; | |
print "renaming Gene : $g to TAG$counter\n"; | |
$ace.="-R Gene : $g TAG$counter\n"; | |
$ace2.="Gene : TAG$counter\nSequence_name Bmal$counter\nPublic_name Bm$counter\n\n"; | |
my @transcripts = $g->Corresponding_Transcript; | |
if (scalar(@transcripts) == 1){ | |
my $c = $transcripts[0]; | |
$ace.="-R transcript : $c Bm$counter\n"; | |
print "renaming transcript : $c to Bm$counter\n" | |
}else{ | |
for (my $n=0;$n<scalar(@transcripts);$n++){ | |
my $c = $transcripts[$n]; | |
$ace.="-R Transcript : $c Bm$counter".$letters[$n]."\n"; | |
print "renaming Transcript : $c to Bm$counter".$letters[$n]."\n" | |
} | |
} | |
} | |
$acedb->close; | |
# that bit below is because the Ace->parse seems to not work | |
my $outf1 = "$tmp/".md5_hex($db) .':new_gene_rename'; | |
push @tmpfiles,$outf1; | |
open OUTF,">$outf1"; | |
print OUTF "$ace\n"; | |
close OUTF; | |
my $outf2 = "$tmp/".md5_hex($db) .':new_gene_tags'; | |
push @tmpfiles,$outf2; | |
open OUTF,">$outf2"; | |
print OUTF "$ace2\n"; | |
close OUTF; | |
loadace($outf1,$db); | |
loadace($outf2,$db); | |
} | |
# populates a minimum set of tags (tace commands are in a flatfile) | |
# arguments: | |
# (acedb database) | |
sub fix_gene_tags { | |
my ($db)=@_; | |
print `tace $db < fix_genes.tace` | |
} | |
# fixes the gene spans based on connected CDSes/transcripts | |
# arguments: | |
# (acedb database) | |
sub fix_gene_spans { | |
my $db = shift; | |
my $outfile = "$tmp/".md5_hex($db) .':gene_spans'; | |
push @tmpfiles,$outfile; | |
open (OUTF,">$outfile")||die($!); | |
my $acedb = Ace->connect(-path => $db) ||die(Ace->Error); | |
my $genes = $acedb->fetch_many(-query => 'Find Gene'); | |
while (my $gene = $genes->next){ | |
my ($start,$stop,$seq); | |
next unless ($gene->Corresponding_CDS ||$gene->Corresponding_transcript); | |
next unless $gene->Species eq 'Brugia malayi'; | |
foreach my $cds ($gene->Corresponding_CDS,$gene->Corresponding_Transcript) { | |
my $c = Ace::Sequence->new($cds) ||die("something fishy with $cds\n"); | |
$c->refseq($c->parent); | |
$start||=$c->start; | |
$stop||=$c->end; | |
if ($c->start > $c->end){ #reverse | |
$start = $c->start if $c->start > $start; | |
$stop = $c->end if $c->end < $stop; | |
}else{ | |
$start = $c->start if $c->start < $start; | |
$stop = $c->end if $c->end > $stop; | |
} | |
$seq = $c->parent(); | |
} | |
print OUTF <<HERE; | |
Sequence : $seq | |
Gene_child $gene $start $stop | |
HERE | |
} | |
close OUTF; | |
loadace($outfile,$db); | |
} | |
# dumps the curated featuers on sequences as GFF2 | |
# arguments: | |
# (database, list of sequences to dump) | |
sub dump_gff{ | |
my ($database,@seqs)=@_; | |
foreach my $s(@seqs){ | |
my $cmd = "gif seqget $s; seqfeatures -version 2 -file $database/$s.gff2"; | |
open (WRITEDB,"echo '$cmd' | $sace $database |")||die "$!\n"; | |
while (my $line = <WRITEDB>) { | |
print "ERROR detected while GFF dumping $s:\n\t$line\ngiface: $cmd\n" if $line=~/ERROR/; | |
} | |
close WRITEDB; | |
system("cat $database/$s.gff2 >> $database/genome.gff2") && die($!); | |
unlink "$database/$s.gff2"||die($!); | |
} | |
} | |
# dumps genome, proteins and returns a list of genomic_canonical sequences | |
# arguments: | |
# (target database path) | |
sub dump_get_Sequences { | |
my $dbpath = shift; | |
# sequence bit | |
my @sequenceNames; | |
my $acedb = Ace->connect(-path => $dbpath) ||die(Ace->Error); | |
my @sequences = $acedb->fetch(-query => 'Find Sequence; Genomic_canonical'); | |
open(OUTF, ">$dbpath/genome.fas") ||die($!); | |
foreach my $s(@sequences){ | |
print OUTF $s->asDNA; | |
push @sequenceNames,"$s"; | |
} | |
close OUTF; | |
# protein bit | |
open(OUTF,">$dbpath/proteins.fas")||die($!); | |
my $cdses=$acedb->fetch_many(-query => 'Find CDS; Method=curated') ||die(Ace->Error); | |
while (my $cds = $cdses->next){ | |
print OUTF $cds->asPeptide; | |
} | |
close OUTF; | |
return @sequenceNames; | |
} | |
# create the acediff files | |
# arguments: | |
# (reference database, curation database) | |
sub acediff{ | |
my($r,$c)=@_; | |
foreach my $classes (values %$c){ | |
while(my($class,$file) = each %$classes){ | |
system("perl /scratch/tmp/acedb/acediff.pl -reference $$r{$class} -new $file -output $file.tmp") && die($!); | |
system("mv $file.tmp $file") && die($!); | |
} | |
} | |
return 1; | |
} | |
# clean up behind the script | |
END{ | |
foreach my $file (@tmpfiles){ | |
# unlink $file; | |
} | |
} | |
# check for hash collisions in the files | |
# arguments: | |
# hasref of hashrefs of curation files | |
# returns: | |
# number of hash collisions | |
sub checkCollisions{ | |
my %databases = %{shift()}; | |
my %tmpHash; | |
my $collisions; | |
while (my($directory,$classRef) = each %databases){ | |
while (my($class,$file) = each %{$classRef}){ | |
next if $class eq 'Sequence'; | |
print "checking $class : $file\n" if $ENV{DEBUG}; | |
open INF, $file; | |
while(my $line = <INF>){ | |
if($line =~ /$class : \"(\S+)\"/){ | |
print "$class $1\n" if $ENV{DEBUG}; | |
if ($tmpHash{$class}{$1} && $directory ne $tmpHash{$class}{$1}){ | |
print "ERROR: $directory $class : $1 conflicts with $tmpHash{$class}{$1}\n"; | |
$collisions++; | |
}else{ | |
$tmpHash{$class}{$1} = $directory; | |
} | |
} | |
} | |
close INF; | |
} | |
} | |
return $collisions; | |
} | |
# wrapper to dump a database CDS / Transcript / Sequence | |
# arguments: | |
# (database path) | |
# returns: | |
# hashref(Transcript => $file, CDS => $file, Sequence => $file) | |
sub dumpDB { | |
my ($db)=@_; | |
my %h; | |
foreach my $c (qw(CDS Transcript Sequence Pseudogene)){ | |
my $file = "$tmp/". md5_hex($db) .":$c"; | |
$h{$c}=dumpace($c,$file,$db); | |
} | |
return \%h; | |
} | |
# functions lifted from Paul | |
########################################################### | |
# data dumping routine | |
# arguments: | |
# (class to dump, file the dump ends up in, database) | |
sub dumpace { | |
my ($class,$filepath,$ddb) = @_; | |
my $command = "nosave\nquery find $class\nshow -a -f $filepath\nquit\n"; | |
print " dumping $class Filename: $filepath -> $ddb\n"; | |
open (TACE, "echo '$command' | $tace $ddb |" ) || die "Failed to open database connection to $ddb\n" ; | |
while (<TACE>) { | |
# keeps pipe open until command quit executes. | |
} | |
close TACE; | |
# reformat it | |
# system("perl reformat_acediff -file $filepath.tmp -fileout $filepath") && die($!); | |
# unlink("$filepath.tmp"); | |
push @tmpfiles,$filepath; # cleanup hook | |
return $filepath; | |
} | |
# Loading function | |
# arguments: | |
# (file to load, database user, database) | |
sub loadace { | |
my ($filepath,$ldb) = @_; | |
my $command = "pparse $filepath\nsave\nquit\n"; | |
print "parsing Filename: $filepath -> ${ldb}\n"; | |
open (TACE, "| $tace $ldb -tsuser $ENV{USER}") || die "Couldn't open $ldb\n"; | |
print TACE $command; | |
close TACE; | |
print "SUCCESS! Loaded $filepath into ${ldb}\n"; | |
return 1; | |
} | |
=pod | |
=head2 NAME - merge_split_camaces | |
=head1 USAGE: | |
=over 4 | |
=item merge_split_brugia.pl [-options] | |
=back | |
merge_split_brugia.pl is a wrapper to automate the process of merging brugia curation databases | |
merge_split_camaces mandatory arguments: | |
=over 4 | |
=item reference | |
path to a reference database | |
=item curation | |
one or more curation databases to merge into the reference database | |
=item target | |
target path to not trample all over the reference database | |
=back | |
=head1 EXAMPLES: | |
=over 4 | |
=item merge_split_brugia.pl -reference X -curation Y -curation Z -target W | |
=back | |
Dumps the relevant classes from camace_orig and the split databases, runs an | |
acediff to calculate the changes. This acediff file is then reformated to take | |
account of the acediff bug. | |
=head1 AUTHOR - Michael Paulini | |
Email michael.paulini@wormbase.org | |
=cut |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env perl | |
use Getopt::Long; | |
my ($infile,$gff,$ace); | |
GetOptions( | |
'infile=s' => \$infile, | |
'gff' => \$gff, | |
'ace' => \$ace, | |
)||die($!); | |
open INF,$infile; | |
# $genes{seq}=@{%{start => 1, stop => 1, strand => '+', name => 'abc'},} | |
my %genes; | |
my %genes2operon; | |
my %operons; | |
# read genes | |
while(<INF>){ | |
if (/gene\tgene/){ | |
chomp; | |
my @F=split; | |
push(@{$genes{$F[0]}},{start => $F[3],stop => $F[4],strand => $F[6],name => $F[9]}); | |
} | |
} | |
seek(INF,0,0); | |
my $count = 1; | |
while(<INF>){ | |
if (/cufflinks_\S+\tCDS/){ | |
chomp; | |
my @F=split; | |
my $operon = "BMOP$count"; | |
my @genesHit = grep {$_->{start} <= $F[4] && $_->{stop} >= $F[3] && $_->{strand} eq $F[6] } @{$genes{$F[0]}}; | |
next unless scalar @genesHit > 1; | |
my @genesHit = sort {$a->{start} <=> $b->{start}} @genesHit; | |
map {$operon = $genes2operon{$_->{name}} if $genes2operon{$_->{name}}} @genesHit; | |
map {$genes2operon{$_->{name}}=$operon} @genesHit; | |
if ($operons{$operon}){ | |
${$operons{$operon}}->{start} = $genesHit[0]->{start} if $genesHit[0]->{start} < ${$operons{$operon}}->{start}; | |
${$operons{$operon}}->{stop} = $genesHit[-1]->{stop} if $genesHit[-1]->{stop} > ${$operons{$operon}}->{stop}; | |
} | |
else { | |
${$operons{$operon}}->{start} = $genesHit[0]->{start}; | |
${$operons{$operon}}->{stop} = $genesHit[-1]->{stop}; | |
${$operons{$operon}}->{strand} = $genesHit[0]->{strand}; | |
${$operons{$operon}}->{seq} = $F[0]; | |
$count++; | |
} | |
${$operons{$operon}}->{members} = {} unless ${$operons{$operon}}->{members}; | |
map { ${$operons{$operon}}->{members}->{$_->{name}} = 1} @genesHit; | |
} | |
} | |
while (my($k,$v)=each %operons){ | |
if ($gff){ | |
print "$$v->{seq}\toperon\toperon\t$$v->{start}\t$$v->{stop}\t.$vv->{strand}\t\.\tOperon \"$k\"\n"; | |
} | |
elsif($ace){ | |
my ($start,$stop) = $$v->{strand} eq '+'?($$v->{start},$$v->{stop}):($$v->{stop},$$v->{start}); | |
print "Sequence : $$v->{seq}\nOperon $k $start $stop\n\n"; | |
print "Operon : $k\nCanonical_parent $$v->{seq}\nSpecies \"Brugia malayi\"\n"; | |
map {print "Contains_Gene $_\n"} keys %{$$v->{members}}; | |
print "Description \"Predicted operon from cufflinks transcripts\"\nMethod operon\n\n"; | |
} | |
else{ | |
print "$k\t$$v->{seq}\t$$v->{start}\t$$v->{stop}\t$$v->{strand}\t"; | |
print join "\t",keys %{$$v->{members}}; | |
print "\n" | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use Ace; | |
my $db= Ace->connect(-path => shift()); | |
while(<>){ | |
chomp; | |
s/\t\t/\t/g; | |
s/\tsimilarity\t/\tnucleotide_match\t/; | |
print; | |
if(/curated\sCDS/){ | |
my @F= split; | |
$F[9]=~s/"//g; | |
$c=$db->fetch(CDS =>$F[9]); | |
unless($c){ | |
die "cannot find CDS:$F[9]\n"; | |
} | |
# Evidence / Curation status | |
if($c->Evidence){ | |
print" ; Note \"Curated\"" | |
}else{ | |
print " ; Note \"Predicted\"" | |
} | |
my $bId; | |
my $gene = $c->Gene; | |
die "buggy $c -> $gene connection\n" unless $gene; | |
# add the cgc name / sequence name | |
print " ; Note \"gene:".$gene->Public_name."\" ; Alias \"${\$gene->Public_name}\""; | |
my @remarks = $gene->at('Remark'); | |
foreach my $r(@remarks){ | |
print " ; Note \"remark:$r\"" | |
} | |
# Brief Identification / UniProt | |
if ($gene->Ortholog){ | |
my @orthologGenes= $gene->Ortholog; | |
foreach my $g(@orthologGenes){ | |
next unless $g->Corresponding_CDS; | |
foreach my $cd($g->Corresponding_CDS){ | |
if($cd->Brief_identification){ | |
$bId = $cd->Brief_identification | |
} | |
} | |
} | |
} | |
print " ; Note \"description:$bId\"" if $bId; | |
} | |
print"\n"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment