Created
June 25, 2012 13:23
-
-
Save epaule/2988540 to your computer and use it in GitHub Desktop.
Ace format diff
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/software/bin/perl -w | |
# | |
# acediff.pl : an improved acediff | |
# | |
# gw3 | |
# | |
# Usage : acediff.pl [-options] | |
# | |
# Last edited by: $Author: gw3 $ | |
# Last edited on: $Date: 2011-04-26 15:49:01 $ | |
use strict; | |
use Getopt::Long; | |
my ($test, $debug, $store, $file1, $file2, $output); | |
GetOptions ( | |
"test" => \$test, | |
"debug:s" => \$debug, | |
"reference:s" => \$file1, | |
"new:s" => \$file2, | |
"output:s" => \$output, | |
) | |
or die("invalid commandline option\n"); | |
; | |
# use the time and the process ID to make a unique file extension | |
my $time = time(); | |
my $pid = "$$"; | |
my $tmpdir = $ENV{TMPDIR} || '/tmp'; | |
my $outfile1 = "$tmpdir/acediff1.$pid.$time.new"; | |
my $outfile2 = "$tmpdir/acediff2.$pid.$time.new"; | |
my $tmp = "$tmpdir/acediff.$pid.$time.tmp"; | |
# Change input separator to paragraph mode, but store old mode in $oldlinesep | |
my $oldlinesep = $/; | |
$/ = ""; | |
open (FILE, "<$file1") or die("cant open $file1 : $!\n"); | |
open (PRE, ">$tmp") or die("cant open $tmp : $!\n"); | |
while (my $record = <FILE>) { | |
while ($record =~ /^\s*\/\//) {$record =~ s/^\s*\/\/.*?\n//} # strip out any comments at the start of the record | |
$record =~ s/\t/ /g; | |
$record =~ s/\n/\t/g; | |
print PRE $record, "\n"; | |
} | |
close (PRE); | |
close (FILE); | |
# sort the file - sometimes the sorted file doesn't appear - this is very odd - try a few times to make it | |
my $tries = 5; | |
while ($tries-- && ! -e "$outfile1") { | |
system("sort -S 4G $tmp -o $outfile1"); | |
system('sleep 5'); # wait a few seconds for NFS to realise that there really is a file there | |
} | |
open (FILE, "<$file2") or die("cant open $file2 : $!\n"); | |
open (PRE, ">$tmp") or die("cant open $tmp : $!\n"); | |
while (my $record = <FILE>) { | |
while ($record =~ /^\s*\/\//) {$record =~ s/^\s*\/\/.*?\n//} # strip out any comments at the start of the record | |
$record =~ s/\t/ /g; | |
$record =~ s/\n/\t/g; | |
print PRE $record, "\n"; | |
} | |
close (PRE); | |
close (FILE); | |
# sort the file - sometimes the sorted file doesn't appear - this is very odd - try a few times to make it | |
$tries = 5; | |
while ($tries-- && ! -e "$outfile2") { | |
system("sort -S 4G $tmp -o $outfile2"); | |
system('sleep 5'); # wait a few seconds for NFS to realise that there really is a file there | |
} | |
# reset input line separator | |
$/= $oldlinesep; | |
# get diff output - use 'system' rather than $wormbase->run_command | |
# because diff returns '1' if there is a difference, 2 means error | |
my $status = system("/usr/bin/diff $outfile1 $outfile2 > $tmp"); | |
if ( ( $status >> 8 ) == 2 ) { | |
die("diff exited with an error\n"); | |
} | |
# now read in $tmp | |
# when there is an object that is new (no previous instance): output object | |
# when there is an object that is deleted (no subsequent instance): output -D object | |
# when there is an object that is changed: read both instances in and out deleted tags as -D tags and add new tags | |
# '>' is a new object | |
# '<' is an old object | |
my %new; | |
my %reference; | |
open (ACE, "<$tmp") or die("cant read from $tmp : $!\n"); | |
while (my $line = <ACE>) { | |
# put the data into the hashes | |
my ($origin, $first, $rest) = ($line =~ /^([\>\<])\s+(.+?)\t(.+)/); | |
if (!defined $origin) {next} | |
if ($origin eq '>') { | |
# put tags in %new, keyed by class and ID line | |
$new{$first} = $rest; | |
} elsif ($origin eq '<') { | |
# put tags in %reference, keyed by class and ID line | |
$reference{$first} = $rest; | |
} else { # number or --- lines | |
# do nothing | |
} | |
} | |
close ACE; | |
# now go through %new and check in %reference to see if this is new or changed | |
# delete the %reference ones we find that are in %new | |
open (OUT, ">$output") || die("cantopen $output : $!\n"); | |
my $started; | |
foreach my $newkey (keys %new) { | |
if (exists $reference{$newkey}) { | |
# changed data - store the tags in hases for comparison | |
my %n; | |
my %r; | |
my @new = split /\t/, $new{$newkey}; | |
my @ref = split /\t/, $reference{$newkey}; | |
foreach my $tag (@new) {$n{$tag}=1} | |
foreach my $tag (@ref) {$r{$tag}=1} | |
# removed tags | |
$started = 0; | |
foreach my $tag (keys %r) { | |
if (! exists $n{$tag}) { | |
if (!$started) { | |
print OUT "\n// Changed deleted data\n"; | |
print OUT "\n$newkey\n"; | |
$started = 1; | |
} | |
print OUT "-D $tag\n"; | |
} | |
} | |
# added tags | |
$started = 0; | |
foreach my $tag (keys %n) { | |
if (! exists $r{$tag}) { | |
if (!$started) { | |
print OUT "\n// Changed added data\n"; | |
print OUT "\n$newkey\n"; | |
$started = 1; | |
} | |
print OUT "$tag\n"; | |
} | |
} | |
delete $reference{$newkey}; | |
} else { | |
# new data | |
print OUT "\n// New data\n"; | |
print OUT "\n$newkey\n"; | |
my $rest = $new{$newkey}; | |
$rest =~ s/\t/\n/g; | |
print OUT "$rest"; | |
} | |
} | |
# go through %reference to find the ones that are left - these are just deleted | |
foreach my $refkey (keys %reference) { | |
if (exists $reference{$refkey}) { | |
print OUT "\n// Deleted data\n"; | |
print OUT "\n-D $refkey\n"; | |
} | |
} | |
close(OUT); | |
# tidy up | |
unlink $outfile1; | |
unlink $outfile2; | |
unlink $tmp; | |
# Add perl documentation in POD format | |
# This should expand on your brief description above and | |
# add details of any options that can be used with the program. | |
# Such documentation can be viewed using the perldoc command. | |
__END__ | |
=pod | |
=head2 NAME - acediff.pl | |
=head1 USAGE | |
=over 4 | |
=item acediff.pl -reference file1.ace -new file2.ace -out changed_file.ace | |
=back | |
This script is a rewrite of acediff in perl. It takes a reference ace fiekl and a new ace file and writes out an ace file that will patch the old database to reflect the new data. | |
script_template.pl MANDATORY arguments: | |
=over 4 | |
=item -reference file, the old ace file | |
=back | |
=over 4 | |
=item -new file, the new ace file | |
=back | |
=over 4 | |
=item -output file, the difference ace file | |
=back | |
script_template.pl OPTIONAL arguments: | |
=over 4 | |
=item -h, Help | |
=back | |
=over 4 | |
=item -debug, Debug mode, set this to the username who should receive the emailed log messages. The default is that everyone in the group receives them. | |
=back | |
=over 4 | |
=item -test, Test mode, run the script, but don't change anything. | |
=back | |
=over 4 | |
=item -verbose, output lots of chatty test messages | |
=back | |
=head1 REQUIREMENTS | |
=over 4 | |
=item None at present. | |
=back | |
=head1 AUTHOR | |
=over 4 | |
=item Gary Williams (gw3@sanger.ac.uk) | |
=back | |
=cut |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment