Created
April 16, 2021 10:17
-
-
Save mdondrup/81927ccdb13d4a130a8a6631ca0ed73c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use Bio::Tools::GFF; | |
use Bio::SeqFeatureI; | |
my %fstore1 = (); | |
my %fstore2 = (); | |
my %conflicts = (); | |
# specify 2 input files to compare | |
my $gffio1 = Bio::Tools::GFF->new(-file => $ARGV[0], -gff_version => 3); | |
my $gffio2 = Bio::Tools::GFF->new(-file => $ARGV[1], -gff_version => 3); | |
# gff file that contains new features in file 2 | |
my $create = Bio::Tools::GFF->new(-gff_version => 3, | |
-file => ">gffdiff_create.gff3"); | |
# gff file that contains the fieatures missing in file 2 from file 1 | |
my $delete = Bio::Tools::GFF->new(-gff_version => 3, | |
-file => ">gffdiff_delete.gff3"); | |
# file with conflicting features, e.g. position identical but different ID | |
my $conffh; | |
open ($conffh, '>' , "gffdiff_conflicts") or die "couldn't open gffdiff_conflicts $!\n"; | |
# loop over the input stream file 1 | |
while(my $feature = $gffio1->next_feature()) { | |
my $id = ($feature->primary_id) ? | |
$feature->primary_id : new_id($feature); | |
### some sanity checks | |
die "something went wrong with the ID" unless $id; | |
die "duplicate feature ID $id" if exists $fstore1{$id}; | |
$fstore1{$id} = $feature; | |
} | |
print "read ".scalar ( keys %fstore1)." unique features from file 1\n"; | |
$gffio1->close(); | |
# read features of second file | |
my $new = my $changed = my $conflicts = my $deleted = 0; | |
while(my $feature = $gffio2->next_feature()) { | |
my $id = ($feature->primary_id) ? | |
$feature->primary_id : new_id($feature); | |
$fstore2{$id} = $feature; | |
### some sanity checks | |
die "something went wrong with the ID" unless $id; | |
my @what = (); | |
if (my $a = is_new($id, $feature)) { | |
# print "Feature $id is new\n"; | |
if (ref $a) { | |
$conflicts++; | |
## store conflict not to delete it later | |
$conflicts{new_id($a)} = 1; | |
print $conffh " >>>>> ".$ARGV[0]."\nC: ".$a->gff_string()."\n-----\n"; | |
print $conffh " <<<<< " .$ARGV[1]."\nC: ".$feature->gff_string()."\n<<<<<\n"; | |
} else { | |
$create->write_feature($feature); | |
$new++; | |
} | |
} | |
elsif (@what = what_changed($id, $feature)) { | |
#print "Feature $id has different ". (join " and ", @what)."\n"; | |
#print "- ".$fstore1{$id}->gff_string."\n"; | |
$delete->write_feature($fstore1{$id}); | |
#print "+ ".$feature->gff_string()."\n"; | |
$create->write_feature($feature); | |
$changed++; | |
} else { | |
# print "$id unchanged\n"; | |
} | |
} | |
$gffio2->close(); | |
print "read ".scalar ( keys %fstore2)." unique features from file 2\n"; | |
foreach my $key (keys %fstore1) { | |
if (! exists $fstore2{$key}) { | |
# protect feature from deletion if it is already known as an ID conflict | |
if (! exists $conflicts{new_id($fstore1{$key})} ) { | |
$delete->write_feature($fstore1{$key}); | |
$deleted++; | |
} | |
} | |
} | |
$create->close; | |
$delete->close; | |
close $conffh or die $!; | |
print "$new new features, $changed changed, $conflicts conflicting, $deleted deleted features\n"; | |
sub new_id { | |
my $feature = shift; | |
my $start = $feature->start; | |
my $end = $feature->end; | |
my $strand = $feature->strand; | |
my $type = $feature->primary_tag(); | |
my $seq_id = $feature->seq_id(); | |
return "$seq_id:$type:$start:$end:$strand"; | |
} | |
sub is_new { | |
my $id = shift; | |
my $feature = shift; | |
if (! exists $fstore1{$id}) { | |
if (! exists $fstore1{new_id($feature)}) { | |
return 1; | |
} else { | |
return $fstore1{new_id($feature)} | |
} | |
} | |
return 0; | |
} | |
sub what_changed { | |
my ($id, $feature) = @_; | |
die "no id" unless $id; | |
die "no feature for id $id" unless ref $feature; | |
unless (ref $fstore1{$id}) { | |
warn "$id not found, shouldn't happen:".$fstore1{$id}; | |
return (); | |
} | |
my @what = (); | |
## quick and dirty check | |
return () if $feature->gff_string eq $fstore1{$id}->gff_string; | |
## now we have to find out what's different (we ignore most tags for now): | |
#$feature->strand ne $fstore1{$id}->strand || | |
push @what, | |
"position/strand" unless ($feature->equals($fstore1{$id}), 'strong'); | |
push @what, "phase" if $feature->phase ne $fstore1{$id}->phase; | |
push @what, "primary_tag" if $feature->primary_tag() ne $fstore1{$id}->primary_tag(); | |
push @what, "source_tag" if $feature->source_tag() ne $fstore1{$id}->source_tag(); | |
push @what, "display_id" if $feature->display_id() ne $fstore1{$id}->display_id(); | |
## TODO: check all tags: TODO: | |
### my %tags1 = map $fstore1{$id}->get_all_tags(); | |
return @what; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment