Skip to content

Instantly share code, notes, and snippets.

@mdondrup
Created April 16, 2021 10:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mdondrup/81927ccdb13d4a130a8a6631ca0ed73c to your computer and use it in GitHub Desktop.
Save mdondrup/81927ccdb13d4a130a8a6631ca0ed73c to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use strict;
use warnings;
use Bio::Tools::GFF;
use Bio::SeqFeatureI;
my %fstore1 = ();
my %fstore2 = ();
my %conflicts = ();
# specify 2 input files to compare
my $gffio1 = Bio::Tools::GFF->new(-file => $ARGV[0], -gff_version => 3);
my $gffio2 = Bio::Tools::GFF->new(-file => $ARGV[1], -gff_version => 3);
# gff file that contains new features in file 2
my $create = Bio::Tools::GFF->new(-gff_version => 3,
-file => ">gffdiff_create.gff3");
# gff file that contains the fieatures missing in file 2 from file 1
my $delete = Bio::Tools::GFF->new(-gff_version => 3,
-file => ">gffdiff_delete.gff3");
# file with conflicting features, e.g. position identical but different ID
my $conffh;
open ($conffh, '>' , "gffdiff_conflicts") or die "couldn't open gffdiff_conflicts $!\n";
# loop over the input stream file 1
while(my $feature = $gffio1->next_feature()) {
my $id = ($feature->primary_id) ?
$feature->primary_id : new_id($feature);
### some sanity checks
die "something went wrong with the ID" unless $id;
die "duplicate feature ID $id" if exists $fstore1{$id};
$fstore1{$id} = $feature;
}
print "read ".scalar ( keys %fstore1)." unique features from file 1\n";
$gffio1->close();
# read features of second file
my $new = my $changed = my $conflicts = my $deleted = 0;
while(my $feature = $gffio2->next_feature()) {
my $id = ($feature->primary_id) ?
$feature->primary_id : new_id($feature);
$fstore2{$id} = $feature;
### some sanity checks
die "something went wrong with the ID" unless $id;
my @what = ();
if (my $a = is_new($id, $feature)) {
# print "Feature $id is new\n";
if (ref $a) {
$conflicts++;
## store conflict not to delete it later
$conflicts{new_id($a)} = 1;
print $conffh " >>>>> ".$ARGV[0]."\nC: ".$a->gff_string()."\n-----\n";
print $conffh " <<<<< " .$ARGV[1]."\nC: ".$feature->gff_string()."\n<<<<<\n";
} else {
$create->write_feature($feature);
$new++;
}
}
elsif (@what = what_changed($id, $feature)) {
#print "Feature $id has different ". (join " and ", @what)."\n";
#print "- ".$fstore1{$id}->gff_string."\n";
$delete->write_feature($fstore1{$id});
#print "+ ".$feature->gff_string()."\n";
$create->write_feature($feature);
$changed++;
} else {
# print "$id unchanged\n";
}
}
$gffio2->close();
print "read ".scalar ( keys %fstore2)." unique features from file 2\n";
foreach my $key (keys %fstore1) {
if (! exists $fstore2{$key}) {
# protect feature from deletion if it is already known as an ID conflict
if (! exists $conflicts{new_id($fstore1{$key})} ) {
$delete->write_feature($fstore1{$key});
$deleted++;
}
}
}
$create->close;
$delete->close;
close $conffh or die $!;
print "$new new features, $changed changed, $conflicts conflicting, $deleted deleted features\n";
sub new_id {
my $feature = shift;
my $start = $feature->start;
my $end = $feature->end;
my $strand = $feature->strand;
my $type = $feature->primary_tag();
my $seq_id = $feature->seq_id();
return "$seq_id:$type:$start:$end:$strand";
}
sub is_new {
my $id = shift;
my $feature = shift;
if (! exists $fstore1{$id}) {
if (! exists $fstore1{new_id($feature)}) {
return 1;
} else {
return $fstore1{new_id($feature)}
}
}
return 0;
}
sub what_changed {
my ($id, $feature) = @_;
die "no id" unless $id;
die "no feature for id $id" unless ref $feature;
unless (ref $fstore1{$id}) {
warn "$id not found, shouldn't happen:".$fstore1{$id};
return ();
}
my @what = ();
## quick and dirty check
return () if $feature->gff_string eq $fstore1{$id}->gff_string;
## now we have to find out what's different (we ignore most tags for now):
#$feature->strand ne $fstore1{$id}->strand ||
push @what,
"position/strand" unless ($feature->equals($fstore1{$id}), 'strong');
push @what, "phase" if $feature->phase ne $fstore1{$id}->phase;
push @what, "primary_tag" if $feature->primary_tag() ne $fstore1{$id}->primary_tag();
push @what, "source_tag" if $feature->source_tag() ne $fstore1{$id}->source_tag();
push @what, "display_id" if $feature->display_id() ne $fstore1{$id}->display_id();
## TODO: check all tags: TODO:
### my %tags1 = map $fstore1{$id}->get_all_tags();
return @what;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment