mackyle/export-fixed-tags

## export-fixed-tags
#!/usr/bin/env perl

# export-fixed-tags -- produce fast-import stream to fix broken tags
# Copyright (C) 2016 Kyle J. McKay.  All rights reserved.
# License AGPLv3+ https://www.gnu.org/licenses/agpl.html

#
## Usage
##
## Some software used to convert repositories to Git format creates invalid
## tags during the conversion.  In the case where these tags lack a signature,
## email and timestamp but DO have an author name and that name matches the
## name on the commit the tag refers to, the tag can be corrected by adding
## the missing information by taking it from the commit.
##
## Some repositories resulting from such bad conversions can be found at:
##
##   http://git.savannah.gnu.org/
##
## In particular, if a repository has bad tags a clone with fsckObjects=true
## will fail.  For example:
##
##   git -c transfer.fsckobjects=true clone http://git.savannah.gnu.org/r/automake.git
##
## will fail (unless the invalid tags have since been corrected).
##
## Running this script in a repository with any of these kinds of bad tags will
## produce a git fast-import stream (on standard output) that can be passed to
## git fast-import to correct the tags (progress is reported to standard error).
##
## So, for example, the aforementioned automake repository's bad tags can be
## corrected using this script like so:
##
##   git -c transfer.fsckobjects=false clone --mirror http://git.savannah.gnu.org/r/automake.git
##   cd automake.git
##   export-fixed-tags | git fast-import
##
## followed by a push to publish the corrected tags.
#

use strict;
use warnings;
use Encode;

# Decoder used for byte strings that are not well-formed UTF-8.  Windows-1252
# is tried first and ISO-8859-1 second; the lookup runs at compile time and
# aborts the script when neither encoding can be found.
my $encoder;
BEGIN {
  foreach my $charset ('Windows-1252', 'ISO-8859-1') {
    $encoder = Encode::find_encoding($charset);
    last if $encoder;
  }
  $encoder or die "failed to load ISO-8859-1 encoder\n";
}

# to_utf8($str)
#
# Returns $str as a string of UTF-8 encoded bytes, or undef when $str is undef.
# A character string (UTF-8 flag already on) and a byte string that decodes
# cleanly as UTF-8 are used as they are; any other byte string is decoded with
# the fallback $encoder first.  The prototype admits an optional second
# argument, which is not used.
sub to_utf8($;$) {
  my ($text) = @_;
  defined($text) or return undef;
  # utf8::decode works in place and reports whether the bytes were valid UTF-8
  my $have_chars = Encode::is_utf8($text) || utf8::decode($text);
  $text = $encoder->decode($text, Encode::FB_DEFAULT) unless $have_chars;
  utf8::encode($text);
  return $text;
}

# collect($count)
#
# Reads $count bytes from the global TAGS filehandle, at most 32768 bytes per
# read() call, and returns everything read as one string.  The result of each
# read() is not checked, so a short read simply yields a shorter string.
sub collect {
  my ($remaining) = @_;
  my ($gathered, $chunk) = ('', '');
  for (; $remaining >= 32768; $remaining -= 32768) {
    read(TAGS, $chunk, 32768);
    $gathered .= $chunk;
  }
  if ($remaining) {
    # final partial chunk
    read(TAGS, $chunk, $remaining);
    $gathered .= $chunk;
  }
  return $gathered;
}

# without_crud($x)
#
# Modelled on strbuf_addstr_without_crud in git's ident.c: strips leading and
# trailing "crud" (control characters, space and . , : ; < > " \ ') and then
# deletes any newline, NUL, < or > left inside the string.  NUL bytes are
# handled in addition to what the C routine does.  Returns undef for undef.
sub without_crud($) {
  my ($text) = @_;
  defined($text) or return undef;
  for ($text) {
    s/^[\x00-\x1f .,:;<>\x22\\']+//s; # remove crud from beginning
    s/[\x00-\x1f .,:;<>\x22\\']+$//s; # remove crud from the end
    s/[\n<>\0]//g; # remove internal \n \0 < and >
  }
  return $text;
}

# split_tagger($g)
#
# Takes the text that follows "tagger " and peels it apart from the right:
# a trailing +HHMM/-HHMM timezone, then a trailing integer timestamp, then
# everything before the first "<" as the name; what remains, minus the
# surrounding angle brackets, is the email.  Returns the list
# (name, email, timestamp, timezone); a piece that was not found is undef
# (the email is '' rather than undef when nothing is left for it).  Returns
# the empty list when $g is undef.
sub split_tagger($) {
  my ($rest) = @_;
  return () unless defined($rest);
  my ($name, $stamp, $zone);
  if ($rest =~ /^(.*?)\s*([-+]\d\d\d\d)$/) {
    ($rest, $zone) = ($1, $2);
  }
  if ($rest =~ /^(.*?)\s*([-+]?\d+)$/) {
    # 0 + forces the timestamp to a number
    ($rest, $stamp) = ($1, 0 + $2);
  }
  if ($rest =~ /^\s*([^<]*)(.*)$/) {
    ($name, $rest) = ($1, $2);
    $name =~ s/\s+$//;
  }
  for ($rest) {
    s/\s+$//;
    s/^<+//;
    s/>+$//;
  }
  return ($name, $rest, $stamp, $zone);
}

# tag_is_ok($tag)
#
# $tag is a hash ref; only its 'tagger' entry is examined.  The tagger text is
# split into name, email, timestamp and timezone, rebuilt in the layout
# "name <email> timestamp timezone" from the crud-free name and email, and the
# result is true only when the rebuilt text equals the original exactly.
# Returns undef when there is no tagger, when any of the four parts is
# missing, or when the timestamp is negative.
sub tag_is_ok($) {
  my ($tag) = @_;
  my $raw = $tag->{'tagger'};
  defined($raw) or return undef;
  my ($name, $email, $stamp, $zone) = split_tagger($raw);
  foreach my $part ($name, $email, $stamp, $zone) {
    return undef unless defined($part);
  }
  return undef unless $stamp >= 0;
  my $clean_name = without_crud($name);
  my $clean_email = without_crud($email);
  my $canonical = $clean_name . ' <' . $clean_email . '> ' . $stamp . ' ' . $zone;
  return $raw eq $canonical;
}

# commit_info($rev)
#
# Runs "git log -n 1" on $rev and returns the six output lines as the list
# ($cn, $ce, $cd, $an, $ae, $ad): committer name, email and date followed by
# author name, email and date, both dates in git's raw format.  The argument
# is interpolated unquoted into a shell command line, so it must be a plain
# object name.
sub commit_info($) {
  my $output = qx(git log -n 1 --date=raw --format='format:%cn%n%ce%n%cd%n%an%n%ae%n%ad' $_[0] --);
  return split("\n", $output);
}

# Pipeline that streams every annotated tag object found under refs/tags:
# for-each-ref lists the refs, awk keeps the rows whose second field is "tag"
# and prints "<hash> <refname>", and cat-file --batch then emits for each of
# them a "<hash> <type> <size> <refname>" line followed by the raw object.
my $cmd = <<'CMD';
git for-each-ref refs/tags |
awk '$2=="tag"{print $1" "$3}' |
git cat-file --batch='%(objectname) %(objecttype) %(objectsize) %(rest)'
CMD

# The bareword handle is kept because collect() reads from TAGS by name.
open TAGS, '-|', $cmd or die "could not run ($!):\n$cmd";
binmode TAGS;

# Tags whose tagger line is not well formed.  Each entry is a hash ref with
# the keys hash, refname, other and message, plus object, type, tag and
# tagger when the corresponding header line was present.
my @tags = ();
while (<TAGS>) {
  # Only a batch header line starts a record; any other line is ignored.
  if (/^([0-9a-f]{40}) ([^ ]+) ([0-9]+) (refs\/[^ ]+)$/) {
    my ($h, $t, $l, $r) = ($1, $2, $3, $4);
    chomp $r;
    my %tag = ();
    $tag{'hash'} = $h;
    $tag{'refname'} = $r;
    $tag{'other'} = [];
    $t eq "tag" or die "wtf: non-tag type in input, is git mad?\n";
    # Read header lines up to the blank separator, counting the bytes
    # consumed so the size of the remaining message can be derived from $l.
    my $count = 0;
    while (<TAGS>) {
      $count += length($_);
      chomp;
      last if /^$/;
      $tag{'object'} = $1, next if /^object ([0-9a-f]{40})$/;
      $tag{'type'} = $1, next if /^type ([^ ]+)$/;
      $tag{'tag'} = $1, next if /^tag ([^ ]+)$/;
      $tag{'tagger'} = $1, next if /^tagger (.*)$/ || /^tagger()$/;
      # Unrecognised header lines are remembered so the tag can be refused.
      push(@{$tag{'other'}}, $_);
    }
    # $l - $count bytes of message remain; cat-file --batch writes one LF
    # after each object, which is read here as well and then chomped away.
    my $tm = collect(1 + $l - $count);
    chomp $tm;
    $tag{'message'} = $tm;
    if (tag_is_ok(\%tag)) {
      # A tag may lack its "tag" header; avoid an uninitialized-value warning.
      my $shown = defined($tag{'tag'}) ? $tag{'tag'} : '';
      print STDERR "skipping OK tag $tag{refname} ($shown)\n";
    } else {
      print STDERR "processing invalid tag $tag{refname}\n";
      push(@tags, \%tag);
    }
  }
}
# close() on a pipe is false when the command failed; $! is 0 when the only
# problem was a non-zero exit status, which is then found in $?.
close(TAGS)
  or warn($! ? "error closing tag pipeline: $!\n"
             : "tag pipeline exited with status " . ($? >> 8) . "\n");

# Emit one fast-import "tag" command for every invalid tag that can be
# repaired.  A tag is only rewritten when it points at a commit, its refname
# matches its tag name, it has no unknown headers and is not PGP signed.  The
# missing half of the identity is taken from the committer or author of the
# tagged commit (whichever matches the half that is present), and the commit
# date is used when the tag has no usable timestamp.
foreach (@tags) {
  my %tag = %$_;
  my ($r, $h) = ($tag{'refname'}, $tag{'hash'});
  $r && $h or die "programmer error: tag without refname and/or hash key(s)!";
  my ($o, $y, $t) = ($tag{'object'}, $tag{'type'}, $tag{'tag'});
  # Test for presence, not truth: a tag may legitimately be named "0".
  defined($o) && defined($y) && defined($t) or
    print(STDERR "refusing to process tag w/o object and type and tag fields: $r\n"), next;
  $y eq 'commit' or
    print(STDERR "refusing to process tag with non-commit type '$y': $r ($t)\n"), next;
  $r eq "refs/tags/$t" or
    print(STDERR "refusing to process tag with non-matching refname: $r ($t)\n"), next;
  @{$tag{'other'}} == 0 or
    print(STDERR "refusing to process tag with unknown header fields: $r ($t)\n"), next;
  $tag{'message'} =~ /^-----BEGIN PGP SIGNATURE-----/m and
    print(STDERR "refusing to process signed tag: $r ($t)\n"), next;
  my ($cn, $ce, $cd, $an, $ae, $ad) = commit_info($o);
  $cn && $ce && $cd && $an && $ae && $ad or
    print(STDERR "skipping because no commit $o info availalbe: $r ($t)\n"), next;
#  $_->{'commit'} = [$cn, $ce, $cd, $an, $ae, $ad];
#  print STDERR "$o = \"$cn\",\"$ce\",\"$cd\",\"$an\",\"$ae\",\"$ad\"\n";
  my ($n, $e, $tu, $to);
  ($n, $e, $tu, $to)  = split_tagger($tag{'tagger'}) if defined($tag{'tagger'});
  $n = without_crud($n) if defined($n);
  $e = without_crud($e) if defined($e);
  if (!$n && !$e) {
    # No identity at all: use the committer.
    $n = $cn;
    $e = $ce;
  } elsif ($n) {
    # Name present: take the email of whichever commit identity has that name.
    if ($cn eq $n) {
      $e = $ce;
    } elsif ($an eq $n) {
      $e = $ae;
    } else {
      print STDERR "skipping because no email available for \"$n\": $r ($t)\n";
      next;
    }
  } else {
    # Only an email present: take the matching name.
    if ($ce eq $e) {
      $n = $cn;
    } elsif ($ae eq $e) {
      $n = $an;
    } else {
      print STDERR "skipping because no name available for <$e>: $r ($t)\n";
      next;
    }
  }
  my $d;
  if (!$tu || $tu < 0) {
    # No timestamp, or a negative one (which is exactly what tag_is_ok
    # rejects, so re-emitting it would leave the tag broken): use the
    # committer date of the tagged commit.
    $d = $cd;
  } elsif (!$to) {
    $d = "$tu +0000";
  } else {
    $d = "$tu $to";
  }
  $t = to_utf8($t);
  $n = to_utf8($n);
  $e = to_utf8($e);
  my $m = to_utf8($tag{'message'});
  {
    # The "data" count must be a byte count.
    use bytes;
    print "tag $t\nfrom $o\ntagger $n <$e> $d\n";
    print "data ", length($m), "\n", $m, "\n";
  }
}
print "done\n";

#use Data::Dumper;
#print STDERR Data::Dumper->Dump([\@tags], ['*tags']);

exit 0;
	#!/usr/bin/env perl

	# export-fixed-tags -- produce fast-import stream to fix broken tags
	# Copyright (C) 2016 Kyle J. McKay. All rights reserved.
	# License AGPLv3+ https://www.gnu.org/licenses/agpl.html

	#
	## Usage
	##
	## Some software used to convert repositories to Git format creates invalid
	## tags during the conversion. In the case where these tags lack a signature,
	## email and timestamp but DO have an author name and that name matches the
	## name on the commit the tag refers to, the tag can be corrected by adding
	## the missing information by taking it from the commit.
	##
	## Some repositories resulting from such bad conversions can be found at:
	##
	## http://git.savannah.gnu.org/
	##
	## In particular, if a repository has bad tags a clone with fsckObjects=true
	## will fail. For example:
	##
	## git -c transfer.fsckobjects=true clone http://git.savannah.gnu.org/r/automake.git
	##
	## will fail (unless the invalid tags have since been corrected).
	##
	## Running this script in a repository with any of these kinds of bad tags will
	## produce a git fast-import stream (on standard output) that can be passed to
	## git fast-import to correct the tags (progress is reported to standard error).
	##
	## So, for example, the aforementioned automake repository's bad tags can be
	## corrected using this script like so:
	##
	## git -c transfer.fsckobjects=false clone --mirror http://git.savannah.gnu.org/r/automake.git
	## cd automake.git
	## export-fixed-tags | git fast-import
	##
	## followed by a push to publish the corrected tags.
	#

	use strict;
	use warnings;
	use Encode;

	# Decoder used for byte strings that are not well-formed UTF-8.
	# Windows-1252 is tried first and ISO-8859-1 second; the lookup runs at
	# compile time and aborts the script when neither encoding can be found.
	my $encoder;
	BEGIN {
	  $encoder = Encode::find_encoding('Windows-1252') ||
	             Encode::find_encoding('ISO-8859-1')
	    or die "failed to load ISO-8859-1 encoder\n";
	}

	# to_utf8($str)
	#
	# Returns $str as a string of UTF-8 encoded bytes, or undef when $str is
	# undef.  A character string (UTF-8 flag already on) and a byte string
	# that decodes cleanly as UTF-8 are used as they are; any other byte
	# string is decoded with the fallback $encoder first.  The prototype
	# admits an optional second argument, which is not used.
	sub to_utf8($;$) {
	  my $str = shift;
	  return undef unless defined $str;
	  my $ans;
	  # utf8::decode works in place and reports whether the bytes were valid
	  if (Encode::is_utf8($str) || utf8::decode($str)) {
	    $ans = $str;
	  } else {
	    $ans = $encoder->decode($str, Encode::FB_DEFAULT);
	  }
	  utf8::encode($ans);
	  return $ans;
	}

	# collect($count)
	#
	# Reads $count bytes from the global TAGS filehandle, at most 32768 bytes
	# per read() call, and returns everything read as one string.  The result
	# of each read() is not checked, so a short read yields a shorter string.
	sub collect {
	  my ($remaining) = @_;
	  my ($gathered, $chunk) = ('', '');
	  for (; $remaining >= 32768; $remaining -= 32768) {
	    read(TAGS, $chunk, 32768);
	    $gathered .= $chunk;
	  }
	  if ($remaining) {
	    # final partial chunk
	    read(TAGS, $chunk, $remaining);
	    $gathered .= $chunk;
	  }
	  return $gathered;
	}

	# without_crud($x)
	#
	# Modelled on strbuf_addstr_without_crud in git's ident.c: strips leading
	# and trailing "crud" (control characters, space and . , : ; < > " \ ')
	# and then deletes any newline, NUL, < or > left inside the string.  NUL
	# bytes are handled in addition to what the C routine does.  Returns undef
	# for undef.
	sub without_crud($) {
	  my ($text) = @_;
	  defined($text) or return undef;
	  for ($text) {
	    s/^[\x00-\x1f .,:;<>\x22\\']+//s; # remove crud from beginning
	    s/[\x00-\x1f .,:;<>\x22\\']+$//s; # remove crud from the end
	    s/[\n<>\0]//g; # remove internal \n \0 < and >
	  }
	  return $text;
	}

	# split_tagger($g)
	#
	# Takes the text that follows "tagger " and peels it apart from the right:
	# a trailing +HHMM/-HHMM timezone, then a trailing integer timestamp, then
	# everything before the first "<" as the name; what remains, minus the
	# surrounding angle brackets, is the email.  Returns the list
	# (name, email, timestamp, timezone); a piece that was not found is undef
	# (the email is '' rather than undef when nothing is left for it).
	# Returns the empty list when $g is undef.
	sub split_tagger($) {
	  my $g = shift;
	  defined($g) or return ();
	  my ($n, $t, $o);
	  # The quantifiers matter: the lazy .*? keeps everything ahead of the
	  # trailing field and \s* lets that field follow with or without spaces.
	  ($g, $o) = ($1, $2) if $g =~ /^(.*?)\s*([-+]\d\d\d\d)$/;
	  ($g, $t) = ($1, 0 + $2) if $g =~ /^(.*?)\s*([-+]?\d+)$/;
	  ($n, $g) = ($1, $2), $n =~ s/\s+$// if $g =~ /^\s*([^<]*)(.*)$/;
	  $g =~ s/\s+$//;
	  $g =~ s/^<+//;
	  $g =~ s/>+$//;
	  return ($n, $g, $t, $o);
	}

	# tag_is_ok($tag)
	#
	# $tag is a hash ref; only its 'tagger' entry is examined.  The tagger
	# text is split into name, email, timestamp and timezone, rebuilt in the
	# layout "name <email> timestamp timezone" from the crud-free name and
	# email, and the result is true only when the rebuilt text equals the
	# original exactly.  Returns undef when there is no tagger, when any of
	# the four parts is missing, or when the timestamp is negative.
	sub tag_is_ok($) {
	  my ($tag) = @_;
	  my $raw = $tag->{'tagger'};
	  defined($raw) or return undef;
	  my ($name, $email, $stamp, $zone) = split_tagger($raw);
	  foreach my $part ($name, $email, $stamp, $zone) {
	    return undef unless defined($part);
	  }
	  return undef unless $stamp >= 0;
	  my $clean_name = without_crud($name);
	  my $clean_email = without_crud($email);
	  my $canonical = $clean_name . ' <' . $clean_email . '> ' . $stamp . ' ' . $zone;
	  return $raw eq $canonical;
	}

	# commit_info($rev)
	#
	# Runs "git log -n 1" on $rev and returns the six output lines as the list
	# ($cn, $ce, $cd, $an, $ae, $ad): committer name, email and date followed
	# by author name, email and date, both dates in git's raw format.  The
	# argument is interpolated unquoted into a shell command line, so it must
	# be a plain object name.
	sub commit_info($) {
	  my $output = qx(git log -n 1 --date=raw --format='format:%cn%n%ce%n%cd%n%an%n%ae%n%ad' $_[0] --);
	  return split("\n", $output);
	}

	# Pipeline that streams every annotated tag object found under refs/tags:
	# for-each-ref lists the refs, awk keeps the rows whose second field is
	# "tag" and prints "<hash> <refname>", and cat-file --batch then emits for
	# each of them a "<hash> <type> <size> <refname>" line followed by the raw
	# object.  The command is assembled with join rather than a here-doc
	# because a plain here-doc terminator cannot be indented.
	my $cmd = join("\n",
	  q{git for-each-ref refs/tags |},
	  q{awk '$2=="tag"{print $1" "$3}' |},
	  q{git cat-file --batch='%(objectname) %(objecttype) %(objectsize) %(rest)'},
	) . "\n";

	# The bareword handle is kept because collect() reads from TAGS by name.
	open TAGS, '-|', $cmd or die "could not run ($!):\n$cmd";
	binmode TAGS;

	# Tags whose tagger line is not well formed.  Each entry is a hash ref
	# with the keys hash, refname, other and message, plus object, type, tag
	# and tagger when the corresponding header line was present.
	my @tags = ();
	while (<TAGS>) {
	  # Only a batch header line starts a record; any other line is ignored.
	  if (/^([0-9a-f]{40}) ([^ ]+) ([0-9]+) (refs\/[^ ]+)$/) {
	    my ($h, $t, $l, $r) = ($1, $2, $3, $4);
	    chomp $r;
	    my %tag = ();
	    $tag{'hash'} = $h;
	    $tag{'refname'} = $r;
	    $tag{'other'} = [];
	    $t eq "tag" or die "wtf: non-tag type in input, is git mad?\n";
	    # Read header lines up to the blank separator, counting the bytes
	    # consumed so the size of the remaining message can be derived.
	    my $count = 0;
	    while (<TAGS>) {
	      $count += length($_);
	      chomp;
	      last if /^$/;
	      $tag{'object'} = $1, next if /^object ([0-9a-f]{40})$/;
	      $tag{'type'} = $1, next if /^type ([^ ]+)$/;
	      $tag{'tag'} = $1, next if /^tag ([^ ]+)$/;
	      $tag{'tagger'} = $1, next if /^tagger (.*)$/ || /^tagger()$/;
	      # Unrecognised header lines are remembered so the tag can be refused.
	      push(@{$tag{'other'}}, $_);
	    }
	    # $l - $count bytes of message remain; cat-file --batch writes one LF
	    # after each object, which is read here as well and then chomped away.
	    my $tm = collect(1 + $l - $count);
	    chomp $tm;
	    $tag{'message'} = $tm;
	    if (tag_is_ok(\%tag)) {
	      # A tag may lack its "tag" header; avoid an uninitialized warning.
	      my $shown = defined($tag{'tag'}) ? $tag{'tag'} : '';
	      print STDERR "skipping OK tag $tag{refname} ($shown)\n";
	    } else {
	      print STDERR "processing invalid tag $tag{refname}\n";
	      push(@tags, \%tag);
	    }
	  }
	}
	# close() on a pipe is false when the command failed; $! is 0 when the
	# only problem was a non-zero exit status, which is then found in $?.
	close(TAGS)
	  or warn($! ? "error closing tag pipeline: $!\n"
	             : "tag pipeline exited with status " . ($? >> 8) . "\n");

	# Emit one fast-import "tag" command for every invalid tag that can be
	# repaired.  A tag is only rewritten when it points at a commit, its
	# refname matches its tag name, it has no unknown headers and is not PGP
	# signed.  The missing half of the identity is taken from the committer or
	# author of the tagged commit (whichever matches the half that is
	# present), and the commit date is used when the tag has no usable
	# timestamp.
	foreach (@tags) {
	  my %tag = %$_;
	  my ($r, $h) = ($tag{'refname'}, $tag{'hash'});
	  $r && $h or die "programmer error: tag without refname and/or hash key(s)!";
	  my ($o, $y, $t) = ($tag{'object'}, $tag{'type'}, $tag{'tag'});
	  # Test for presence, not truth: a tag may legitimately be named "0".
	  defined($o) && defined($y) && defined($t) or
	    print(STDERR "refusing to process tag w/o object and type and tag fields: $r\n"), next;
	  $y eq 'commit' or
	    print(STDERR "refusing to process tag with non-commit type '$y': $r ($t)\n"), next;
	  $r eq "refs/tags/$t" or
	    print(STDERR "refusing to process tag with non-matching refname: $r ($t)\n"), next;
	  @{$tag{'other'}} == 0 or
	    print(STDERR "refusing to process tag with unknown header fields: $r ($t)\n"), next;
	  $tag{'message'} =~ /^-----BEGIN PGP SIGNATURE-----/m and
	    print(STDERR "refusing to process signed tag: $r ($t)\n"), next;
	  my ($cn, $ce, $cd, $an, $ae, $ad) = commit_info($o);
	  $cn && $ce && $cd && $an && $ae && $ad or
	    print(STDERR "skipping because no commit $o info availalbe: $r ($t)\n"), next;
	  # $_->{'commit'} = [$cn, $ce, $cd, $an, $ae, $ad];
	  # print STDERR "$o = \"$cn\",\"$ce\",\"$cd\",\"$an\",\"$ae\",\"$ad\"\n";
	  my ($n, $e, $tu, $to);
	  ($n, $e, $tu, $to) = split_tagger($tag{'tagger'}) if defined($tag{'tagger'});
	  $n = without_crud($n) if defined($n);
	  $e = without_crud($e) if defined($e);
	  if (!$n && !$e) {
	    # No identity at all: use the committer.
	    $n = $cn;
	    $e = $ce;
	  } elsif ($n) {
	    # Name present: take the email of the commit identity with that name.
	    if ($cn eq $n) {
	      $e = $ce;
	    } elsif ($an eq $n) {
	      $e = $ae;
	    } else {
	      print STDERR "skipping because no email available for \"$n\": $r ($t)\n";
	      next;
	    }
	  } else {
	    # Only an email present: take the matching name.
	    if ($ce eq $e) {
	      $n = $cn;
	    } elsif ($ae eq $e) {
	      $n = $an;
	    } else {
	      print STDERR "skipping because no name available for <$e>: $r ($t)\n";
	      next;
	    }
	  }
	  my $d;
	  if (!$tu || $tu < 0) {
	    # No timestamp, or a negative one (which is exactly what tag_is_ok
	    # rejects, so re-emitting it would leave the tag broken): use the
	    # committer date of the tagged commit.
	    $d = $cd;
	  } elsif (!$to) {
	    $d = "$tu +0000";
	  } else {
	    $d = "$tu $to";
	  }
	  $t = to_utf8($t);
	  $n = to_utf8($n);
	  $e = to_utf8($e);
	  my $m = to_utf8($tag{'message'});
	  {
	    # The "data" count must be a byte count.
	    use bytes;
	    print "tag $t\nfrom $o\ntagger $n <$e> $d\n";
	    print "data ", length($m), "\n", $m, "\n";
	  }
	}
	print "done\n";

	#use Data::Dumper;
	#print STDERR Data::Dumper->Dump([\@tags], ['*tags']);

	exit 0;