Skip to content

Instantly share code, notes, and snippets.

@mackyle
Last active October 25, 2016 04:42
Show Gist options
  • Save mackyle/9ea081513f6b90bb4470b7b2bc6e4bce to your computer and use it in GitHub Desktop.
Save mackyle/9ea081513f6b90bb4470b7b2bc6e4bce to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
# export-fixed-tags -- produce fast-import stream to fix broken tags
# Copyright (C) 2016 Kyle J. McKay. All rights reserved.
# License AGPLv3+ https://www.gnu.org/licenses/agpl.html
#
## Usage
##
## Some software used to convert repositories to Git format creates invalid
## tags during the conversion. In the case where these tags lack a signature,
## email and timestamp but DO have an author name and that name matches the
## name on the commit the tag refers to, the tag can be corrected by adding
## the missing information by taking it from the commit.
##
## Some repositories resulting from such bad conversions can be found at:
##
## http://git.savannah.gnu.org/
##
## In particular, if a repository has bad tags a clone with fsckObjects=true
## will fail. For example:
##
## git -c transfer.fsckobjects=true clone http://git.savannah.gnu.org/r/automake.git
##
## will fail (unless the invalid tags have since been corrected).
##
## Running this script in a repository with any of these kinds of bad tags will
## produce a git fast-import stream (on standard output) that can be passed to
## git fast-import to correct the tags (progress is reported to standard error).
##
## So, for example, the aforementioned automake repository's bad tags can be
## corrected using this script like so:
##
## git -c transfer.fsckobjects=false clone --mirror http://git.savannah.gnu.org/r/automake.git
## cd automake.git
## export-fixed-tags | git fast-import
##
## followed by a push to publish the corrected tags.
#
use strict;
use warnings;
use Encode;
# Decoder used by to_utf8() for text that is not valid UTF-8.
# Windows-1252 is tried first and ISO-8859-1 only if that lookup fails.
# The lookup runs in a BEGIN block, so a missing encoding aborts the script
# at compile time.
my $encoder;
BEGIN {
    foreach my $charset ('Windows-1252', 'ISO-8859-1') {
        $encoder = Encode::find_encoding($charset);
        last if $encoder;
    }
    die "failed to load ISO-8859-1 encoder\n" unless $encoder;
}
# to_utf8 STRING
#
# Returns STRING as UTF-8 encoded bytes, or undef when STRING is undef.
# A string that already carries Perl's UTF-8 flag, or whose bytes decode
# cleanly as UTF-8, is used as is.  Anything else is first decoded with the
# file-level fallback $encoder in Encode::FB_DEFAULT mode.
sub to_utf8($;$) {
    my ($text) = @_;
    return undef if !defined $text;

    # utf8::decode() works in place and reports whether the bytes were
    # well-formed UTF-8.
    my $is_text = Encode::is_utf8($text) || utf8::decode($text);
    my $result = $is_text
        ? $text
        : $encoder->decode($text, Encode::FB_DEFAULT);

    utf8::encode($result);
    return $result;
}
# collect COUNT
#
# Reads COUNT bytes from the TAGS filehandle and returns them as one string.
# The data is fetched in pieces of at most 32768 bytes; the result of each
# read() is not checked.
sub collect {
    my ($remaining) = @_;
    my $chunk_size = 32768;
    my ($content, $chunk) = ('', '');

    # Full-sized pieces first ...
    until ($remaining < $chunk_size) {
        read(TAGS, $chunk, $chunk_size);
        $content .= $chunk;
        $remaining -= $chunk_size;
    }

    # ... then whatever is left over.
    if ($remaining) {
        read(TAGS, $chunk, $remaining);
        $content .= $chunk;
    }

    return $content;
}
# without_crud STRING
#
# Mimics strbuf_addstr_without_crud in git's ident.c, and additionally
# strips NUL bytes.  Crud (control characters, space and . , : ; < > " \ ')
# is removed from both ends, then any newline, NUL, < or > still left
# inside is deleted.  Returns undef when STRING is undef.
sub without_crud($) {
    my ($text) = @_;
    defined($text) or return undef;
    for ($text) {
        s/^[\x00-\x1f .,:;<>\x22\\']+//s;  # leading crud
        s/[\x00-\x1f .,:;<>\x22\\']+$//s;  # trailing crud
        s/[\n<>\0]//g;                      # interior \n \0 < and >
    }
    return $text;
}
# split_tagger TAGGER
#
# Splits the value of a "tagger" header into (name, email, time, offset).
# Fields are peeled off the right-hand end: first a [-+]NNNN zone offset,
# then an integer timestamp (returned as a number).  What remains is divided
# at the first "<" into a name and an email, the email losing its
# surrounding angle brackets.  Missing time/offset come back as undef.
# Returns the empty list when TAGGER is undef.
sub split_tagger($) {
    my ($rest) = @_;
    return () unless defined($rest);

    my ($name, $when, $zone);
    if ($rest =~ /^(.*?)\s*([-+]\d\d\d\d)$/) {
        ($rest, $zone) = ($1, $2);
    }
    if ($rest =~ /^(.*?)\s*([-+]?\d+)$/) {
        ($rest, $when) = ($1, 0 + $2);
    }
    if ($rest =~ /^\s*([^<]*)(.*)$/) {
        ($name, $rest) = ($1, $2);
        $name =~ s/\s+$//;
    }

    # What is left is the email part: drop trailing blanks and the brackets.
    $rest =~ s/\s+$//;
    $rest =~ s/^<+//;
    $rest =~ s/>+$//;

    return ($name, $rest, $when, $zone);
}
# tag_is_ok TAGREF
#
# Decides whether the 'tagger' entry of the tag hash TAGREF is already well
# formed.  The value is taken apart with split_tagger(), rebuilt as
# "name <email> time offset" from the crud-free name and email, and must
# compare equal to the original text.
# Returns undef when there is no tagger, when any of the four parts is
# missing, or when the timestamp is negative; otherwise the result of the
# string comparison.
sub tag_is_ok($) {
    my ($tag) = @_;
    my $tagger = $tag->{'tagger'};
    defined($tagger) or return undef;

    my ($name, $email, $when, $zone) = split_tagger($tagger);
    for my $part ($name, $email, $when, $zone) {
        return undef unless defined($part);
    }
    return undef if $when < 0;

    my $canonical = join('',
        without_crud($name), ' <', without_crud($email), '> ',
        $when, ' ', $zone);
    return $tagger eq $canonical;
}
# commit_info REV
#
# Asks "git log" for the committer and author identity of commit REV.
# Returns ($cn, $ce, $cd, $an, $ae, $ad): committer name, email and date
# followed by author name, email and date, where $cd and $ad are raw dates
# (as produced by --date=raw).  Returns fewer values, possibly none, when
# git cannot be started or prints nothing.
sub commit_info($) {
    my ($rev) = @_;

    # List-form pipe open: git is exec'd directly, so REV is passed as a
    # single argument and is never interpreted by a shell.
    open(my $git, '-|', 'git', 'log', '-n', '1', '--date=raw',
        '--format=format:%cn%n%ce%n%cd%n%an%n%ae%n%ad', $rev, '--')
        or return ();
    binmode($git);
    my $output = do { local $/; <$git> };
    close($git);

    return () unless defined($output);
    return split("\n", $output);
}
# Shell pipeline that supplies the tag objects to examine:
#   1. list every ref below refs/tags,
#   2. keep only the lines whose second field is "tag" and print the first
#      and third fields (object name and refname),
#   3. let "git cat-file --batch" dump each of those objects, preceded by a
#      header line "<objectname> <objecttype> <objectsize> <rest>" where
#      <rest> carries the refname through.
my $cmd = <<'CMD';
git for-each-ref refs/tags |
awk '$2=="tag"{print $1" "$3}' |
git cat-file --batch='%(objectname) %(objecttype) %(objectsize) %(rest)'
CMD
open(TAGS, '-|', $cmd) or die "could not run ($!):\n$cmd";
binmode TAGS;

# Tags that failed tag_is_ok(), as hash refs with the keys hash, refname,
# object, type, tag, tagger, other (unrecognised header lines) and message.
my @tags = ();
while (<TAGS>) {
    # Lines that do not look like a batch header are ignored.
    if (/^([0-9a-f]{40}) ([^ ]+) ([0-9]+) (refs\/[^ ]+)$/) {
        my ($h, $t, $l, $r) = ($1, $2, $3, $4);
        chomp $r;   # [^ ]+ also swallowed the line terminator
        my %tag = ();
        $tag{'hash'} = $h;
        $tag{'refname'} = $r;
        $tag{'other'} = [];
        $t eq "tag" or die "wtf: non-tag type in input, is git mad?\n";

        # Read header lines up to the blank separator, counting every byte
        # consumed so the size of the remaining message can be derived.
        my $count = 0;
        while (<TAGS>) {
            $count += length($_);
            chomp;
            last if /^$/;
            if (/^object ([0-9a-f]{40})$/) { $tag{'object'} = $1; next; }
            if (/^type ([^ ]+)$/)          { $tag{'type'} = $1; next; }
            if (/^tag ([^ ]+)$/)           { $tag{'tag'} = $1; next; }
            if (/^tagger (.*)$/ || /^tagger()$/) { $tag{'tagger'} = $1; next; }
            push(@{$tag{'other'}}, $_);
        }

        # $l is the object size from the header line.  One byte more than
        # the unread remainder is fetched and then chomped off again --
        # presumably the newline "git cat-file --batch" writes after each
        # object's contents.
        my $tm = collect(1 + $l - $count);
        chomp $tm;
        $tag{'message'} = $tm;

        if (tag_is_ok(\%tag)) {
            # A tag object may lack a "tag" header; avoid an undef warning.
            my $shown = defined($tag{'tag'}) ? $tag{'tag'} : '';
            print STDERR "skipping OK tag $tag{refname} ($shown)\n";
        } else {
            print STDERR "processing invalid tag $tag{refname}\n";
            push(@tags, \%tag);
        }
    }
}
# For a piped open close() reports a failing command; without this check a
# broken pipeline would silently look like "no invalid tags".
close(TAGS)
    or die "tag listing failed (" . ($! ? $! : "exit status $?") . "):\n$cmd";
# Write one fast-import "tag" command to standard output for every invalid
# tag that can be repaired.  Anything questionable is reported on standard
# error and left untouched.
foreach my $entry (@tags) {
    my %tag = %$entry;
    my ($r, $h) = ($tag{'refname'}, $tag{'hash'});
    $r && $h or die "programmer error: tag without refname and/or hash key(s)!";

    # Only plain, unsigned tags of commits with nothing but the four known
    # header fields are rewritten.
    my ($o, $y, $t) = ($tag{'object'}, $tag{'type'}, $tag{'tag'});
    unless ($o && $y && $t) {
        print STDERR "refusing to process tag w/o object and type and tag fields: $r\n";
        next;
    }
    unless ($y eq 'commit') {
        print STDERR "refusing to process tag with non-commit type '$y': $r ($t)\n";
        next;
    }
    unless ($r eq "refs/tags/$t") {
        print STDERR "refusing to process tag with non-matching refname: $r ($t)\n";
        next;
    }
    unless (scalar(@{$tag{'other'}}) == 0) {
        print STDERR "refusing to process tag with unknown header fields: $r ($t)\n";
        next;
    }
    if ($tag{'message'} =~ /^-----BEGIN PGP SIGNATURE-----/m) {
        print STDERR "refusing to process signed tag: $r ($t)\n";
        next;
    }

    my ($cn, $ce, $cd, $an, $ae, $ad) = commit_info($o);
    unless ($cn && $ce && $cd && $an && $ae && $ad) {
        print STDERR "skipping because no commit $o info available: $r ($t)\n";
        next;
    }
    # $entry->{'commit'} = [$cn, $ce, $cd, $an, $ae, $ad];
    # print STDERR "$o = \"$cn\",\"$ce\",\"$cd\",\"$an\",\"$ae\",\"$ad\"\n";

    my ($n, $e, $tu, $to);
    ($n, $e, $tu, $to) = split_tagger($tag{'tagger'}) if defined($tag{'tagger'});
    $n = without_crud($n) if defined($n);
    $e = without_crud($e) if defined($e);

    # Fill in whichever half of the identity is missing from the commit.
    if ($n && $e) {
        # The tagger already names both; keep the tag's own identity
        # rather than replacing its email or refusing the tag.
    } elsif (!$n && !$e) {
        $n = $cn;
        $e = $ce;
    } elsif ($n) {
        if ($cn eq $n) {
            $e = $ce;
        } elsif ($an eq $n) {
            $e = $ae;
        } else {
            print STDERR "skipping because no email available for \"$n\": $r ($t)\n";
            next;
        }
    } else {
        if ($ce eq $e) {
            $n = $cn;
        } elsif ($ae eq $e) {
            $n = $an;
        } else {
            print STDERR "skipping because no name available for <$e>: $r ($t)\n";
            next;
        }
    }

    # Keep the tag's own timestamp when it has a usable one, otherwise use
    # the committer date.  A negative timestamp is what tag_is_ok() rejects,
    # so it is not carried over into the corrected tag.
    my $d;
    if (!(defined($tu) && $tu > 0)) {
        $d = $cd;
    } elsif ($to) {
        $d = "$tu $to";
    } else {
        $d = "$tu +0000";
    }

    $t = to_utf8($t);
    $n = to_utf8($n);
    $e = to_utf8($e);
    my $m = to_utf8($tag{'message'});
    {
        # The "data" count must be the number of bytes that follow.
        use bytes;
        print "tag $t\nfrom $o\ntagger $n <$e> $d\n";
        print "data ", length($m), "\n", $m, "\n";
    }
}
print "done\n";
#use Data::Dumper;
#print STDERR Data::Dumper->Dump([\@tags], ['*tags']);
exit 0;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment