Skip to content

Instantly share code, notes, and snippets.

/buildSVNTree.pl Secret

Created October 14, 2010 14:39
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/f6902cb4e3534f07ba48 to your computer and use it in GitHub Desktop.
Save anonymous/f6902cb4e3534f07ba48 to your computer and use it in GitHub Desktop.
The scripts I used to convert from SVN to Git
#!/usr/bin/perl -w
use strict;
use SVN::Core;
use SVN::Ra;
use Text::Glob;
use Data::Dumper;
# Usage: perl buildSVNTree.pl > svnBranches.txt
# (messages will appear on STDERR)
# Convert a path to the canonical branch name
# For branches this is just the directory name
# For tags, it includes tags/directory name to avoid name clashes with branches
sub path2name
{
    # Reduce an SVN path to its canonical branch name: the final path
    # component, kept together with a "tags/" prefix when that text
    # immediately precedes it (so tag names cannot clash with branch names).
    # A path the pattern cannot match (for example one ending in "/") is
    # returned unchanged.
    my ( $path ) = @_;
    ( my $branch_name = $path ) =~ s{^.*?((?:tags/)?[^/]+)$}{$1};
    return $branch_name;
}
# Places where branches live (see Text::Glob for syntax)
# Our repo is /trunk/project rather than /project/trunk, modify as needed
my $branchLocations = '/{trunk,branches/*,tags/*}';
# Identify svn copies
my @interesting_copies; # list of copies; hashref with fields ( path, rev, fromPath, fromRev )
my @branch_revs; # svn revs with likely branches in them ($branch_revs[$rev] = root of branch)
my @log_msgs; # svn log messages, useful for debugging
sub buildCopyList
{
    # Log callback handed to $ra->get_log(). For one revision it
    #   * stores the first line of the log message in @log_msgs,
    #   * records in @branch_revs the shallowest copied path (0 if none),
    #   * appends every added-with-history path to @interesting_copies.
    # Arguments (as supplied by get_log):
    #   $pathHash - { full path => _p_svn_log_changed_path_t object }
    #   $rev      - revision number
    #   $author, $date, $logmsg, $pool - author, date, log message, svn pool
    my ( $pathHash, $rev, $author, $date, $logmsg, $pool ) = @_;

    # helpful status: one line on STDERR every 500 revisions
    print STDERR "$rev\n" if $rev % 500 == 0;

    # keep only the first line of the message to save display space later
    if( defined $logmsg )
    {
        my @msg_lines = split( /\n/, $logmsg );
        $log_msgs[$rev] = $msg_lines[0];
    }

    # until proven otherwise this rev is "not the root of a branch"
    $branch_revs[$rev] = 0;

    foreach my $path ( keys %{$pathHash} )
    {
        my $change   = $pathHash->{$path};
        my $fromRev  = $change->copyfrom_rev();     # See SVN::Core
        my $fromPath = $change->copyfrom_path();    # See SVN::Core

        # only copies are interesting
        next if $fromRev == $SVN::Core::INVALID_REVNUM;
        # not fool-proof, but eliminates most of cvs2svn's noise
        next if $change->action() ne 'A';

        if( $branch_revs[$rev] )
        {
            # This rev is already marked as the root of a branch (cvs2svn
            # does this when creating branches and tags). Keep whichever
            # path has fewer directory components, i.e. the top-most one.
            my @new_parts = split( m{/}, $path );
            my @old_parts = split( m{/}, $branch_revs[$rev] );
            $branch_revs[$rev] = $path if @new_parts < @old_parts;
        }
        else
        {
            # first copy seen in this rev
            $branch_revs[$rev] = $path;
        }

        # store for further analysis later
        push( @interesting_copies,
            { path => $path, rev => $rev, fromPath => $fromPath, fromRev => $fromRev } );
    }
}
# Return (via global variable) the rev of matching commit
# Just munges global variable each time, so last one wins
my $globalParentRev;
sub setParentRev
{
    # get_log callback: remember the revision of the log entry being visited
    # in the file-level $globalParentRev. Each call overwrites the previous
    # value, so the last entry delivered wins.
    # Only the second callback argument (the revision) is used; the path
    # hash, author, date, log message and pool are ignored.
    my ( undef, $rev ) = @_;
    $globalParentRev = $rev;
}
# Return (via global variable) the number of files changed in a commit
# Useful for identifying cvs2svn copies (lots of files) vs normal svn copies (usually 1 or 2 files)
my $globalNumChangedFiles;
sub setNumChangedFiles
{
    # get_log callback: store the number of changed paths of the visited
    # revision in the file-level $globalNumChangedFiles.
    # Only the first callback argument ({ full path => changed-path object })
    # is used; rev, author, date, log message and pool are ignored.
    my ( $pathHash ) = @_;
    $globalNumChangedFiles = scalar keys %{$pathHash};
}
# Main body of buildSVNTree.pl:
#   1. walk the complete svn log once (buildCopyList) to collect every copy,
#   2. decide for each copy whether it creates a branch/tag and record it in %branches,
#   3. dump %branches on STDOUT with Data::Dumper for the later scripts.
# svnURL can be just about any svn URL, but the process is a lot faster
# if you have a local mirror
my $svnURL = 'file:///path/to/svn/repo';
my $ra = SVN::Ra->new( $svnURL );
my $svnHead = $ra->get_latest_revnum();
# First identify all the revs with svn copies
my $paths = ''; # don't limit paths at this time
my $start = 0; # start at root
my $end = $svnHead; # run to head
my $limit = 0; # call the callback as many times as you'd like
my $discover_changed_paths = 1; # tell the callback what paths were modified
my $strict_node_history = 0; # probably doesn't matter for this run; equivalent to stop-on-copy
$ra->get_log( $paths, $start, $end, $limit, $discover_changed_paths,
$strict_node_history, \&buildCopyList );
# Some of the following code uses these, other svn::ra calls use different...
$end = 0; # run backward in time to root
$limit = 1; # just want the most recent edit to that path
$discover_changed_paths = 0; # don't need path information
$strict_node_history = 1; # feel free to stop-on-copy
# attempt to identify branches
# %branches hash eventually looks like:
# path -> { branchName -> { branchrev, deleted, parent, children } }
# preload trunk because it's not copied from anywhere
my %branches = ( '/trunk' => { trunk => { branchrev => 1, deleted => 0 } } );
# our svn layout is /trunk/project, so some operations need to know project name
my $project = "insert-project-name-here";
# Examine every copy collected by buildCopyList, in the order they were pushed
foreach my $copy ( @interesting_copies )
{
my $fromPath = $copy->{'fromPath'};
my $fromRev = $copy->{'fromRev'};
my $path = $copy->{'path'};
my $rev = $copy->{'rev'};
my $svnFromPath = $fromPath;
$svnFromPath =~ s{^/}{}; # Apparently recent versions of SVN don't like leading slashes
my $fileType = $ra->check_path( $svnFromPath, $fromRev );
# Note: the paths here are dependent on svn layout, they work for me, but YMMV
# NOTE(review): $1/$2 and (further down) $project are interpolated into patterns
# unquoted, so regex metacharacters in branch or project names would be
# interpreted as pattern syntax -- confirm names are plain
if( $fileType == $SVN::Node::dir && # source is a directory
!( $fromPath =~ m{^/trunk} && $path =~ m{^/trunk} ) && # not an internal copy within the trunk
!( $fromPath =~ m{^/(branches|tags)/([^/]+)/} && $path =~ m{^/$1/$2/} ) && # not an internal copy within a branch or tag
!( $fromPath =~ m{^/vendor} ) ) # not a vendor branch from CVS (personal preference)
{
if( $branch_revs[$rev] eq $path ) # this path is the highest directory copied
{
if( Text::Glob::match_glob( $branchLocations, $fromPath ) && # the fromPath is an accepted branch location
Text::Glob::match_glob( $branchLocations, $path ) ) # the toPath is an accepted branch location
{
$path =~ s{(.+?)/$project$}{$1}; # strip the project subdirectory if it was explicitly named in copy
my $branchName = path2name( $path );
if( defined( $branches{$path} ) )
{
# branch at this location already exists, rename to avoid clashes
if( defined( $branches{$path}{$branchName} ) )
{
# Previous branch at this location hasn't been renamed yet; do it
# (the old entry is re-keyed as "name@branchrev")
my $origRev = $branches{$path}{$branchName}{'branchrev'};
my $newName = $branchName . "@" . $origRev;
$branches{$path}{$newName} = $branches{$path}{$branchName};
# Remove the original (ambiguous) entry
delete( $branches{$path}{$branchName} );
# Update any children for the new branch name
if( defined( $branches{$path}{$newName}{'children'} ) )
{
foreach my $child ( @{ $branches{$path}{$newName}{'children'} } )
{
my $childPath = $child->{'path'};
my $childName = $child->{'name'};
if( $branches{$childPath}{$childName}{'parent'}{'name'} eq $branchName )
{
$branches{$childPath}{$childName}{'parent'}{'name'} = $newName;
}
}
}
# Update the parent for the new branch name
# NOTE(review): the preloaded '/trunk' entry has no 'parent' key, so these
# lookups would use undef keys if /trunk itself were ever re-created -- confirm
my $parentPath = $branches{$path}{$newName}{'parent'}{'path'};
my $parentName = $branches{$path}{$newName}{'parent'}{'name'};
foreach my $childOfParent ( @{ $branches{$parentPath}{$parentName}{'children'} } )
{
if( $childOfParent->{'name'} eq $branchName )
{
$childOfParent->{'name'} = $newName;
last;
}
}
}
# mark all other branches at this location as deleted
foreach my $oldBranch ( keys( %{ $branches{$path} } ) )
{
$branches{$path}{$oldBranch}{'deleted'} = 1;
}
}
# Add child information to parent path
# Now update parent rev to the last rev that actually modified the parent path
# (because SVN revs are global, just entering 'svn cp foo bar' will result
# in copyfrom_rev being set to HEAD rather than the last rev that edited
# the path being copied. For Git, we want the last edit to the parent path)
# Append the project subdirectory unless it is already named or does not exist at $fromRev
$svnFromPath .= "/$project" unless $svnFromPath =~ m{/$project$} or
$ra->check_path( "$svnFromPath/$project", $fromRev ) == $SVN::Node::none;
$ra->get_log( $svnFromPath, $fromRev, $end, $limit, $discover_changed_paths,
$strict_node_history, \&setParentRev );
my $lastParentRev = $globalParentRev;
$globalParentRev = undef; # At least throw a warning if the parent rev isn't found
# recurse up tree to find parent branch if not copied from parent root
# (each substitution drops the last path component; stops at a known branch path or '/')
my $parentPath = $fromPath;
$parentPath =~ s{/(.*?)/?[^/]+$}{/$1} until defined( $branches{$parentPath} ) or $parentPath eq '/';
my $parentNameGuess = path2name( $parentPath );
my %parentInfo = ( name => $parentNameGuess, path => $parentPath, rev => $lastParentRev );
my %childInfo = ( name => $branchName, path => $path, rev => $rev );
unless( $parentPath eq '/' )
{
# The branch from this location is already known
my $parentName;
if( defined( $branches{$parentPath}{$parentNameGuess} ) &&
$branches{$parentPath}{$parentNameGuess}{'branchrev'} <= $fromRev )
{
$parentName = $parentNameGuess;
}
else
{
# The fromPath is defined, but the name guess or the rev is wrong.
# Iterate over the keys of fromPath to try to find a branch that matches
my $parentRev = -1;
foreach my $branchName ( keys %{ $branches{$parentPath} } )
{
my $rev = $branches{$parentPath}{$branchName}{'branchrev'};
# find the most recent branch that is still older than fromRev
if( $rev <= $fromRev && $rev > $parentRev )
{
$parentName = $branchName;
$parentRev = $rev;
}
}
$parentInfo{'name'} = $parentName;
}
# child information is not required (it's not used elsewhere), but it
# sometimes helps the human-in-the-loop figure out what's going on
# NOTE(review): $parentName is still undef here when no branch at $parentPath
# has branchrev <= $fromRev; the lookups below would then use an undef key -- confirm
if( defined( $branches{$parentPath}{$parentName}{'children'} ) )
{
push( @{ $branches{$parentPath}{$parentName}{'children'} }, \%childInfo );
}
else
{
$branches{$parentPath}{$parentName}{'children'} = [ \%childInfo ];
}
}
else
{
# Somehow we don't yet know about the branch at this location.
# This should only happen if we're going backwards in history or somehow skipped revs
# For now, print an error and go on...
print STDERR "Branch at $path copied from $fromPath which is not a known branch.\n";
print STDERR "\tChild info not recorded\n";
}
# Check if path exists in HEAD
# If path doesn't exist in HEAD, it's flagged as "deleted" which is
# used by the hideFromGit.pl script to move refs to a hidden namespace
my $svnPath = $path;
$svnPath =~ s{^/}{}; # Trim leading slash
my $fileType = $ra->check_path( $svnPath, $svnHead );
my $branchDeleted = $fileType == $SVN::Node::none ? 1 : 0;
$branches{$path}{$branchName}{'branchrev'} = $rev;
$branches{$path}{$branchName}{'deleted'} = $branchDeleted;
$branches{$path}{$branchName}{'parent'} = \%parentInfo;
# Print branching information
# print STDERR "$fromPath -> $path @ $rev";
# print STDERR " ($parentPath)" if $parentPath ne $fromPath;
# print STDERR " (deleted in HEAD)" if $branchDeleted;
# print STDERR "\n";
}
else # the path doesn't match expected branches, so inform the user
{
# find the last rev that actually changed the parent
$ra->get_log( $svnFromPath, $fromRev, $end, $limit, $discover_changed_paths,
$strict_node_history, \&setParentRev );
$fromRev = $globalParentRev;
$globalParentRev = undef; # At least throw a warning if the parent rev isn't found
# get the number of files changed in the current rev (svn copy produces
# usually just one or two paths, cvs2svn creates lots)
$ra->get_log( $paths, $rev, $rev, 1, 1, $strict_node_history, \&setNumChangedFiles );
my $numChangedFiles = $globalNumChangedFiles;
$globalNumChangedFiles = undef; # At least throw a warning if not set
# These fall into a couple categories:
# 1) actual branches/tags that are copied from a subdir (should be mostly gone now)
# 2) "merging" (copying) a directory from one branch to another (ignore!)
# 3) screwed up branches/tags (usually deleted and redone in nearby revs) (ignore!)
# 4) something that looks like a branch, but is a file-by-file copy of a directory contents?
# (this is probably an svn copy where the destination already existed, probably ignore as it's basically a merge)
print STDERR "Possible branch: $fromPath @ $fromRev -> $path @ $rev ($numChangedFiles files changed)\n"; # a large number of changed files tends to be a cvs2svn artifact
# Show this rev's log line plus the next two for context
# NOTE(review): @log_msgs entries are undef for revs past HEAD or without a
# log message, which would emit "uninitialized" warnings under -w -- confirm acceptable
print STDERR "\tr" . $rev . ": " . $log_msgs[$rev] . "\n";
print STDERR "\tr" . ($rev+1) . ": " . $log_msgs[$rev+1] . "\n";
print STDERR "\tr" . ($rev+2) . ": " . $log_msgs[$rev+2] . "\n";
}
}
}
elsif( $fileType == $SVN::Node::dir )
{
# directory copy that failed one of the layout filters above
print STDERR "*** rejected copy from $fromPath to $path @ $rev\n";
}
}
# Serialize it all for use by a later Perl script
print Dumper( \%branches );
#!/usr/bin/perl -w
use strict;
# svn log svn-url | perl fetchSVNNames.pl
# Scan "svn log" output for header lines of the form "r<rev> | <author> ...".
# Each matching line's revision number is printed as it is seen; once the input
# is exhausted every distinct author name is printed, one per line.
my %author_seen;
while( my $line = <> )
{
    if( $line =~ /^r(\d+)\s\|\s([A-Za-z0-9 ()]+)/ )
    {
        print "$1\n";
        $author_seen{$2} = 1;
    }
}
print "$_\n" for keys %author_seen;
#!/usr/bin/perl -w
use strict;
use Cwd;
use IO::File;
use Data::Dumper;
# install a SIGINT handler just to make canceling a bit easier.
# The way this script is put together, it will still require hitting CTRL-C a
# couple times in a row (kill all children, then kill this script)...
# Experiments suggest Perl forwards signals to child processes during
# system() calls, so this handler should only fire while no child is
# running; exiting without reaping children is therefore believed safe.
$SIG{INT} = sub
{
    print "received SIGINT, exiting...\n";
    exit 1;
};
sub usage
{
    # Returns the one-line usage message (script name taken from $0).
    my $message = "Usage: $0 svnBranchFile\n";
    return $message;
}
# Exactly one argument is required: the svn branch file
die usage() unless @ARGV == 1;
# Bash script that git will evaluate every commit to translate svn names to
# git names (add/modify as necessary, fetchSVNNames.pl can be helpful)
# The heredoc interpolates, so every shell "$" and the "@" in the e-mail
# addresses are backslash-escaped to reach the shell literally.
# Both known names ("(no author)" and "nobody") map to nobody/none@none.com;
# any other committer name is only reported with echo, not rewritten.
my $authorScript = <<EndOfScript
if [ "\$GIT_COMMITTER_NAME" = "(no author)" ]; then
export GIT_COMMITTER_NAME="nobody"
export GIT_AUTHOR_NAME=\$GIT_COMMITTER_NAME
export GIT_COMMITTER_EMAIL="none\@none.com"
export GIT_AUTHOR_EMAIL=\$GIT_COMMITTER_EMAIL
elif [ "\$GIT_COMMITTER_NAME" = "nobody" ]; then
export GIT_COMMITTER_NAME="nobody"
export GIT_AUTHOR_NAME=\$GIT_COMMITTER_NAME
export GIT_COMMITTER_EMAIL="none\@none.com"
export GIT_AUTHOR_EMAIL=\$GIT_COMMITTER_EMAIL
else
echo "Unknown author \$GIT_COMMITTER_NAME";
fi
EndOfScript
;
chomp( $authorScript ); # keep rest of git command on the same line
sub do_cmd
{
    # Run an external command (program followed by its arguments) and print
    # a diagnostic on STDOUT if it could not be started, was killed by a
    # signal, or exited non-zero. Never dies. $? is left exactly as system()
    # set it, so callers may inspect the exit status afterwards.
    my @args = @_;

    # For debugging purposes uncomment the next line to print out the shell
    # command that will be run:
    # unshift( @args, 'echo' );

    # The "system { PROGRAM } LIST" form never goes through a shell
    # (see perldoc -f exec)
    system { $args[0] } @args;

    # Decode the wait status
    my $status      = $?;
    my $signal      = $status & 127;
    my $dumped_core = $status & 128;
    my $exit_value  = $status >> 8;

    if( $status == -1 )
    {
        print "$0: failed to execute $args[0]: $!\n";
    }
    elsif( $signal )
    {
        printf( "$0: $args[0] died with signal %d, %s coredump\n",
            $signal, $dumped_core ? 'with' : 'without' );
    }
    elsif( $exit_value )
    {
        printf( "$0: $args[0] exited with nonzero value %d\n", $exit_value );
        print "$0: child error message: $!\n";
    }
}
# Load SVN branch history
# SVN branch history, as written by buildSVNTree.pl:
# path -> { branchName -> { branchrev, deleted, parent, children } }
my %svnBranches;
# Anonymous block to lexically hide serialized perl structure
{
    # The history file name is the next command-line argument
    my $filename = shift( @ARGV );
    # Fail with a clear message instead of dying later on an undefined handle
    my $svnHistoryFile = IO::File->new( $filename )
        or die "Unable to open svn history file '$filename': $!\n";
    # Slurp the whole file; local restores $/ even if something dies
    my $svnHistory = do { local $/; <$svnHistoryFile> };
    $svnHistoryFile->close();
    # Eval into current scope and store in visible hash. The file holds
    # Data::Dumper output ("$VAR1 = {...};"), so prefixing "my" declares the
    # variable and the eval yields the hash reference.
    # Note: eval is a potential security hole, only load files you trust!
    my $tmpRef = eval "my $svnHistory";
    die "Loading svn history failed: $@\n" unless defined( $tmpRef );
    %svnBranches = %$tmpRef;
}
# ---- Configuration: edit for your environment ----
my $parentRepo = "/path/to/massive/git/repo/from/svn-fe";
# MacOS Ram Disk: diskutil erasevolume HFS+ "ramdisk" `hdiutil attach -nomount ram://1165430`
# Linux: use tmpfs
# Windows: ???
my $tempdir = "/Volumes/ramdisk/git";
my $startDir = Cwd::cwd();
my $svnRepoName = "repoName-given-to-svn-fe"; # for building revmaps
my $workDir = "$svnRepoName-git-repos";
my $cleanDir = "$svnRepoName-git-repos-small";
# our svn layout is /trunk/project, so some operations need to know project name
my $project = "insert-project-name-here";

# ---- Bookkeeping reported at the end of the run ----
my @emptyPaths;         # svn paths whose filtered repository came out empty
my @missingBranchRevs;  # branches whose creation rev is absent from the filtered history
my $totalPaths = keys( %svnBranches );
my $curPathNum = 1;

# For every svn path: clone the parent repo, filter it down to that path's
# project subdirectory, create git branches/tags for the svn branches recorded
# at that path, then re-clone to drop the dead objects.
foreach my $path ( keys( %svnBranches ) )
{
    my $repoName = $path;
    $repoName =~ s{^/}{}; # remove leading slash
    my $gitPath = "$path/$project"; # kinda hackish to pull out project subdir
    $gitPath =~ s{^/}{}; # remove leading slash
    my $curTempDir = "$tempdir"; # could specialize this per branch, but doesn't seem to be necessary
    my $branchRepo = "$workDir/$repoName.git";

    # helpful status
    print "Begin repo $repoName ($curPathNum/$totalPaths)\n";
    $curPathNum++;

    # No this script doesn't use Git.pm. I ran into too many problems
    # attempting to execute simple Git commands (and I don't remember them
    # now), so I just shell out to git...
    # git clone --bare parentRepo dirRepo
    do_cmd( qw( git clone --bare ), $parentRepo, $branchRepo );

    # cd dirRepo
    # Must not continue if this fails: filter-branch would otherwise rewrite
    # whatever repository the current directory happens to belong to.
    chdir( $branchRepo )
        or die "$0: unable to chdir to $branchRepo (did the clone fail?): $!\n";

    # git filter-branch --env-filter authorCmd --subdirectory-filter $dir -d tempdir -- --all
    $? = 0; # dangerous, but we don't want errors from previous commands getting flagged as filter-branch errors
    do_cmd( qw( git filter-branch --env-filter ), $authorScript,
        '--subdirectory-filter', $gitPath, '-d', $curTempDir, '--', '--all' );

    # detect empty repository
    if( $? >> 8 == 1 ) # filter-branch exits 1 when nothing matches the subdirectory filter
    {
        push( @emptyPaths, $path );
        print "Cleaning up empty repo: $repoName\n";
        chdir( $startDir ) or die "$0: unable to chdir back to $startDir: $!\n";
        do_cmd( qw( rm -rf ), $branchRepo );
        $? = 0;
        print "\n"; # blank line before starting next branch
        next; # continue to next path
    }
    $? = 0;

    # Don't need to do any ref translations for the trunk
    unless( $path eq '/trunk' )
    {
        # Create a revmap since git object ids have changed
        my $gitLogString = `git log --reverse`; # reverse causes commits to be listed in oldest-first order
        my @gitLogLines = split( /\n/, $gitLogString );
        my ( $sha, $rev, @svnRevs, @gitCommits, %rev2idx );
        my $revIdx = 0;
        foreach my $line ( @gitLogLines )
        {
            # process it similar to creating revmaps from the command line
            # there are more efficient ways, but this logic has already been tested
            $sha = $1 if $line =~ /^commit\s([0-9a-fA-F]+)/;
            if( $line =~ /$svnRepoName@(\d+)/ )
            {
                # pair the svn rev with the most recently seen commit id
                $rev = $1;
                push( @svnRevs, $rev );
                $rev2idx{$rev} = $revIdx++;
                push( @gitCommits, $sha );
            }
        }

        # Translate svn branches/tags to git branches/tags
        print "Converting svn branches/tags to git refs\n";
        my @branches = keys( %{ $svnBranches{$path} } );
        # sort the branches based on svn creation rev
        @branches = sort { $svnBranches{$path}{$a}{'branchrev'} <=>
                           $svnBranches{$path}{$b}{'branchrev'} } @branches;

        # Index loop because each branch needs to look at its successor.
        # (The index is deliberately not named $b, which sort reserves.)
        for( my $i = 0; $i < @branches; $i++ )
        {
            my $branch = $branches[$i];
            if( !defined( $rev2idx{ $svnBranches{$path}{$branch}{'branchrev'} } ) )
            {
                # if the branch didn't make it through the subdirectory filter, don't create a branch/tag for it
                print "\tBranch $branch not found in current revmap, skipping...\n";
                push( @missingBranchRevs, $branch );
                next;
            }

            my $refObj;
            if( ($i+1) < @branches )
            {
                # Another branch/tag follows this one at the same path. Therefore the
                # current branch needs to point to the commit object just before the
                # new branch is created.
                # NOTE(review): if the next branch's rev is missing from %rev2idx the
                # index below becomes -1 (the newest commit) -- confirm intended
                my $nextBranchRev = $svnBranches{$path}{$branches[$i+1]}{'branchrev'}; # svn rev when the next branch/tag is created
                my $nextBranchIdx = $rev2idx{$nextBranchRev}; # index into gitCommits that next branch is created
                my $curBranchIdx = $nextBranchIdx - 1; # one git commit before new branch/tag is created
                $refObj = $gitCommits[$curBranchIdx];
            }
            else
            {
                $refObj = $gitCommits[-1]; # branch/tag goes all the way to most recent rev touching this path
            }

            # Create git branch or tag
            if( $branch =~ /^tags/ )
            {
                my $tagName = $branch;
                $tagName =~ s{^tags/}{}; # git tags are independent of branches, so don't need leading tags/

                # Set up environment variables so tag object is created with the correct metadata
                # NOTE(review): "git log -1" is given no revision, so the metadata comes
                # from HEAD rather than from $refObj -- confirm that is intended.
                # The parse below also assumes none of the fields contains a double quote.
                my ($committerName, $committerEmail, $committerDate, $logMsg);
                my $gitVars = `git log -1 --pretty=format:'"%cn" "%ce" "%cD" "%B"'`;
                (($committerName, $committerEmail, $committerDate, $logMsg) = ($gitVars =~ /"(.+?)"/sg)) ||
                    die "Unable to parse log output: $gitVars\n";
                $ENV{'GIT_COMMITTER_NAME'} = "$committerName";
                $ENV{'GIT_COMMITTER_EMAIL'} = "$committerEmail";
                $ENV{'GIT_COMMITTER_DATE'} = "$committerDate";
                $ENV{'GIT_AUTHOR_NAME'} = $ENV{'GIT_COMMITTER_NAME'}; # probably overkill, but set 'em all to be sure
                $ENV{'GIT_AUTHOR_EMAIL'} = $ENV{'GIT_COMMITTER_EMAIL'};
                $ENV{'GIT_AUTHOR_DATE'} = $ENV{'GIT_COMMITTER_DATE'};
                print "\tTagging $tagName from $refObj\n";
                do_cmd( qw( git tag -a -m ), $logMsg, $tagName, $refObj );
            }
            else
            {
                print "\tCreating branch $branch from $refObj\n";
                do_cmd( qw( git branch ), $branch, $refObj );
            }
        }
    }

    #cd ..
    # The relative $branchRepo / $cleanDir paths below depend on being back in $startDir
    chdir( $startDir ) or die "$0: unable to chdir back to $startDir: $!\n";

    # The slow way to remove dead objects (but doesn't require another directory, also saves space by packing objects)
    # rm -r .git/refs/original/
    # git reflog expire --expire=now --all
    # git gc --aggressive
    # git prune
    # The fast way
    # clone treats a file URL as a URL and thus doesn't make hard links
    do_cmd( qw( git clone --bare ), "file:///$startDir/$branchRepo", "$startDir/$cleanDir/$repoName.git" );
    print "\n"; # blank line before starting next branch
}

# Final report
print "The following are branches with missing initial revs:\n";
print Dumper( \@missingBranchRevs );
print "\nThe following are branches that produced empty repositories in git:\n";
print Dumper( \@emptyPaths );
#!/usr/bin/perl -w
use strict;
# Usage: grep . *.revmap | perl genJointRevMap.pl > joint.revmap
# Or: find . -name "*.revmap" -exec grep . '{}' + | genJointRevmap.pl > joint.revmap
# Merge per-repository revmaps into one joint revmap.
# Input lines look like "<repo>.revmap:<svn rev> <git sha>" (grep output);
# lines that do not match are ignored.
# Output, one line per svn rev in ascending order:
#   rev <TAB> repo <TAB> sha                                  (rev seen once)
#   rev <TAB> multi-repo <TAB> repo <TAB> sha <TAB> repo ...  (rev seen several times)
my @revs;    # $revs[svn rev] = output line built so far for that rev
while( my $line = <> )
{
    next unless $line =~ m{(?:\./)?([\w.\-/]+?).revmap:(\d+)\s+([0-9a-fA-F]+)};
    my ( $repo, $rev, $sha ) = ( $1, $2, $3 );

    unless( $revs[$rev] )
    {
        # first commit seen for this rev
        $revs[$rev] = "$rev\t$repo\t$sha";
        next;
    }

    # Two (or more) commits have the same rev touching them: drop the leading
    # rev number (and an earlier multi-repo marker) from the stored line, then
    # append the new repo/sha pair
    ( my $old_rev = $revs[$rev] ) =~ s/^\d+\s(?:multi-repo\s)?//;
    $revs[$rev] = "$rev\tmulti-repo\t$old_rev\t$repo\t$sha";
}
# revs that never appeared leave empty slots in the array; skip them
foreach my $entry ( grep { $_ } @revs )
{
    print "$entry\n";
}
#!/usr/bin/perl -w
use strict;
use Cwd;
use IO::File;
use Data::Dumper;
# Note: revmapDestDir/gitRepo.revmap must exist. Thus if gitRepo contains
# directories (path/to/gitRepo), then destDir/path/to/gitRepo must exist
sub usage
{
    # Returns the one-line usage message (script name taken from $0).
    my $message = "Usage: $0 gitRepo svnRepoName revmapDestDir\n";
    return $message;
}
# Write <revmapDestDir>/<gitRepo>.revmap containing one "svnRev<TAB>gitSHA"
# line for every commit in gitRepo whose message carries a git-svn-id line
# for svnRepoName.
die usage() unless @ARGV == 3;
my $gitRepo = shift( @ARGV );
$gitRepo =~ s{/\s*$}{}; # strip trailing slash and whitespace
my $svnRepoName = shift( @ARGV );
my $destDir = shift( @ARGV );
my $startDir = Cwd::cwd();

# Run git log inside the repository; refuse to continue if we cannot get
# there, otherwise the log of some unrelated repository might be captured
chdir( $gitRepo ) or die "$0: unable to chdir to $gitRepo: $!\n";
my $gitLog = `git log --all --date-order`; # note: multiple commit objects can have the same svn rev
chdir( $startDir ) or die "$0: unable to chdir back to $startDir: $!\n";

# Collect (sha, rev) pairs. \Q...\E keeps characters of the repository name
# ('.', '+', '#', spaces, ...) from being read as pattern syntax under /x.
# NOTE(review): the lazy ".+?" can run past the next "commit" line when a
# commit has no git-svn-id, pairing that commit's sha with a later rev -- confirm
my @results = $gitLog =~ /^commit\s([0-9a-fA-F]{40,40}) # git commit ID
                          .+? # git commit details
                          ^\s+git-svn-id:\s\Q$svnRepoName\E@(\d+)/msxg; # svn rev number

# 'w' opens for writing (truncating) without embedding the mode in the file name
my $fh = IO::File->new( "$destDir/$gitRepo.revmap", 'w' )
    or die "$0: unable to write $destDir/$gitRepo.revmap: $!\n";
for( my $i = 0; $i < @results; $i+=2 )
{
    # svnRev (tab) gitSHA
    print $fh "$results[$i+1]\t$results[$i]\n";
}
$fh->close();
#!/usr/bin/perl -w
use strict;
use SVN::Core;
use SVN::Ra;
use Git;
use IO::File;
use IO::Pty::Easy;
use Data::Dumper;
# Concept: use git patch-id to verify that diffs in SVN were correctly
# translated to Git (note that the git diff is dependent on history, so
# this also checks that parentage was handled correctly). Both VCSs are
# forced to use the system diff for consistency (the two algorithms are
# slightly different). In our repo, this script does find differences,
# but almost all of them can be explained. The tricky one to watch for
# is a SVN commit that changes multiple branches -- this will almost
# always fail because I didn't build jointRevmap concepts into it...
# Usage: gitValidation.pl svnBranches.txt nonJointRevmap
# (This script was written before I got in the habit of
# generating joint revmaps for a single repository, so
# it just deals with the fact that some svn revs get clobbered)
die "Must supply svn history file and revmap\n" unless @ARGV == 2;
# SVN branch history, loaded from the Data::Dumper file named by the first argument
my %svnBranchHistory;
# Anonymous block to lexically hide serialized perl structure
{
    my $filename = shift( @ARGV );
    # Fail with a clear message instead of dying later on an undefined handle
    my $svnHistoryFile = IO::File->new( $filename )
        or die "Unable to open svn history file '$filename': $!\n";
    # Slurp the whole file; local restores $/ even if something dies
    my $svnHistory = do { local $/; <$svnHistoryFile> };
    $svnHistoryFile->close();
    # Eval into current scope and store in visible hash. The file holds
    # Data::Dumper output ("$VAR1 = {...};"), so prefixing "my" declares the
    # variable and the eval yields the hash reference.
    # Note: eval is a potential security hole, only load files you trust!
    my $tmpRef = eval "my $svnHistory";
    die "Loading svn history failed: $@\n" unless defined( $tmpRef );
    %svnBranchHistory = %$tmpRef;
}
# Load revmap: $revmap[svn rev] is either a git sha (plain line) or, for a
# "multi-repo" line, a hash reference { repo path => sha }
my @revmap;
# Anonymous block to lexically hide loading stuff
{
    my $filename = shift( @ARGV );
    # Fail with a clear message instead of dying later on an undefined handle
    my $revmapFile = IO::File->new( $filename )
        or die "Unable to open revmap '$filename': $!\n";
    while( my $line = <$revmapFile> )
    {
        if( $line =~ m#^(\d+)\smulti-repo# )
        {
            my $rev = $1;
            my %repos;
            # every "path sha" pair on the line
            while( $line =~ m#([\w.\-/]+?)\s([0-9a-fA-F]{40,40})#g )
            {
                # path => sha
                $repos{$1} = $2;
            }
            $revmap[$rev] = \%repos; # later this will trip a manual graft
        }
        elsif( $line =~ m#^(\d+).+?([0-9a-fA-F]{40,40})# )
        {
            my $rev = $1;
            my $sha = $2;
            $revmap[$rev] = $sha; # TODO this is clobbering the old rev if multiple shas go to a single rev
        }
    }
    $revmapFile->close();
}
my %globalChangedFiles;
sub findChangedPaths
{
    # get_log callback: copy the action of every changed path of the visited
    # revision into the file-level %globalChangedFiles (path => action).
    # Arguments (as supplied by get_log):
    #   $pathHash - { full path => _p_svn_log_changed_path_t object }
    #   $rev, $author, $date, $logmsg, $pool - unused here
    my ( $pathHash, $rev, $author, $date, $logmsg, $pool ) = @_;

    # Iterate over the keys only. Looping over the flattened hash would also
    # visit every value (the changed-path objects) as if it were a path.
    foreach my $path ( keys %$pathHash )
    {
        my $change = $pathHash->{$path};
        # Cache the action because later SVN::Ra calls will munge the memory
        $globalChangedFiles{$path} = $change->action() if defined $change;
    }
}
# svnURL can be just about any svn URL, but the process is a lot faster
# if you have a local mirror
my $svnURL = 'file:///path/to/svn/repo';
my $ra = SVN::Ra->new( $svnURL );
my $gitRepo = 'repo-produced-by-repoFusion.pl';
# our svn layout is /trunk/project, so the path filters below need the project
# name (this script used $project without ever declaring it)
my $project = "insert-project-name-here";
$ENV{'GIT_EXTERNAL_DIFF'} = "/path/to/mygitdiff.sh"; # force git to use the system diff (for svn use --diff-cmd)
$ENV{'GIT_DIR'} = $gitRepo;

# Pseudo-TTY (pty) for handling git-patch-id (need a pty to get around output buffering)
# (this was seriously broken in my Git.pm, so use IO::Pty::Easy instead)
my @gitPatchIDArgs = ( '/path/to/git', 'patch-id' );
my $git = $gitPatchIDArgs[0];
my $gitPatchID = IO::Pty::Easy->new;
$gitPatchID->spawn( @gitPatchIDArgs );

my $totalIter = 500; # number of revs to test
my $maxRev = $ra->get_latest_revnum();

# Each iteration picks a random svn rev and compares, file by file, the
# patch-id of the svn diff with the patch-id of the corresponding git diff.
# Iterations that could not check anything decrement $iter and so do not count.
for( my $iter = 0; $iter < $totalIter; $iter++ )
{
    my $rev = int( rand( $maxRev ) ) + 1;
    print "r$rev: ";
    print "\t" if $rev < 10000;

    # fetch the paths changed in this rev (path => action)
    my $limit = 1;
    $ra->get_log( '', $rev, $rev, $limit, 1, 0, \&findChangedPaths );
    my %changedFiles = %globalChangedFiles;
    %globalChangedFiles = ();

    my $gitSHA = $revmap[$rev];
    unless( defined( $gitSHA ) )
    {
        print "not found in revmap, skipping...\n";
        $iter--;
        next;
    }
    if( ref( $gitSHA ) eq 'HASH' )
    {
        # TODO we won't get here because the revmap isn't a joint revmap,
        # just a single repo revmap with multiple sha's per svn rev
        # dereference multi repo commits to the right sha
        die "multi-repo commit $rev\n";
    }

    my $allFilesMatch = 1;
    my @skippedCreation;
    my @skippedOutOfScope;
    my @skippedBinary;
    foreach my $file ( keys( %changedFiles ) )
    {
        $file =~ s{^/}{}; # svn doesn't like leading /
        # skip directories and other non-file nodes
        unless( $ra->check_path( $file, $rev ) == $SVN::Node::file )
        {
            push( @skippedOutOfScope, "/$file" );
            next;
        }
        unless( $changedFiles{"/$file"} eq 'M' )
        {
            # TODO check git for "added file" and potentially diff files
            push( @skippedCreation, "/$file" );
            next;
        }
        unless( $file =~ m{^.+?/$project/} )
        {
            # skip files outside desired path
            push( @skippedOutOfScope, "/$file" );
            next;
        }

        # need @rev because svn isn't consistent about when in time it extracts file information
        # NOTE(review): the file name is interpolated into a shell command here and
        # in the git diff below; names with spaces or shell metacharacters will break
        my $svnDiff = `svn diff --diff-cmd diff -c $rev $svnURL/$file\@$rev`;
        unless( $svnDiff )
        {
            # In very rare cases SVN will return an empty diff, just skip the file
            push( @skippedOutOfScope, "/$file" );
            next;
        }
        if( $svnDiff =~ /Cannot display: file marked as a binary type/ )
        {
            push( @skippedBinary, "/$file" );
            next;
        }
        $svnDiff =~ s{^Index:.+?=+$}{diff --git}ms; # remove svn header
        $svnDiff =~ s{\s+\(revision \d+\)}{}msg; # remove svn revision from files

        # get the git diff
        my $gitFile = $file;
        $gitFile =~ s{.+?/$project/(.+)}{$1}; # just strip the trunk/$project part
        my $gitDiff = `$git diff $gitSHA^..$gitSHA -- $gitFile`;
        $gitDiff = "diff --git\n" . $gitDiff;
        $gitDiff =~ s{^---.+?_([^/\s]+).*?$}{--- $1}ms; # make src filename look like svn
        $gitDiff =~ s{^\+\+\+.+?_([^/\s]+).*?$}{+++ $1}ms; # make dest filename look like svn

        # feed both diffs through the long-running "git patch-id" and compare the ids
        $gitPatchID->write( "$svnDiff\n" );
        my $svnPatchID = $gitPatchID->read( 0.5 ); # small but non-zero timeout seems to work
        die "git patch-id didn't return svn patch ID for $file\@$rev" unless defined $svnPatchID;
        $svnPatchID =~ s/^([0-9a-fA-F]{40,40})\s+.+\s+/$1/;

        $gitPatchID->write( "$gitDiff\n" );
        # separate name for the result so the pty handle $gitPatchID is not shadowed
        my $gitSidePatchID = $gitPatchID->read( 0.5 );
        die "git patch-id didn't return git patch ID for $gitFile\@$gitSHA" unless defined $gitSidePatchID;
        $gitSidePatchID =~ s/^([0-9a-fA-F]{40,40})\s+.+\s+/$1/;

        unless( $svnPatchID eq $gitSidePatchID )
        {
            print "\n\t$file: diffs don't match";
            $allFilesMatch = 0;
            # die;
        }
    }

    if( $allFilesMatch )
    {
        my $total = keys( %changedFiles );
        my $numSkipped = 0;
        # NOTE(review): @skippedBinary is collected above but not counted here,
        # so binary files count as "checked" -- confirm whether that is intended
        $numSkipped += @skippedCreation if @skippedCreation > 0;
        $numSkipped += @skippedOutOfScope if @skippedOutOfScope > 0;
        unless( $numSkipped > 0.8 * $total ) # a reasonable percentage of the commit was checked
        {
            print "ok";
            print " (skipped $numSkipped/$total)" if $numSkipped > 0;
        }
        else
        {
            print "skipped ";
            if( $numSkipped == $total )
            {
                print "all ";
            }
            else
            {
                print "most ";
            }
            print "files ($numSkipped/$total), trial doesn't count";
            $iter--;
        }
    }
    print "\n";
}
$gitPatchID->close();
#!/usr/bin/perl -w
use strict;
use IO::File;
# The concept of this script is to move git refs based on deleted svn
# paths to a hidden namespace. In our history this comes from retagging,
# reusing branch names, or just flat out deleting directories from svn.
#
# Since svn merge information has not been captured by git, if these refs
# are deleted git gc would remove the objects they point to. We don't
# want that (though it is personal preference), so we move all these refs
# to refs/hidden/heads or refs/hidden/tags. These refs remain in the
# central repository (git gc will not remove any objects they point to),
# but are not cloned (though they can be explicitly fetched), so they're
# there when you need, but not when you don't.
# Usage: hideFromGit.pl svnBranches.txt
die "Must supply svn history file\n" unless @ARGV == 1;
# SVN branch history, loaded from the Data::Dumper file named on the command line
my %svnBranches;
# Anonymous block to lexically hide serialized perl structure
{
    my $filename = shift( @ARGV );
    # Fail with a clear message instead of dying later on an undefined handle
    my $svnHistoryFile = IO::File->new( $filename )
        or die "Unable to open svn history file '$filename': $!\n";
    # Slurp the whole file; local restores $/ even if something dies
    my $svnHistory = do { local $/; <$svnHistoryFile> };
    $svnHistoryFile->close();
    # Eval into current scope and store in visible hash. The file holds
    # Data::Dumper output ("$VAR1 = {...};"), so prefixing "my" declares the
    # variable and the eval yields the hash reference.
    # Note: eval is a potential security hole, only load files you trust!
    my $tmpRef = eval "my $svnHistory";
    die "Loading svn history failed: $@\n" unless defined( $tmpRef );
    %svnBranches = %$tmpRef;
}
my $gitRepoName = "repo-produced-by-repoFusion.pl";
my $cleanDir = "$gitRepoName-git-repos-small";
my $finalRepo = "$cleanDir/$gitRepoName.git";
# read packed-refs if it exists
my $packedRefsFile = IO::File->new( "$finalRepo/packed-refs" );
my $packedRefs = '';
# Test the file handle, not $packedRefs: the string was just set to '' and is
# therefore always defined, while the handle is undef when the file is missing
if( defined( $packedRefsFile ) )
{
    # slurp entire file at once; local restores $/ afterwards
    $packedRefs = do { local $/; <$packedRefsFile> };
    $packedRefs = '' unless defined( $packedRefs );
    $packedRefsFile->close();
    # just to be safe
    `cp $finalRepo/packed-refs $finalRepo/backup-packed-refs`;
}
# Move the ref of every branch/tag flagged 'deleted' into refs/hidden/.
# Loose refs (one file per ref) are renamed on disk; refs that only exist in
# packed-refs are rewritten in the in-memory copy held in $packedRefs.
foreach my $path ( keys( %svnBranches ) )
{
    foreach my $branch ( keys( %{ $svnBranches{$path} } ) )
    {
        next unless $svnBranches{$path}{$branch}{'deleted'};

        # branches are in refs/heads/branchName, tags are refs/tags/tagName
        my $branchName = $branch;
        unless( $branchName =~ m{^tags/} )
        {
            $branchName = "heads/$branch";
        }
        my $looseRef  = "$finalRepo/refs/$branchName";
        my $hiddenRef = "$finalRepo/refs/hidden/$branchName";

        if( -e $looseRef )
        {
            if( -e $hiddenRef )
            {
                print STDERR "Unable to hide $branchName, another ref already exists\n";
            }
            else
            {
                # rename() does not create directories, so make sure
                # refs/hidden/heads (or .../tags) exists first. A failure here
                # is ignored because the rename below reports it anyway.
                require File::Basename;
                require File::Path;
                eval { File::Path::mkpath( File::Basename::dirname( $hiddenRef ) ) };
                rename( $looseRef, $hiddenRef )
                    or print STDERR "Unable to hide $branchName: $!\n";
            }
        }
        # \Q...\E: match the ref name literally ('.', '+' etc. are common in names)
        elsif( $packedRefs =~ s{refs/\Q$branchName\E$}{refs/hidden/$branchName}m )
        {
            # only get here if substitution succeeded, so nothing more to do
        }
        else
        {
            print STDERR "Unable to hide $branchName (probably doesn't exist in git repo?)\n";
        }
    }
}
# Write the packed refs back out; skipped when there is no content to write
if( $packedRefs )
{
    my $out = IO::File->new( ">$finalRepo/packed-refs" );
    defined( $out ) or die $!;
    print {$out} $packedRefs;
    $out->close();
}
#!/bin/bash
# Wrapper for git to use the system diff (installed via GIT_EXTERNAL_DIFF).
# git calls it with seven arguments:
#   path old-file old-hex old-mode new-file new-hex new-mode
# so $2 and $5 are the two files to compare.
LEFT=$2
RIGHT=$5
# Quoted so that paths containing spaces or glob characters stay intact
diff -u "$LEFT" "$RIGHT"
# diff exits 1 when the files differ; always report success
exit 0
#!/usr/bin/perl -w
use strict;
use Cwd;
use IO::File;
use Data::Dumper;
# SIGINT handler, installed just to make canceling a bit easier.
# The way this script is put together, it will still require hitting CTRL-C a
# couple times in a row (kill all children, then kill this script)...
#
# The author's working assumption (from experiment, not verified): while a
# system() call is running the signal ends up with the child, so this process
# only catches SIGINT when no children are running and can therefore exit
# without reaping anything.
sub handle_sigint
{
    print "received SIGINT, exiting...\n";
    exit(1);
}
$SIG{'INT'} = \&handle_sigint;
# Return the one-line usage message (newline-terminated) naming the two
# required command line arguments.
sub usage
{
    return sprintf( "Usage: %s svnBranchFile jointRevmapFile\n", $0 );
}
# Exactly two arguments are required: the svn branch file and the joint revmap file.
@ARGV == 2 or die usage();
# Run an external command without going through a shell and report failures.
#
# Arguments: the program name followed by its arguments, as a plain list.
# Returns:   1 if the command ran and exited with status 0, 0 otherwise.
# Failures are reported on STDOUT and never abort the script.
sub do_cmd
{
    my @args = @_;
    unless( @args )
    {
        print "$0: do_cmd called without a command\n";
        return 0;
    }
    # For debugging purposes uncomment the next line to print out the shell
    # command that will be run:
    # unshift( @args, 'echo' );
    # Indirect-object form of system: naming the program explicitly guarantees
    # that no shell is invoked, even for a one-element list (see perldoc -f exec)
    system { $args[0] } @args;
    # Capture both immediately; any later call may overwrite them
    my $status = $?;
    my $execError = "$!";
    if( $status == -1 )
    {
        # $! is only meaningful in this branch (the exec itself failed)
        print "$0: failed to execute $args[0]: $execError\n";
        return 0;
    }
    if( $status & 127 )
    {
        # Pass names as printf arguments so a '%' in them cannot be taken
        # as a format directive
        printf( "%s: %s died with signal %d, %s coredump\n",
            $0, $args[0], ( $status & 127 ), ( $status & 128 ) ? 'with' : 'without' );
        return 0;
    }
    if( $status >> 8 )
    {
        # A non-zero exit status does not set $!, so only the status is reported
        printf( "%s: %s exited with nonzero value %d\n", $0, $args[0], $status >> 8 );
        return 0;
    }
    return 1;
}
# Load SVN branch history
my %svnBranches;
# Anonymous block to lexically hide serialized perl structure
{
    # Load up serialized hash structure (first command line argument)
    my $filename = shift( @ARGV );
    my $svnHistoryFile = IO::File->new( $filename );
    die "Unable to open svn branch file '$filename': $!\n"
        unless defined( $svnHistoryFile );
    # slurp entire file at once; local restores $/ automatically
    my $svnHistory = do { local $/ = undef; <$svnHistoryFile> };
    $svnHistoryFile->close();
    # Eval into current scope and store in visible hash.
    # NOTE(review): this executes the file content as Perl code (presumably a
    # Data::Dumper style "$VAR1 = {...};" dump), so only feed it trusted files.
    my $tmpRef = eval "my $svnHistory";
    die "Loading svn history failed: $@\n" unless defined( $tmpRef );
    die "Loading svn history failed: '$filename' did not evaluate to a hash reference\n"
        unless ref( $tmpRef ) eq 'HASH';
    %svnBranches = %$tmpRef;
}
# Load revmap: $revmap[$svnRev] is either a single 40-hex-digit SHA (string)
# or, for "multi-repo" lines, a hash ref of { path => sha }.
my @revmap;
# Anonymous block to lexically hide loading stuff
{
    my $filename = shift( @ARGV );
    my $revmapFile = IO::File->new( $filename );
    die "Unable to open revmap file '$filename': $!\n" unless defined( $revmapFile );
    while( my $line = <$revmapFile> )
    {
        if( $line =~ m#^(\d+)\smulti-repo# )
        {
            my $rev = $1;
            my %repos;
            # every "path sha" pair on the line
            while( $line =~ m#([\w.\-/]+?)\s([0-9a-fA-F]{40})#g )
            {
                # path => sha
                $repos{$1} = $2;
            }
            $revmap[$rev] = \%repos; # later this will trip some custom logic
        }
        elsif( $line =~ m#^(\d+).+?([0-9a-fA-F]{40})# )
        {
            # ordinary line: rev number, then the first SHA found on the line
            my $rev = $1;
            my $sha = $2;
            $revmap[$rev] = $sha;
        }
        # lines matching neither form are ignored
    }
    $revmapFile->close();
}
# Scratch directory handed to "git filter-branch -d" later on.
# MacOS Ram Disk: diskutil erasevolume HFS+ "ramdisk" `hdiutil attach -nomount ram://1165430`
# Linux: use tmpfs
# Windows: ???
my $tempdir = "/Volumes/ramdisk/git";
my $startDir = Cwd::cwd();
# Really should take output from filterBranch.pl to determine where all the
# repos are, but it's a lot easier to just keep the variables the same and hope
# for the best.
my $svnRepoName = "svnRepoName-used-in-filterBranch";
my $workDir = "$svnRepoName-git-repos";
my $cleanDir = "$svnRepoName-git-repos-small";
# Similar to filterBranch.pl: No, this script doesn't use Git.pm. I ran into
# too many problems with it (I don't remember the specifics), so it ended up
# being easier to shell out to git instead...
# Start with the trunk
my $trunkRepo = "$cleanDir/trunk.git";
my $fusionRepo = "$workDir/$svnRepoName.git";
do_cmd( qw( git clone --bare ), $trunkRepo, $fusionRepo );
# Everything below runs git in the current directory, so refuse to continue
# if we are not actually inside the fusion repo.
chdir( $fusionRepo ) or die "Unable to chdir to $fusionRepo: $!\n";
mkdir( "info" ) unless -e "info";
# git tries to read the grafts during the fetches, so keep them out of the way
my $graftFile = IO::File->new( ">info/working_grafts" );
die "Unable to create info/working_grafts: $!\n" unless defined( $graftFile );
# Pull branches/tags from small repos.
# For every svn path other than /trunk: fetch the matching refs from that
# path's git repo into the fusion repo and record one graft line
# ("childSHA parentSHA") per branch in the working grafts file.
my $totalPaths = keys( %svnBranches );
my $curPathNum = 2; # already did 1 ;)
my $graftFixupReqd = 0;
foreach my $path ( keys( %svnBranches ) )
{
    next if $path eq '/trunk';
    unless( $curPathNum % 150 )
    {
        # my system eventually starts complaining about "too many open files"
        # so run git gc occasionally to clean up everything
        # (150 is pretty arbitrary)
        print "Running git gc...\n";
        do_cmd( qw( git gc ) );
        print "\n";
    }
    my $repoName = $path;
    $repoName =~ s{^/}{}; # remove leading slash
    # NOTE(review): $branchRepo was used without ever being declared. This
    # assumes the per-path repos live next to trunk.git under $cleanDir (same
    # layout as $trunkRepo); the path is absolute because the current
    # directory is now the fusion repo. Confirm against filterBranch.pl.
    my $branchRepo = "$startDir/$cleanDir/$repoName.git";
    if( -e $branchRepo )
    {
        print "Fetching from repo $repoName ($curPathNum/$totalPaths)\n";
        $curPathNum++;
        # List-form pipe open: no shell, so paths with spaces are safe
        my $remoteRefs = '';
        if( open( my $lsRemote, '-|', 'git', 'ls-remote', $branchRepo ) )
        {
            local $/ = undef; # slurp all output at once
            my $output = <$lsRemote>;
            $remoteRefs = $output if defined( $output );
            close( $lsRemote );
        }
        else
        {
            print "\tUnable to run git ls-remote on $branchRepo: $!\n";
        }
        my @refs;
        my $grafts;
        foreach my $branch ( keys( %{ $svnBranches{$path} } ) )
        {
            my $branchName = $branch;
            # branches are in refs/heads/branchName, tags are refs/tags/tagName
            unless( $branchName =~ m{^tags/} )
            {
                $branchName = "heads/$branch";
            }
            # Skip branch if remote repo doesn't have a matching ref
            # (\Q..\E: branch names may contain regex metacharacters)
            next unless $remoteRefs =~ m{refs/\Q$branchName\E$}m;
            # Add branch to list of refs to fetch from remote
            push( @refs, "refs/$branchName:refs/$branchName" );
            # Create grafts
            # Grab revs of interest
            my $childRev = $svnBranches{$path}{$branch}{'branchrev'};
            my $parentRev = $svnBranches{$path}{$branch}{'parent'}{'rev'};
            my $childSHA = $revmap[$childRev];
            if( ref( $childSHA ) eq 'HASH' )
            {
                # The child comes from an svn rev that touches multiple git repos
                # The revmap should have enough information in it to resolve the
                # ambiguity, so just do it.
                $childSHA = $childSHA->{"$repoName.git"};
            }
            # Checked after the multi-repo lookup so a failed lookup is flagged
            # for manual fixing instead of producing a broken graft line
            if( !defined( $childSHA ) )
            {
                print "\tUndefined child rev r$childRev in revmap\n";
                $graftFixupReqd = 1;
                $childSHA = "* r$childRev @ $path ($svnBranches{$path}{$branch}{'parent'}{'name'})";
            }
            my $parentSHA = $revmap[$parentRev];
            if( ref( $parentSHA ) eq 'HASH' )
            {
                # The parent comes from an svn rev that touches multiple git repos
                # The revmap should have enough information in it to resolve the
                # ambiguity, so just do it.
                my $parentRepoName = $svnBranches{$path}{$branch}{'parent'}{'path'};
                $parentRepoName =~ s{^/}{}; # remove leading slash
                $parentSHA = $parentSHA->{"$parentRepoName.git"};
            }
            if( !defined( $parentSHA ) )
            {
                print "\tUndefined parent rev r$parentRev in revmap\n";
                $graftFixupReqd = 1;
                $parentSHA = "* r$parentRev @ $svnBranches{$path}{$branch}{'parent'}{'path'} ($branch)";
            }
            $grafts .= "$childSHA $parentSHA\n";
        }
        do_cmd( qw( git fetch ), $branchRepo, @refs );
        if( defined( $grafts ) )
        {
            print "Adding grafts...\n";
            print $graftFile $grafts;
        }
        print "\n"; # blank line before starting next branch
    }
    else
    {
        print "Skipping non-existant repo $repoName ($curPathNum/$totalPaths)\n\n";
        $curPathNum++;
    }
}
# Finish the grafts file, let the user repair placeholder lines if needed,
# then bake the grafts into history and produce the final clean clone.
# Buffered write errors only surface at close time, and an incomplete grafts
# file would silently produce wrong history.
$graftFile->close() or die "Unable to finish writing info/working_grafts: $!\n";
if( $graftFixupReqd )
{
    # pause here and let the user fix the manual grafts
    print "Manual graft fixing is required. Please edit $fusionRepo/info/working_grafts before continuing\n";
    print "Press Enter to continue...";
    # the prompt has no trailing newline, so flush it before blocking on input
    STDOUT->flush();
    <STDIN>;
    print "\n";
}
# Without info/grafts the filter-branch below would rewrite nothing useful
rename( "info/working_grafts", "info/grafts" )
    or die "Unable to rename info/working_grafts to info/grafts: $!\n";
# filter-branch to commit grafts
# Have to specify "tag-name-filter cat" to enable tags to be modified to point
# to new objects
print "Committing grafts to permant history...\n";
do_cmd( qw( git filter-branch --tag-name-filter cat ), '-d', $tempdir, '--', '--all' );
do_cmd( qw( rm -rf refs/original ) ); # work around a bug in clone that breaks tags if refs/original exists
print "\n";
#cd ..
chdir( $startDir ) or die "Unable to chdir back to $startDir: $!\n";
# clone to file:/// repo to remove any cruft
# clone treats a file URL as a URL and thus doesn't make hard links
do_cmd( qw( git clone --bare ), "file:///$startDir/$fusionRepo", "$startDir/$cleanDir/$svnRepoName.git" );
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment