Skip to content

Instantly share code, notes, and snippets.

@Pandapip1
Created June 8, 2024 14:37
Show Gist options
  • Save Pandapip1/2ee4da5a2d871792faf3d90098767704 to your computer and use it in GitHub Desktop.
Spack Deduplicate
#!/usr/bin/env python3
from pathlib import Path
import json
import sys
import filecmp
import os
import shutil
import subprocess
# Bytes considered "textual": BEL, BS, TAB, LF, FF, CR, ESC plus all of
# 0x20-0xFF except DEL (0x7f). Anything outside this set marks binary data.
_TEXT_CHARS = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7f})


def is_binary_string(data):
    """Return True if *data* (a bytes-like prefix of a file) looks binary.

    Deletes every textual byte via bytes.translate; any leftover byte means
    the sample contains non-text content. (Was a lambda whose parameter
    shadowed the builtin ``bytes``; PEP 8 prefers ``def`` for named functions.)
    """
    return bool(data.translate(None, _TEXT_CHARS))
def package_data_to_v7(package_data, database_version):
match int(database_version):
case 7:
return package_data
case 6:
package_data_v7 = package_data.copy()
package_data_v7['spec']['parameters']['build_system'] = "generic"
if "dependencies" in package_data_v7['spec']:
package_data_v7['spec']['dependencies'] = []
for dependency in package_data['spec']['dependencies']:
package_data_v7['spec']['dependencies'].append({
"name": dependency['name'],
"hash": dependency['hash'],
"parameters": {
"deptypes": dependency['type'],
"virtuals": []
}
})
return package_data_v7
case _:
raise ValueError(f"Unsupported database version {database_version}")
def package_data_from_v7(package_data, database_version):
match int(database_version):
case 7:
return package_data
case 6:
package_data_v6 = package_data.copy()
package_data_v6['spec']['parameters'].pop('build_system')
if "dependencies" in package_data_v6['spec']:
package_data_v6['spec']['dependencies'] = []
for dependency in package_data['spec']['dependencies']:
package_data_v6['spec']['dependencies'].append({
"name": dependency['name'],
"hash": dependency['hash'],
"type": dependency['parameters']['deptypes']
})
return package_data_v6
case _:
raise ValueError(f"Unsupported database version {database_version}")
class dircmp(filecmp.dircmp):
    """
    Directory comparison that classifies common files by *content*.

    filecmp.dircmp normally compares common files shallowly (os.stat
    signature); this subclass forces shallow=False so only byte-identical
    files land in same_files.

    NOTE(review): filecmp.dircmp dispatches lazily-computed attributes
    through the class-level ``methodmap`` of function objects, which on some
    Python versions bypasses subclass overrides — verify that this phase3 is
    actually invoked on the interpreter in use.
    """

    def phase3(self):
        """Populate same/diff/funny file lists using full content comparison."""
        same, diff, funny = filecmp.cmpfiles(
            self.left, self.right, self.common_files, shallow=False
        )
        self.same_files = same
        self.diff_files = diff
        self.funny_files = funny
def diff_dirs(dir1, dir2):
    """
    Recursively compare two directory trees by content.

    Returns:
        True  — if either path is not a directory (trivially different);
        False — if the trees are considered identical;
        list  — otherwise, relative paths of the differing, non-binary,
                non-ignored entries (may be empty after filtering).
    """
    if not os.path.isdir(dir1) or not os.path.isdir(dir2):
        return True
    diff = []
    compared = dircmp(dir1, dir2)
    # Anything present on only one side, differing, or uncomparable counts.
    diff += compared.left_only
    diff += compared.right_only
    diff += compared.diff_files
    diff += compared.funny_files
    # Directories whose contents are expected to vary between installs.
    ignore = [".spack", "bin", "include", "lib", "share", "cmake", "man", "doc", "db", "cache"]
    for subdir in compared.common_dirs:
        if subdir in ignore:
            continue
        subdir_diff = diff_dirs(os.path.join(dir1, subdir), os.path.join(dir2, subdir))
        # isinstance instead of type(...) != bool; True from a recursive call
        # (non-directory) contributes nothing, matching the original logic.
        if not isinstance(subdir_diff, bool):
            for diffentry in subdir_diff:
                diff.append(os.path.join(subdir, diffentry))
    if len(diff) == 0:
        return False
    diff = [entry for entry in diff if not any(entry.startswith(ignore_dir) for ignore_dir in ignore)]
    # Remove binary files. BUG FIX: the original did open(...).read(1024)
    # without closing, leaking one file handle per candidate entry.
    kept = []
    for entry in diff:
        path = Path(os.path.join(dir1, entry))
        if path.is_file():
            with open(path, 'rb') as fh:
                if is_binary_string(fh.read(1024)):
                    continue
        kept.append(entry)
    return kept
def recalculate_ref_count(package_hash, packages, recurse = True):
    """Recompute packages[package_hash]['ref_count'] by scanning all records.

    With recurse=True, first recomputes the counts of this package's own
    (still-present) dependencies. Returns a shallow copy of *packages* —
    the individual package dicts are shared and mutated in place.
    Raises ValueError if *package_hash* is absent.
    """
    # Check if the package hash is in the packages dictionary
    if package_hash not in packages:
        raise ValueError(f"Package hash /{package_hash} not found in the packages dictionary")
    # First, recurse
    if recurse:
        if 'dependencies' in packages[package_hash]['spec']:
            for dependency in packages[package_hash]['spec']['dependencies']:
                if dependency['hash'] in packages:
                    packages = recalculate_ref_count(dependency['hash'], packages)
    # Build, link, and run dependencies all count towards the ref count
    packages = packages.copy()  # shallow: nested package dicts remain shared
    packages[package_hash]['ref_count'] = 0
    for other_package_hash in packages.keys():
        if 'dependencies' not in packages[other_package_hash]['spec']:
            continue
        for dependency in packages[other_package_hash]['spec']['dependencies']:
            # NOTE(review): elsewhere 'deptypes' is treated as a container
            # ("'link' not in ...['deptypes']"), so this whole-value
            # membership test only matches when deptypes is a bare string —
            # confirm against the actual index.json schema.
            if dependency['hash'] == package_hash and dependency['parameters']['deptypes'] in ['build', 'link', 'run']:
                packages[package_hash]['ref_count'] += 1
    return packages
# Hashes already processed; also used transiently to break recursion cycles.
deduplicated = []


def deduplicate_spec(phash, packages):
    """Interactively merge duplicate installs of the package at /phash.

    Looks for other records with the same name+version, compares their
    install trees on disk, and (subject to a y/n prompt when trees differ)
    deletes the duplicate, rewrites dependents to point at phash, and
    checkpoints the whole database to the global `database_file`.
    Returns the (shallow-copied) updated packages mapping.

    NOTE(review): indentation was reconstructed from a whitespace-mangled
    paste — verify the nesting of the prompt/merge section against the
    original before relying on this in production.
    """
    global deduplicated
    if phash in deduplicated:
        return packages
    packages = packages.copy()
    if phash not in packages:
        return packages
    for other_package_hash in list(packages.keys()):
        if other_package_hash == phash:
            continue
        if other_package_hash not in packages:
            continue
        # If it's a dependent, deduplicate it first
        if 'dependencies' in packages[other_package_hash]['spec']:
            for dependency in packages[other_package_hash]['spec']['dependencies']:
                if dependency['name'] == packages[phash]['spec']['name']:
                    deduplicated.append(phash)  # Prevent infinite recursion
                    packages = deduplicate_spec(other_package_hash, packages)
                    deduplicated.remove(phash)
                    break
    spec = packages[phash]['spec']
    # Classify who depends (via link/run edges) on this exact hash vs. on
    # sibling hashes of the same name+version.
    cur_hash_has_dependents = False
    other_hashes_with_dependents = set()
    for other_package_hash in list(packages.keys()):
        if 'dependencies' in packages[other_package_hash]['spec']:
            for dependency in packages[other_package_hash]['spec']['dependencies']:
                if 'link' not in dependency['parameters']['deptypes'] and 'run' not in dependency['parameters']['deptypes']:
                    continue
                if dependency['hash'] == phash:
                    cur_hash_has_dependents = True
                    break
                elif dependency['name'] == spec['name'] and dependency['hash'] in packages and packages[dependency['hash']]['spec']['version'] == spec['version']:
                    other_hashes_with_dependents.add(dependency['hash'])
    # Sibling records with identical name+version but a different hash.
    alternative_hashes = [package_hash for package_hash, package in packages.items() if package['spec']['name'] == spec['name'] and package['spec']['version'] == spec['version'] and package['spec']['hash'] != phash]
    # If exactly one sibling actually has dependents and we have none,
    # deduplicate from that sibling's perspective instead.
    if len(other_hashes_with_dependents) == 1 and not cur_hash_has_dependents:
        return deduplicate_spec(list(other_hashes_with_dependents)[0], packages)
    if len(alternative_hashes) > 0:
        if not cur_hash_has_dependents and len(other_hashes_with_dependents) == 0:
            # Nobody depends on any copy: prefer the copy built with the
            # newest compiler (dotted-version, component-wise comparison).
            alt_hash_with_newest_compiler = phash
            newest_compiler = list(map(int, spec['compiler']['version'].split('.')))
            for alternative_hash in alternative_hashes:
                current_compiler = list(map(int, packages[alternative_hash]['spec']['compiler']['version'].split('.')))
                for i in range(min(len(newest_compiler), len(current_compiler))):
                    if current_compiler[i] > newest_compiler[i]:
                        alt_hash_with_newest_compiler = alternative_hash
                        newest_compiler = current_compiler
                        break
                    elif current_compiler[i] < newest_compiler[i]:
                        break
                else:
                    # All shared components equal: the longer version wins.
                    if len(current_compiler) > len(newest_compiler):
                        alt_hash_with_newest_compiler = alternative_hash
                        newest_compiler = current_compiler
            # (alt_hash_with_newest_compiler starts as phash, so the
            # "is not None" guard is always true — kept for fidelity.)
            if alt_hash_with_newest_compiler is not None and alt_hash_with_newest_compiler != phash:
                return deduplicate_spec(alt_hash_with_newest_compiler, packages)
        for alternative_hash in alternative_hashes:
            if alternative_hash in other_hashes_with_dependents and packages[alternative_hash]['spec']['compiler']['version'] != spec['compiler']['version']:
                print(f"Not merging packages {spec['name']} {spec['version']} /{alternative_hash} and {spec['name']} {spec['version']} /{phash} due to different compilers ({packages[alternative_hash]['spec']['compiler']['version']} and {spec['compiler']['version']})")
                continue
            pdiff = diff_dirs(packages[phash]['path'], packages[alternative_hash]['path'])
            if pdiff:
                print(f"Not merging packages {spec['name']} {spec['version']} /{alternative_hash} and {spec['name']} {spec['version']} /{phash} due to modified files")
                if type(pdiff) != bool:
                    for diffentry in pdiff:
                        print(f" {diffentry}")
                elif not os.path.isdir(packages[phash]['path']):
                    print(f"From directory {packages[phash]['path']} is not directory")
                elif not os.path.isdir(packages[alternative_hash]['path']):
                    print(f"To directory {packages[alternative_hash]['path']} is not directory")
                else:
                    print(f"Unknown")
                # Interactive override: anything but 'y' skips this merge.
                if input("Continue? (y/n) ") != 'y':
                    deduplicated.append(alternative_hash)
                    continue
            print(f"Replacing {spec['name']} {spec['version']} /{alternative_hash} with {spec['name']} {spec['version']} /{phash}")
            # If second package is manually installed, mark first package as manually installed
            if packages[alternative_hash]['installed']:
                packages[phash]['installed'] = True
            if packages[alternative_hash]['explicit']:
                packages[phash]['explicit'] = True
            # Delete second package
            del packages[alternative_hash]
            # Update packages to have references to alternative_hash replaced with phash
            for other_package_hash in list(packages.keys()):
                package = packages[other_package_hash]
                if 'dependencies' in package['spec']:
                    for dependency_idx in range(len(package['spec']['dependencies'])):
                        if package['spec']['dependencies'][dependency_idx]['hash'] == alternative_hash:
                            packages[other_package_hash]['spec']['dependencies'][dependency_idx]['hash'] = phash
            packages[phash]['spec'] = spec
            packages = recalculate_ref_count(phash, packages, False)
    deduplicated.append(phash)
    # Checkpoint: rewrite the whole database (global `database_file`) after
    # each package so an interrupted run loses at most one merge.
    with open(database_file, 'w') as f:
        db_out = {
            "database": {
                "version": "7",
                "installs": packages
            }
        }
        json.dump(db_out, f, indent=4)
    return packages
# --- Driver: load the spack index.json given as argv[1], prune records whose
# --- install path is gone, then deduplicate everything that remains.
database_file = sys.argv[1]
with open(database_file, 'r') as f:
    database = json.load(f)
database_version = database['database']['version']
if int(database_version) not in [6, 7]:
    raise ValueError(f"Unsupported database version {database_version}")
# Normalize every install record to v7 format, keyed by hash.
packages = {}
for package_hash, package_data in list(database['database']['installs'].items()):
    package_data = package_data_to_v7(package_data, database_version)
    package_name = package_data['spec']['name']      # unused; kept for fidelity
    package_version = package_data['spec']['version']  # unused; kept for fidelity
    if package_hash not in packages:
        packages[package_hash] = package_data
# Remove packages where the path does not exist
for package_hash, package_data in list(packages.items()):
    if 'path' not in package_data or package_data['path'] is None or not os.path.exists(package_data['path']):
        # Try to find a same-name+version record to stand in for it.
        suitable_alt_hash = None
        for alt_hash, alt_data in packages.items():
            if alt_hash == package_hash:
                continue
            if alt_data['spec']['name'] == package_data['spec']['name'] and alt_data['spec']['version'] == package_data['spec']['version']:
                suitable_alt_hash = alt_hash
                break
        if suitable_alt_hash is not None:
            print(f"Package {package_data['spec']['name']} {package_data['spec']['version']} /{package_hash} has no path, replacing with {suitable_alt_hash}")
            del packages[package_hash]
            # Rewrite every dependent edge to point at the replacement hash.
            for other_package_hash in packages.keys():
                package = packages[other_package_hash]
                if 'dependencies' in package['spec']:
                    for dependency_idx in range(len(package['spec']['dependencies'])):
                        if package['spec']['dependencies'][dependency_idx]['hash'] == package_hash:
                            packages[other_package_hash]['spec']['dependencies'][dependency_idx]['hash'] = suitable_alt_hash
            packages = recalculate_ref_count(suitable_alt_hash, packages)
        else:
            print(f"Package {package_data['spec']['name']} {package_data['spec']['version']} /{package_hash} has no path and no suitable alternative")
            # No stand-in: keep the record only if something depends on it.
            has_dependents = False
            for other_package_hash in packages.keys():
                package = packages[other_package_hash]
                if 'dependencies' in package['spec']:
                    for dependency in package['spec']['dependencies']:
                        if dependency['hash'] == package_hash:
                            has_dependents = True
                            break
            if has_dependents:
                print(f"Package {package_data['spec']['name']} {package_data['spec']['version']} /{package_hash} has dependents")
            else:
                print(f"Package {package_data['spec']['name']} {package_data['spec']['version']} /{package_hash} has no dependents, removing")
                del packages[package_hash]
# Persist the pruned database before starting the interactive deduplication.
with open(database_file, 'w') as f:
    db_out = {
        "database": {
            "version": "7",
            "installs": packages
        }
    }
    json.dump(db_out, f, indent=4)
# Deduplicate
phashes = list(packages.keys())
for phash in phashes:
    if phash not in packages:
        continue  # already merged away by an earlier call
    packages = deduplicate_spec(phash, packages)
#!/usr/bin/env python3
from pathlib import Path
import json
import sys
import filecmp
import os
import shutil
def package_data_to_v7(package_data, database_version):
match int(database_version):
case 7:
return package_data
case 6:
package_data_v7 = package_data.copy()
package_data_v7['spec']['parameters']['build_system'] = "generic"
if "dependencies" in package_data_v7['spec']:
package_data_v7['spec']['dependencies'] = []
for dependency in package_data['spec']['dependencies']:
package_data_v7['spec']['dependencies'].append({
"name": dependency['name'],
"hash": dependency['hash'],
"parameters": {
"deptypes": dependency['type'],
"virtuals": []
}
})
return package_data_v7
case _:
raise ValueError(f"Unsupported database version {database_version}")
def package_data_from_v7(package_data, database_version):
match int(database_version):
case 7:
return package_data
case 6:
package_data_v6 = package_data.copy()
package_data_v6['spec']['parameters'].pop('build_system')
if "dependencies" in package_data_v6['spec']:
package_data_v6['spec']['dependencies'] = []
for dependency in package_data['spec']['dependencies']:
package_data_v6['spec']['dependencies'].append({
"name": dependency['name'],
"hash": dependency['hash'],
"type": dependency['parameters']['deptypes']
})
return package_data_v6
case _:
raise ValueError(f"Unsupported database version {database_version}")
class dircmp(filecmp.dircmp):
    """
    Directory comparison that classifies common files by *content*.

    filecmp.dircmp normally compares common files shallowly (os.stat
    signature); this subclass forces shallow=False so only byte-identical
    files land in same_files.

    NOTE(review): filecmp.dircmp dispatches lazily-computed attributes
    through the class-level ``methodmap`` of function objects, which on some
    Python versions bypasses subclass overrides — verify that this phase3 is
    actually invoked on the interpreter in use.
    """

    def phase3(self):
        """Populate same/diff/funny file lists using full content comparison."""
        same, diff, funny = filecmp.cmpfiles(
            self.left, self.right, self.common_files, shallow=False
        )
        self.same_files = same
        self.diff_files = diff
        self.funny_files = funny
def recalculate_ref_count(package_hash, packages, recurse = True):
    """Recompute packages[package_hash]['ref_count'] by scanning all records.

    With recurse=True, first recomputes the counts of this package's own
    (still-present) dependencies. Returns a shallow copy of *packages* —
    the individual package dicts are shared and mutated in place.
    Raises ValueError if *package_hash* is absent.
    """
    # Check if the package hash is in the packages dictionary
    if package_hash not in packages:
        raise ValueError(f"Package hash /{package_hash} not found in the packages dictionary")
    # First, recurse
    if recurse:
        if 'dependencies' in packages[package_hash]['spec']:
            for dependency in packages[package_hash]['spec']['dependencies']:
                if dependency['hash'] in packages:
                    packages = recalculate_ref_count(dependency['hash'], packages)
    # Build, link, and run dependencies all count towards the ref count
    packages = packages.copy()  # shallow: nested package dicts remain shared
    packages[package_hash]['ref_count'] = 0
    for other_package_hash in packages.keys():
        if 'dependencies' not in packages[other_package_hash]['spec']:
            continue
        for dependency in packages[other_package_hash]['spec']['dependencies']:
            # NOTE(review): elsewhere 'deptypes' is treated as a container,
            # so this whole-value membership test only matches when deptypes
            # is a bare string — confirm against the index.json schema.
            if dependency['hash'] == package_hash and dependency['parameters']['deptypes'] in ['build', 'link', 'run']:
                packages[package_hash]['ref_count'] += 1
    return packages
def copy_old_path(package_location, optspack_path):
    """
    Copy an old installation into the new spack prefix, preserving the
    <platform>/<compiler>/<package> tail of the old path. When the target
    already exists, syncs it: copies left-only entries, removes right-only
    entries, and replaces content-differing files.

    Returns the new package location as a Path, or None when the old
    location does not exist. (The original docstring claimed hard links
    were used; shutil.copytree/copy2 actually copy the data.)
    """
    optspack_path = Path(optspack_path).resolve()
    package_location = Path(package_location).resolve()
    if not package_location.exists():
        print(f"Old package location {package_location} does not exist")
        return
    # Reconstruct <platform>/<compiler>/<package> from the old path's tail.
    package_folder_name = os.path.basename(package_location)
    compiler_name = os.path.basename(os.path.dirname(package_location))
    platform_name = os.path.basename(os.path.dirname(os.path.dirname(package_location)))
    new_package_location = Path(optspack_path / platform_name / compiler_name / package_folder_name)
    if new_package_location.exists():
        print(f"Location {new_package_location} already exists. Diffing directories...")
        compared = dircmp(package_location, new_package_location)
        if compared.left_only:
            print(f"Left only: {compared.left_only}")
        if compared.right_only:
            print(f"Right only: {compared.right_only}")
        if compared.diff_files:
            print(f"Diff files: {compared.diff_files}")
        if not compared.left_only and not compared.right_only and not compared.diff_files:
            print(f"Directories are the same")
            return new_package_location
        if compared.left_only:
            for left_only_file in compared.left_only:
                # Copy to the new package location
                left_only_file_path = Path(package_location / left_only_file)
                if left_only_file_path.is_dir():
                    print(f"Copying directory {left_only_file_path} to {new_package_location}")
                    shutil.copytree(left_only_file_path, new_package_location / left_only_file, symlinks=True, dirs_exist_ok=True)
                elif left_only_file_path.is_file():
                    # BUG FIX: the original condition also required
                    # "not left_only_file_path.exists()", which can never be
                    # true when is_file() is — the branch was dead and
                    # left-only files were silently never copied.
                    print(f"Copying file {left_only_file_path} to {new_package_location}")
                    shutil.copy2(left_only_file_path, new_package_location)
        if compared.right_only:
            for right_only_file in compared.right_only:
                right_only_file_path = Path(new_package_location / right_only_file)
                if right_only_file_path.is_dir():
                    print(f"Removing directory {right_only_file_path}")
                    shutil.rmtree(right_only_file_path)
                elif right_only_file_path.is_file():
                    # (is_file() already implies exists(); redundant check dropped)
                    print(f"Removing file {right_only_file_path}")
                    os.remove(right_only_file_path)
        if compared.diff_files:
            for diff_file in compared.diff_files:
                print(f"Removing file {new_package_location / diff_file}")
                os.remove(new_package_location / diff_file)
                print(f"Copying file {package_location / diff_file} to {new_package_location}")
                shutil.copy2(package_location / diff_file, new_package_location)
    else:
        print(f"Copying package files from {package_location} to {new_package_location}")
        shutil.copytree(package_location, new_package_location, symlinks=True, dirs_exist_ok=True)
    return new_package_location
# Hashes already copied into the destination database (memoizes recursion).
copied = []


def copy_old_package(package_hash, packages_from, packages_to, database_file_to, optspack_path):
    """Recursively copy one package (dependencies first) from the source DB
    into the destination DB, copying its install tree via copy_old_path.

    Returns the updated (packages_from, packages_to) pair; the top-level
    mappings are shallow copies, the package dicts themselves are shared.
    Raises ValueError if *package_hash* is missing from the source.

    NOTE(review): *database_file_to* is accepted and threaded through the
    recursion but never used in this body — confirm whether a per-package
    checkpoint write was intended.
    """
    global copied
    if package_hash in copied:
        return packages_from, packages_to
    # Ensure the package hash is in the source database
    if package_hash not in packages_from:
        raise ValueError(f"Package hash /{package_hash} not found in the source database")
    # Fetch information about the package
    package_name = packages_from[package_hash]['spec']['name']
    package_version = packages_from[package_hash]['spec']['version']
    # Copy the inputs to prevent modification of the original dictionaries
    packages_from, packages_to = packages_from.copy(), packages_to.copy()
    # Otherwise, we have work to do. Start by copying the dependencies to ensure that we don't accidentally merge
    # packages that have conflicting dependencies.
    if 'dependencies' in packages_from[package_hash]['spec']:
        for dependency in packages_from[package_hash]['spec']['dependencies'].copy():  # A copy is needed because we're modifying the dictionary
            packages_from, packages_to = copy_old_package(dependency['hash'], packages_from, packages_to, database_file_to, optspack_path)
    print(f"Copying package {package_name}@{package_version} /{package_hash} to the destination database")
    packages_to[package_hash] = packages_from[package_hash].copy()
    # copy_old_path may return None (missing source tree); str(None) == "None"
    # would then be stored as the path — NOTE(review): confirm that is intended.
    packages_to[package_hash]['path'] = str(copy_old_path(packages_from[package_hash]['path'], optspack_path))
    packages_to = recalculate_ref_count(package_hash, packages_to)
    copied.append(package_hash)
    return packages_from, packages_to
# Hashes already processed by deduplicate_spec (memoization).
deduplicated = []


def deduplicate_spec(phash, packages):
    """Non-interactive variant: merge duplicate installs of the package at
    /phash after deduplicating its dependencies (visited in sorted-hash
    order for determinism). Duplicates whose install trees differ are
    skipped with a report; identical ones are deleted and their dependents
    rewritten to point at phash.

    NOTE(review): this second script calls diff_dirs, which is NOT defined
    anywhere in it (only in the first script) — as pasted, this function
    raises NameError at runtime.

    NOTE(review): indentation was reconstructed from a whitespace-mangled
    paste — verify nesting against the original.
    """
    global deduplicated
    if phash in deduplicated:
        return packages
    packages = packages.copy()
    if phash not in packages:
        return packages
    spec = packages[phash]['spec']
    if 'dependencies' in spec:
        spec['dependencies'] = sorted(spec['dependencies'], key=lambda x: x['hash'])
        for dependency in spec['dependencies']:
            packages = deduplicate_spec(dependency['hash'], packages)
    # Sibling records with identical name+version but a different hash.
    alternative_hashes = [package_hash for package_hash, package in packages.items() if package['spec']['name'] == spec['name'] and package['spec']['version'] == spec['version'] and package['spec']['hash'] != phash]
    if len(alternative_hashes) > 0:
        for alternative_hash in alternative_hashes:
            pdiff = diff_dirs(packages[phash]['path'], packages[alternative_hash]['path'])
            if pdiff:
                print(f"Not merging packages {spec['name']} {spec['version']} /{alternative_hash} and {spec['name']} {spec['version']} /{phash} due to modified files")
                if type(pdiff) != bool:
                    for diffentry in pdiff:
                        print(f" {diffentry}")
                elif not os.path.isdir(packages[phash]['path']):
                    print(f"From directory {packages[phash]['path']} is not directory")
                elif not os.path.isdir(packages[alternative_hash]['path']):
                    print(f"To directory {packages[alternative_hash]['path']} is not directory")
                else:
                    print(f"Unknown")
                continue
            print(f"Replacing {spec['name']} {spec['version']} /{alternative_hash} with {spec['name']} {spec['version']} /{phash}")
            # Merge dependencies
            # (disabled experiment kept as an inert string literal, verbatim)
            """if 'dependencies' in packages[alternative_hash]['spec']:
if 'dependencies' not in spec:
spec['dependencies'] = []
for dependency in packages[alternative_hash]['spec']['dependencies']:
if dependency not in spec['dependencies']:
spec['dependencies'].append(dependency)
if 'dependencies' in spec:
for dependency_idx in range(len(spec['dependencies'])):
# If alternative uses higher version, replace
dependency = spec['dependencies'][dependency_idx]
dependency_spec = packages[dependency['hash']]['spec']
alternative_dependency = next((dependency for dependency in packages[alternative_hash]['spec']['dependencies'] if dependency['name'] == spec['dependencies'][dependency_idx]['name']), None)
if alternative_dependency and alternative_dependency['hash'] != spec['dependencies'][dependency_idx]['hash']:
alternative_dependency_spec = packages[alternative_dependency['hash']]['spec']
dependency_version_split = dependency_spec['version'].split('.')
alternative_dependency_version_split = alternative_dependency_spec['version'].split('.')
use_alternative = False
for i in range(min(len(dependency_version_split), len(alternative_dependency_version_split))):
dependency_v_part = dependency_version_split[i]
alternative_v_part = alternative_dependency_version_split[i]
# Cut off at the first non-numeric character
dependency_v_part = dependency_v_part[:next((i for i, c in enumerate(dependency_v_part) if not c.isnumeric()), len(dependency_v_part))]
alternative_v_part = alternative_v_part[:next((i for i, c in enumerate(alternative_v_part) if not c.isnumeric()), len(alternative_v_part))]
if dependency_v_part == '':
use_alternative = True
break
if alternative_v_part == '':
break
if int(dependency_v_part) < int(alternative_v_part):
use_alternative = True
break
if int(dependency_v_part) > int(alternative_v_part):
break
if use_alternative:
print(f"Replacing dependency {dependency['name']} {dependency_spec['version']} /{dependency['hash']} with {alternative_dependency_spec['version']} /{alternative_dependency['hash']} in {spec['name']} {spec['version']} /{phash}")
spec['dependencies'][dependency_idx] = alternative_dependency
packages = recalculate_ref_count(alternative_dependency['hash'], packages)
packages = recalculate_ref_count(dependency['hash'], packages)
"""
            # Delete second package
            del packages[alternative_hash]
            # Update packages to have references to alternative_hash replaced with phash
            for other_package_hash in list(packages.keys()):
                package = packages[other_package_hash]
                if 'dependencies' in package['spec']:
                    for dependency_idx in range(len(package['spec']['dependencies'])):
                        if package['spec']['dependencies'][dependency_idx]['hash'] == alternative_hash:
                            packages[other_package_hash]['spec']['dependencies'][dependency_idx]['hash'] = phash
            packages[phash]['spec'] = spec
            packages = recalculate_ref_count(phash, packages, False)
    deduplicated.append(phash)
    return packages
# --- Driver: copy every package (and its install tree) from the spack
# --- prefix given as argv[1] into the prefix given as argv[2], then write
# --- the merged destination index.json in v7 format.
optspack_path_from = Path(sys.argv[1])
optspack_path = Path(sys.argv[2])
database_file_from = optspack_path_from / ".spack-db" / "index.json"
database_file_to = optspack_path / ".spack-db" / "index.json"
with open(database_file_from, 'r') as f:
    database_from = json.load(f)
with open(database_file_to, 'r') as f:
    database_to = json.load(f)
database_from_version = database_from['database']['version']
database_to_version = database_to['database']['version']
if int(database_from_version) not in [6, 7]:
    raise ValueError(f"Unsupported database version {database_from_version}")
if int(database_to_version) not in [6, 7]:
    raise ValueError(f"Unsupported database version {database_to_version}")
# Normalize both databases to v7 records keyed by hash.
packages_from = {}
packages_to = {}
for package_hash, package_data in list(database_from['database']['installs'].items()):
    package_data = package_data_to_v7(package_data, database_from_version)
    package_name = package_data['spec']['name']      # unused; kept for fidelity
    package_version = package_data['spec']['version']  # unused; kept for fidelity
    if package_hash not in packages_from:
        packages_from[package_hash] = package_data
for package_hash, package_data in list(database_to['database']['installs'].items()):
    package_data = package_data_to_v7(package_data, database_to_version)
    package_name = package_data['spec']['name']      # unused; kept for fidelity
    package_version = package_data['spec']['version']  # unused; kept for fidelity
    if package_hash not in packages_to:
        packages_to[package_hash] = package_data
# Copy every source package (dependencies first, via recursion).
for package_hash in list(packages_from.keys()):
    packages_from, packages_to = copy_old_package(package_hash, packages_from, packages_to, database_file_to, optspack_path)
    assert package_hash in packages_to, f"Package {package_hash} somehow not in packages_to"
# Persist the merged destination database once all copies succeeded.
with open(database_file_to, 'w') as f:
    db_out = {
        "database": {
            "version": "7",
            "installs": packages_to
        }
    }
    json.dump(db_out, f, indent=4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment