Skip to content

Instantly share code, notes, and snippets.

@isarandi
Created December 2, 2015 15:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save isarandi/8ec43e90a4129f7ee5b6 to your computer and use it in GitHub Desktop.
Save isarandi/8ec43e90a4129f7ee5b6 to your computer and use it in GitHub Desktop.
Sometimes I extract a zip, rar, or tar.gz archive and never get around to deleting either the archive or its extracted contents, so both end up taking disk space. This script hunts for such cases and offers to delete the extracted files. It could also be modified to offer to delete the archive instead.
# encoding=utf8
import os, zipfile, rarfile, tarfile, shutil
# avoid encoding problems
# NOTE(review): this is a Python 2-only idiom (the rest of this script also
# uses Python 2 `print` statements and `raw_input`). It changes the
# process-wide default string encoding to UTF-8. Presumably `sys` is reloaded
# first because `setdefaultencoding` is not otherwise reachable after
# interpreter start-up -- confirm against the Python 2 `sys` documentation.
import sys
reload(sys)
sys.setdefaultencoding('utf8')
def remove_file_or_tree(p):
    """Delete the filesystem entry at `p`, whatever kind it is.

    A real directory is removed recursively; anything else (regular file,
    symbolic link, ...) is unlinked. A path that does not exist is ignored.

    Symbolic links are never followed: the link itself is removed and its
    target is left untouched. This also means dangling links are cleaned up,
    and a link pointing at a directory no longer makes `shutil.rmtree` fail
    (it refuses to operate on symlinks).
    """
    # lexists() is true for dangling symlinks too, unlike exists().
    if not os.path.lexists(p):
        return
    if os.path.isdir(p) and not os.path.islink(p):
        shutil.rmtree(p)
    else:
        os.remove(p)
def get_paths_inside_archive(archive_path):
    """Return the member names stored in the archive at `archive_path`.

    The archive type is chosen from the file name suffix: ".zip", ".rar" or
    ".tar.gz". Any other suffix yields an empty list.

    An archive that cannot be parsed is reported on stdout as a "bad file"
    and yields an empty list. A RAR file that is not the first volume of a
    multi-volume set is skipped silently.

    Every archive is opened in a `with` block so its file handle is released
    immediately; walking a large tree would otherwise keep one descriptor
    open per archive until garbage collection.
    """
    if archive_path.endswith(".zip"):
        try:
            with zipfile.ZipFile(archive_path) as archive:
                return archive.namelist()
        except zipfile.BadZipfile:
            print("bad file:  " + archive_path)
    elif archive_path.endswith(".rar"):
        try:
            with rarfile.RarFile(archive_path) as archive:
                return archive.namelist()
        except rarfile.NotRarFile:
            print("bad file:  " + archive_path)
        except rarfile.NeedFirstVolume:
            # Later volumes of a multi-part set are covered by the first one.
            pass
    elif archive_path.endswith(".tar.gz"):
        try:
            with tarfile.open(archive_path, 'r:gz') as archive:
                return archive.getnames()
        except (tarfile.TarError, EOFError):
            # EOFError: the gzip stream was truncated part-way through.
            print("bad file:  " + archive_path)
    return []
def targz_aware_splitext(path, compound_exts=(".tar.gz",)):
    """Split `path` into (root, extension), keeping compound extensions whole.

    Works like `os.path.splitext`, except that a path ending in one of
    `compound_exts` is split in front of the whole compound suffix, so
    "a.tar.gz" gives ("a", ".tar.gz") rather than ("a.tar", ".gz").

    `compound_exts` is an iterable of suffixes to treat as a single
    extension; the first one that matches wins. It defaults to
    (".tar.gz",). Matching is case-sensitive.
    """
    for compound in compound_exts:
        # An empty suffix would match everything and slice away the whole
        # path, so it is ignored.
        if compound and path.endswith(compound):
            return path[:-len(compound)], compound
    return os.path.splitext(path)
def _is_strictly_inside(path, directory):
    """Return True if `path` is a proper descendant of `directory` (lexical check, symlinks not resolved)."""
    path = os.path.abspath(path)
    directory = os.path.abspath(directory)
    if path == directory:
        return False
    # join(directory, "") leaves exactly one trailing separator, so "/a/bc"
    # is not mistaken for a child of "/a/b".
    return path.startswith(os.path.join(directory, ""))


def delete_redundant_extracted_files(root_dir_of_search):
    """Walk `root_dir_of_search` and offer to delete already-extracted archive contents.

    For every ".zip", ".rar" or ".tar.gz" file found, each member name is
    looked up in two places: directly next to the archive, and inside a
    sibling directory named after the archive (its name without the
    extension). If any of the existing matches is larger than 50 MiB the
    user is asked on stdin whether to delete them; answering "y" removes all
    matches and, if extracted content was found inside it, the sibling
    directory as well.

    Member names come from the archive and are treated as untrusted: a name
    that resolves to the containing directory itself (e.g. the "." entry
    tar archives often carry), to somewhere outside it (absolute paths,
    ".." components), or to the archive file itself is never offered for
    deletion.
    """
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3

    size_threshold = 50 * 1024 * 1024

    for dir_path, _subdirs, file_list in os.walk(root_dir_of_search):
        for filename in file_list:
            archive_basename, ext = targz_aware_splitext(filename)
            if ext not in (".zip", ".rar", ".tar.gz"):
                continue
            filepath = os.path.join(dir_path, filename)
            # The listing was taken before any deletion in this directory; an
            # archive that was itself extracted from another archive may
            # already have been removed.
            if not os.path.isfile(filepath):
                continue
            archive_abspath = os.path.abspath(filepath)
            extracted_root = os.path.join(dir_path, archive_basename)

            redundant_extracted_paths = []
            for filename_in_archive in get_paths_inside_archive(filepath):
                candidates = (
                    (os.path.join(dir_path, filename_in_archive), dir_path),
                    (os.path.join(extracted_root, filename_in_archive), extracted_root),
                )
                for candidate, container in candidates:
                    if not _is_strictly_inside(candidate, container):
                        continue
                    if os.path.abspath(candidate) == archive_abspath:
                        continue
                    if os.path.exists(candidate):
                        redundant_extracted_paths.append(candidate)

            if not any(os.path.getsize(p) > size_threshold for p in redundant_extracted_paths):
                continue

            if ask('Delete all redundant files extracted from ' + filepath + '? (y/n) ') == "y":
                # Decide before deleting: only clear the sibling directory if
                # it really holds extracted content, and never when it
                # resolves to the archive's own directory (a file named
                # exactly ".tar.gz" has an empty base name).
                remove_root = (
                    _is_strictly_inside(extracted_root, dir_path)
                    and any(_is_strictly_inside(p, extracted_root)
                            for p in redundant_extracted_paths)
                )
                for extracted_path in redundant_extracted_paths:
                    remove_file_or_tree(extracted_path)
                if remove_root:
                    remove_file_or_tree(extracted_root)
            print("\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment