Created
December 2, 2015 15:36
-
-
Save isarandi/8ec43e90a4129f7ee5b6 to your computer and use it in GitHub Desktop.
Sometimes I extract a zip or rar archive and don't bother to delete either the archive or the extracted files, so both the archive and its extracted contents take up disk space. This script hunts for such cases and offers to delete the extracted files. It could also be modified to offer to delete the archive instead.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding=utf8 | |
import os, zipfile, rarfile, tarfile, shutil | |
# Avoid encoding problems on Python 2, where implicit str/unicode conversions
# use ASCII by default and can fail on non-ASCII file names.
import sys

if sys.version_info[0] == 2:
    # reload() brings back sys.setdefaultencoding, which site.py removes at
    # start-up. Neither the builtin reload() nor setdefaultencoding exists on
    # Python 3, so the hack is only applied on Python 2.
    reload(sys)
    sys.setdefaultencoding('utf8')
def remove_file_or_tree(p):
    """Delete the file, symlink, or directory tree at ``p``; do nothing if absent.

    Symbolic links are unlinked rather than followed: ``shutil.rmtree``
    refuses to operate on a symlink, and ``os.path.exists`` reports a
    dangling link as missing, so both cases need explicit handling.

    Args:
        p: Path to remove.
    """
    if os.path.islink(p):
        # Remove the link itself (dangling or not), never its target.
        os.remove(p)
    elif os.path.isdir(p):
        shutil.rmtree(p)
    elif os.path.exists(p):
        os.remove(p)
def get_paths_inside_archive(archive_path):
    """Return the member paths stored in a .zip, .rar or .tar.gz archive.

    The archive type is chosen from the (case-sensitive) file name suffix.
    Each archive is opened in a ``with`` block so that its file handle is
    closed as soon as the listing has been read.

    Args:
        archive_path: Path of the archive file to inspect.

    Returns:
        A list of member names as recorded in the archive. An empty list is
        returned for unsupported suffixes, for archives that cannot be
        parsed (reported on stdout as "bad file"), and for RAR files that
        raise ``rarfile.NeedFirstVolume`` (skipped silently).
    """
    # NOTE: print() is always given one pre-joined string so the output is
    # the same under the Python 2 print statement and the Python 3 function.
    if archive_path.endswith(".zip"):
        try:
            with zipfile.ZipFile(archive_path) as archive:
                return archive.namelist()
        except zipfile.BadZipfile:
            print(" ".join(("bad file: ", archive_path)))
    elif archive_path.endswith(".rar"):
        try:
            with rarfile.RarFile(archive_path) as archive:
                return archive.namelist()
        except rarfile.NotRarFile:
            print(" ".join(("bad file: ", archive_path)))
        except rarfile.NeedFirstVolume:
            # Presumably a later part of a multi-volume set; it is
            # deliberately ignored without a message.
            pass
    elif archive_path.endswith(".tar.gz"):
        try:
            with tarfile.open(archive_path, 'r:gz') as archive:
                return archive.getnames()
        except tarfile.TarError:
            print(" ".join(("bad file: ", archive_path)))
    return []
def targz_aware_splitext(path):
    """Split ``path`` into ``(stem, extension)``, keeping ".tar.gz" whole.

    Behaves like ``os.path.splitext`` except that a trailing ".tar.gz" is
    returned as a single extension instead of just ".gz".
    """
    compound_ext = ".tar.gz"
    if not path.endswith(compound_ext):
        return os.path.splitext(path)
    # Index at which the compound extension begins.
    cut = len(path) - len(compound_ext)
    return path[:cut], path[cut:]
def _is_strictly_inside(parent_dir, path):
    """Return True if ``path`` normalizes to a location below ``parent_dir``."""
    parent = os.path.abspath(parent_dir)
    child = os.path.abspath(path)
    if child == parent:
        return False
    return child.startswith(parent.rstrip(os.sep) + os.sep)


def delete_redundant_extracted_files(root_dir_of_search, min_size_bytes=50 * 1024 * 1024):
    """Offer to delete files that were extracted next to their archive.

    Walks ``root_dir_of_search``. For every .zip, .rar or .tar.gz file, each
    archive member is looked for in two places: directly beside the archive,
    and inside a sibling directory named after the archive (without its
    extension). If any path found this way is larger than ``min_size_bytes``,
    the user is asked on the console whether to delete them. On the answer
    "y", all found paths and the directory named after the archive are
    removed.

    Member names come from the archive and are not trusted. Names such as
    ".", "..", "../x" or absolute paths would otherwise point at the
    archive's own directory or outside it, so only candidates strictly
    inside the archive's directory, and never the archive itself, are
    considered for deletion.

    Args:
        root_dir_of_search: Directory tree to scan for archives.
        min_size_bytes: Size an extracted path must exceed before the user is
            prompted for that archive. Defaults to 50 MiB.
    """
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3

    for dir_path, _subdirs, file_list in os.walk(root_dir_of_search):
        for filename in file_list:
            archive_basename, ext = targz_aware_splitext(filename)
            if ext not in (".zip", ".rar", ".tar.gz"):
                continue

            filepath = os.path.join(dir_path, filename)
            archive_abspath = os.path.abspath(filepath)

            redundant_extracted_paths = []
            for filename_in_archive in get_paths_inside_archive(filepath):
                candidates = (
                    os.path.join(dir_path, filename_in_archive),
                    os.path.join(dir_path, archive_basename, filename_in_archive),
                )
                for candidate in candidates:
                    if not _is_strictly_inside(dir_path, candidate):
                        # Would delete dir_path itself or something outside it.
                        continue
                    if os.path.abspath(candidate) == archive_abspath:
                        continue
                    if os.path.exists(candidate):
                        redundant_extracted_paths.append(candidate)

            if not any(os.path.getsize(p) > min_size_bytes for p in redundant_extracted_paths):
                continue

            if ask('Delete all redundant files extracted from ' + filepath + '? (y/n) ') == "y":
                for extracted_path in redundant_extracted_paths:
                    remove_file_or_tree(extracted_path)
                extracted_root = os.path.join(dir_path, archive_basename)
                # An archive named exactly ".tar.gz" has an empty basename,
                # which would make extracted_root the directory itself.
                if _is_strictly_inside(dir_path, extracted_root):
                    remove_file_or_tree(extracted_root)
            print("\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment