# isarandi/delete_redundant_extracted.py

## delete_redundant_extracted.py
# encoding=utf8
import os, zipfile, rarfile, tarfile, shutil

# Avoid encoding problems: set the interpreter-wide default string encoding
# to UTF-8 before any path handling below runs.
# NOTE(review): this is a Python 2-only idiom -- reload(sys) is presumably
# needed to make sys.setdefaultencoding reachable again after start-up, and
# neither name is available as written on Python 3. Confirm the target
# interpreter before porting.
import sys
reload(sys)
sys.setdefaultencoding('utf8')


def remove_file_or_tree(p):
    """Delete the filesystem entry at ``p``, whatever its type.

    Regular files and symbolic links (including dangling links and links
    that point at directories) are unlinked; real directories are removed
    recursively.  A path that does not exist is silently ignored.

    Args:
        p: Path of the file, symbolic link or directory to delete.

    Raises:
        OSError: If the entry exists but cannot be removed.
    """
    # lexists() is also true for dangling symlinks, which exists() would skip.
    if not os.path.lexists(p):
        return

    # shutil.rmtree() refuses to operate on a symbolic link, so a link is
    # unlinked like a file even when it points at a directory; the link's
    # target is left untouched.
    if os.path.isdir(p) and not os.path.islink(p):
        shutil.rmtree(p)
    else:
        os.remove(p)


def get_paths_inside_archive(archive_path):
    """Return the member names stored in a .zip, .rar or .tar.gz archive.

    The archive type is selected from the file-name suffix.  The archive is
    opened only long enough to read its table of contents and is closed
    again before returning.

    Args:
        archive_path: Path of the archive file.

    Returns:
        The list of member names reported by the archive library.  An empty
        list is returned when the suffix is not one of the three handled
        ones, when the archive cannot be read (a "bad file" line is printed),
        or when a RAR file is not the first volume of its set (silently).
    """
    # The two spaces after the colon reproduce the output of the former
    # ``print "bad file: ", archive_path`` statement.
    bad_file_message = "bad file:  %s"

    if archive_path.endswith(".zip"):
        try:
            with zipfile.ZipFile(archive_path) as archive:
                return archive.namelist()
        except zipfile.BadZipfile:
            print(bad_file_message % archive_path)

    elif archive_path.endswith(".rar"):
        try:
            with rarfile.RarFile(archive_path) as archive:
                return archive.namelist()
        except rarfile.NotRarFile:
            print(bad_file_message % archive_path)
        except rarfile.NeedFirstVolume:
            # Later volumes of a multi-part archive are listed via the first.
            pass

    elif archive_path.endswith(".tar.gz"):
        try:
            with tarfile.open(archive_path, 'r:gz') as archive:
                return archive.getnames()
        except tarfile.TarError:
            print(bad_file_message % archive_path)

    return []


def targz_aware_splitext(path):
    """Split ``path`` into ``(stem, extension)``, keeping ".tar.gz" whole.

    Paths ending in ".tar.gz" yield that full double suffix as the
    extension; every other path is split exactly like ``os.path.splitext``.
    """
    double_suffix = ".tar.gz"
    if not path.endswith(double_suffix):
        return os.path.splitext(path)

    split_at = len(path) - len(double_suffix)
    return path[:split_at], path[split_at:]


def _is_strictly_inside(path, directory):
    """Return True when ``path`` normalises to a location below ``directory``.

    Both paths are made absolute and normalised first, so ``.`` entries,
    ``..`` components and absolute member names cannot make ``path`` refer
    to ``directory`` itself or to anything outside of it.  Symbolic links
    are not resolved.
    """
    directory = os.path.abspath(directory)
    path = os.path.abspath(path)
    # join(directory, "") appends a trailing separator, so "/a/bc" is not
    # mistaken for a child of "/a/b".
    return path != directory and path.startswith(os.path.join(directory, ""))


def delete_redundant_extracted_files(root_dir_of_search,
                                     size_threshold=50 * 1024 * 1024):
    """Interactively delete files that are still available inside archives.

    Walks ``root_dir_of_search``.  For every .zip, .rar or .tar.gz file
    found, each archive member is looked up in two places: directly next to
    the archive, and inside a sibling directory named after the archive
    without its extension.  If at least one existing candidate is larger
    than ``size_threshold`` bytes, the user is asked on the console whether
    all candidates (and the sibling directory) should be deleted; only the
    answer "y" triggers deletion.

    Candidates that would resolve to the archive's directory itself, to a
    location outside of it, or to the archive file are never considered, so
    unusual member names ("." , "../x", "/abs/path") cannot cause unrelated
    data to be removed.

    Args:
        root_dir_of_search: Directory tree to scan for archives.
        size_threshold: Minimum size in bytes that at least one candidate
            must exceed before the user is prompted.  Defaults to 50 MiB.
    """
    for dir_path, _subdirs, file_list in os.walk(root_dir_of_search):
        for filename in file_list:

            archive_basename, ext = targz_aware_splitext(filename)
            if ext not in (".zip", ".rar", ".tar.gz"):
                continue

            filepath = os.path.join(dir_path, filename)
            # The listing was taken before any deletion in this directory;
            # an archive removed as another archive's member is skipped.
            if not os.path.isfile(filepath):
                continue

            archive_abspath = os.path.abspath(filepath)
            extracted_root = os.path.join(dir_path, archive_basename)

            redundant_extracted_paths = []
            for filename_in_archive in get_paths_inside_archive(filepath):
                candidates = (
                    os.path.join(dir_path, filename_in_archive),
                    os.path.join(extracted_root, filename_in_archive),
                )
                for candidate in candidates:
                    if not _is_strictly_inside(candidate, dir_path):
                        continue
                    if os.path.abspath(candidate) == archive_abspath:
                        continue
                    if os.path.exists(candidate):
                        redundant_extracted_paths.append(candidate)

            if any(os.path.getsize(p) > size_threshold
                   for p in redundant_extracted_paths):
                if raw_input('Delete all redundant files extracted from ' + filepath + '? (y/n) ') == "y":
                    for extracted_path in redundant_extracted_paths:
                        remove_file_or_tree(extracted_path)

                    # An archive literally named ".tar.gz" has an empty
                    # basename, which would make extracted_root the
                    # containing directory; never delete that.
                    if _is_strictly_inside(extracted_root, dir_path):
                        remove_file_or_tree(extracted_root)

                print("\n")
	# encoding=utf8
	import os, zipfile, rarfile, tarfile, shutil

	# Avoid encoding problems: set the interpreter-wide default string encoding
	# to UTF-8 before any path handling below runs.
	# NOTE(review): this is a Python 2-only idiom -- reload(sys) is presumably
	# needed to make sys.setdefaultencoding reachable again after start-up, and
	# neither name is available as written on Python 3. Confirm the target
	# interpreter before porting.
	import sys
	reload(sys)
	sys.setdefaultencoding('utf8')


	def remove_file_or_tree(p):
		"""Delete the filesystem entry at ``p``, whatever its type.

		Regular files and symbolic links (including dangling links and links
		that point at directories) are unlinked; real directories are removed
		recursively.  A path that does not exist is silently ignored.

		Args:
			p: Path of the file, symbolic link or directory to delete.

		Raises:
			OSError: If the entry exists but cannot be removed.
		"""
		# lexists() is also true for dangling symlinks, which exists() would skip.
		if not os.path.lexists(p):
			return

		# shutil.rmtree() refuses to operate on a symbolic link, so a link is
		# unlinked like a file even when it points at a directory; the link's
		# target is left untouched.
		if os.path.isdir(p) and not os.path.islink(p):
			shutil.rmtree(p)
		else:
			os.remove(p)


	def get_paths_inside_archive(archive_path):
		"""Return the member names stored in a .zip, .rar or .tar.gz archive.

		The archive type is selected from the file-name suffix.  The archive is
		opened only long enough to read its table of contents and is closed
		again before returning.

		Args:
			archive_path: Path of the archive file.

		Returns:
			The list of member names reported by the archive library.  An empty
			list is returned when the suffix is not one of the three handled
			ones, when the archive cannot be read (a "bad file" line is
			printed), or when a RAR file is not the first volume of its set
			(silently).
		"""
		# The two spaces after the colon reproduce the output of the former
		# ``print "bad file: ", archive_path`` statement.
		bad_file_message = "bad file:  %s"

		if archive_path.endswith(".zip"):
			try:
				with zipfile.ZipFile(archive_path) as archive:
					return archive.namelist()
			except zipfile.BadZipfile:
				print(bad_file_message % archive_path)

		elif archive_path.endswith(".rar"):
			try:
				with rarfile.RarFile(archive_path) as archive:
					return archive.namelist()
			except rarfile.NotRarFile:
				print(bad_file_message % archive_path)
			except rarfile.NeedFirstVolume:
				# Later volumes of a multi-part archive are listed via the first.
				pass

		elif archive_path.endswith(".tar.gz"):
			try:
				with tarfile.open(archive_path, 'r:gz') as archive:
					return archive.getnames()
			except tarfile.TarError:
				print(bad_file_message % archive_path)

		return []


	def targz_aware_splitext(path):
		"""Split ``path`` into ``(stem, extension)``, keeping ".tar.gz" whole.

		Paths ending in ".tar.gz" yield that full double suffix as the
		extension; every other path is split exactly like ``os.path.splitext``.
		"""
		if path.endswith(".tar.gz"):
			return path[:-len(".tar.gz")], path[-len(".tar.gz"):]
		else:
			return os.path.splitext(path)


	def _is_strictly_inside(path, directory):
		"""Return True when ``path`` normalises to a location below ``directory``.

		Both paths are made absolute and normalised first, so ``.`` entries,
		``..`` components and absolute member names cannot make ``path`` refer
		to ``directory`` itself or to anything outside of it.  Symbolic links
		are not resolved.
		"""
		directory = os.path.abspath(directory)
		path = os.path.abspath(path)
		# join(directory, "") appends a trailing separator, so "/a/bc" is not
		# mistaken for a child of "/a/b".
		return path != directory and path.startswith(os.path.join(directory, ""))


	def delete_redundant_extracted_files(root_dir_of_search,
			size_threshold=50 * 1024 * 1024):
		"""Interactively delete files that are still available inside archives.

		Walks ``root_dir_of_search``.  For every .zip, .rar or .tar.gz file
		found, each archive member is looked up in two places: directly next to
		the archive, and inside a sibling directory named after the archive
		without its extension.  If at least one existing candidate is larger
		than ``size_threshold`` bytes, the user is asked on the console whether
		all candidates (and the sibling directory) should be deleted; only the
		answer "y" triggers deletion.

		Candidates that would resolve to the archive's directory itself, to a
		location outside of it, or to the archive file are never considered.

		Args:
			root_dir_of_search: Directory tree to scan for archives.
			size_threshold: Minimum size in bytes that at least one candidate
				must exceed before the user is prompted.  Defaults to 50 MiB.
		"""
		for dir_path, _subdirs, file_list in os.walk(root_dir_of_search):
			for filename in file_list:

				archive_basename, ext = targz_aware_splitext(filename)
				if ext not in (".zip", ".rar", ".tar.gz"):
					continue

				filepath = os.path.join(dir_path, filename)
				# The listing was taken before any deletion in this directory;
				# an archive removed as another archive's member is skipped.
				if not os.path.isfile(filepath):
					continue

				archive_abspath = os.path.abspath(filepath)
				extracted_root = os.path.join(dir_path, archive_basename)

				redundant_extracted_paths = []
				for filename_in_archive in get_paths_inside_archive(filepath):
					candidates = (
						os.path.join(dir_path, filename_in_archive),
						os.path.join(extracted_root, filename_in_archive),
					)
					for candidate in candidates:
						if not _is_strictly_inside(candidate, dir_path):
							continue
						if os.path.abspath(candidate) == archive_abspath:
							continue
						if os.path.exists(candidate):
							redundant_extracted_paths.append(candidate)

				if any(os.path.getsize(p) > size_threshold
						for p in redundant_extracted_paths):
					if raw_input('Delete all redundant files extracted from ' + filepath + '? (y/n) ') == "y":
						for extracted_path in redundant_extracted_paths:
							remove_file_or_tree(extracted_path)

						# An archive literally named ".tar.gz" has an empty
						# basename, which would make extracted_root the
						# containing directory; never delete that.
						if _is_strictly_inside(extracted_root, dir_path):
							remove_file_or_tree(extracted_root)

					print("\n")