@daniellivingston
Created June 25, 2022 15:26
Selectively unarchive ZIP file members based on user-defined criteria
"""
FILTERED UNZIP
A simple script that *only* extracts files from a .zip archive
that match user-defined extraction criteria (filesize,
extension, date, etc.).
In addition, a live progress bar shows extraction progress,
and kept / filtered files are written to CSV files for
post-extraction analysis.
User-editable parameters are in ALL CAPS.
To add filtering constraints, see the `while` loop in the
`filtered_extract()` function.
"""
import os
from pathlib import Path
from zipfile import ZipFile
from rich.progress import Progress  # third-party dependency: pip install rich
SRC_ZIPFILE = '/Users/WHOAMI/Downloads/Archive.zip'
DEST_DIRECTORY = Path('/Users/WHOAMI/Desktop/UnzipResults/')
MIN_FILESIZE = 100 # kB
VALID_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".mp4",
    ".zip",
    ".tar",
    ".heic",
    ".gz",
}
def extract_files(zipfile, infolist, destination_dir=os.getcwd()):
    Path(destination_dir).mkdir(parents=True, exist_ok=True)  # create if not exists
    with Progress(transient=True) as progress:
        task = progress.add_task("Extracting...", total=len(infolist))
        for file in infolist:
            progress.console.print(f"Extracting file '{file.filename}'")
            zipfile.extract(file, destination_dir)
            progress.advance(task)
def filtered_extract(zipfile, csv_delim=","):
    infolist = zipfile.infolist()
    min_filesize_bytes = MIN_FILESIZE * 1000
    discarded_files = []
    kept_files = []
    while infolist:
        file = infolist.pop()
        # === archived file filtering section === #
        if file.is_dir():
            continue
        if file.file_size < min_filesize_bytes:
            discarded_files.append(file)
            continue
        if os.path.splitext(file.filename)[1].lower() not in VALID_EXTENSIONS:
            discarded_files.append(file)
            continue
        # === add additional filtering mechanisms above === #
        kept_files.append(file)
    total_files = len(kept_files) + len(discarded_files)
    logging_info = (
        ('files_kept.log', kept_files),
        ('files_discarded.log', discarded_files)
    )
    # ensure the destination exists: the logs are written there
    # *before* extract_files() creates it
    DEST_DIRECTORY.mkdir(parents=True, exist_ok=True)
    for (logfile, filelist) in logging_info:
        with (DEST_DIRECTORY / logfile).open('w') as f:
            f.write(f"# FILES: {len(filelist)} of {total_files} files\n")
            f.write(f"Filename{csv_delim}Date Created{csv_delim}Filesize (kB)\n")  # CSV header
            for file in filelist:
                filename = f'"{file.filename.strip()}"'
                date = f"{file.date_time[1]}/{file.date_time[2]}/{file.date_time[0]}"  # M/D/Y
                size = f"{file.file_size / 1000.}"
                f.write(f"{filename}{csv_delim}{date}{csv_delim}{size}\n")
    extract_files(zipfile, kept_files, destination_dir=DEST_DIRECTORY)
with ZipFile(SRC_ZIPFILE) as archive:  # renamed from `zip` to avoid shadowing the builtin
    archive.debug = 3
    filtered_extract(archive)
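
As the docstring notes, extra constraints slot into the `while` loop in `filtered_extract()`, alongside the size and extension checks. One plausible addition is a modification-date cutoff; a minimal sketch, where `MIN_DATE` is a hypothetical constant not part of the gist:

from datetime import datetime

MIN_DATE = datetime(2020, 1, 1)  # hypothetical cutoff: discard anything archived before this

# inside the `while` loop, just above the
# "add additional filtering mechanisms" marker:
if datetime(*file.date_time) < MIN_DATE:  # ZipInfo.date_time is a (Y, M, D, h, m, s) tuple
    discarded_files.append(file)
    continue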
@daniellivingston (Author):
Wrote this to deal with a large (155 GB compressed) archive, where I only wanted to extract photos and videos.

I was able to extract 50,547 files totaling 151.63 GB uncompressed in 5 minutes and 28 seconds on a 2021 MacBook Pro Max.

Quite fast for me, but the limiting factor for extraction time will be the read/write speed of your hard disk.
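
Totals like those above can be re-derived from the files_kept.log the script writes. A minimal sketch, assuming the same placeholder DEST_DIRECTORY path as in the script:

import csv
from pathlib import Path

LOG = Path('/Users/WHOAMI/Desktop/UnzipResults/files_kept.log')

with LOG.open() as f:
    next(f)  # skip the "# FILES: n of m files" summary line
    rows = list(csv.DictReader(f))

total_gb = sum(float(row['Filesize (kB)']) for row in rows) / 1e6
print(f"{len(rows)} files kept, {total_gb:.2f} GB uncompressed")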
