Created
June 25, 2022 15:26
-
-
Save daniellivingston/b6068e3daf2e4a9b46dc6bd4a6af37f0 to your computer and use it in GitHub Desktop.
Selectively unarchive ZIP file members based on user-defined criteria
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
FILTERED UNZIP | |
A simple script that *only* extracts files from a .zip archive | |
that match user-defined extraction criteria (filesize, | |
extension, date, etc.). | |
In addition, a live progress bar shows extraction progress, | |
and kept / filtered files are written to CSV files for | |
post-extraction analysis. | |
User-editable parameters are in ALL CAPS. | |
To add filtering constraints, see the `while` loop in the | |
`filtered_extract()` function. | |
""" | |
import os | |
from pathlib import Path | |
from zipfile import ZipFile | |
from rich.progress import Progress | |
# --- User-editable configuration -------------------------------------------

# Path to the source archive; replace WHOAMI with your username.
SRC_ZIPFILE = '/Users/WHOAMI/Downloads/Archive.zip'

# Directory that extracted files and the CSV log files are written to.
DEST_DIRECTORY = Path('/Users/WHOAMI/Desktop/UnzipResults/')

# Archive members smaller than this are discarded.
MIN_FILESIZE = 100  # kB

# Only members whose (lowercased) extension is in this set are kept.
# Set literal instead of set([...]) — same contents, no throwaway list.
VALID_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".mp4",
    ".zip",
    ".tar",
    ".heic",
    ".gz",
}
def extract_files(zipfile, infolist, destination_dir=None):
    """Extract the given archive members, showing a live progress bar.

    Args:
        zipfile: An open ``ZipFile`` to extract from.
        infolist: Sequence of ``ZipInfo`` members to extract.
        destination_dir: Target directory (created if missing). Defaults to
            the current working directory *at call time*.
    """
    # BUG FIX: the original signature used ``destination_dir=os.getcwd()``,
    # which is evaluated once at import time; resolve the default lazily.
    if destination_dir is None:
        destination_dir = os.getcwd()
    Path(destination_dir).mkdir(parents=True, exist_ok=True)  # create if not exists
    with Progress(transient=True) as progress:
        task = progress.add_task("Extracting...", total=len(infolist))
        for file in infolist:
            progress.console.print(f"Extracting file '{file.filename}'")
            zipfile.extract(file, destination_dir)
            progress.advance(task)
def filtered_extract(zipfile, csv_delim=","):
    """Filter archive members, log the decisions, and extract the survivors.

    Args:
        zipfile: An open ``ZipFile`` to filter and extract.
        csv_delim: Delimiter used in the two CSV log files.

    Writes ``files_kept.log`` and ``files_discarded.log`` (CSV format) into
    ``DEST_DIRECTORY``, then extracts the kept members there.
    """
    infolist = zipfile.infolist()
    min_filesize_bytes = MIN_FILESIZE * 1000  # MIN_FILESIZE is given in kB
    discarded_files = []
    kept_files = []
    while infolist:
        file = infolist.pop()
        # === archived file filtering section === #
        if file.is_dir():
            continue  # directories are recreated implicitly on extraction
        if (file.file_size < min_filesize_bytes):
            discarded_files.append(file)
            continue
        if (os.path.splitext(file.filename)[1].lower() not in VALID_EXTENSIONS):
            discarded_files.append(file)
            continue
        # === add additional filtering mechanisms above === #
        kept_files.append(file)
    total_files = len(kept_files) + len(discarded_files)
    logging_info = (
        ('files_kept.log', kept_files),
        ('files_discarded.log', discarded_files)
    )
    for (logfile, filelist) in logging_info:
        with Path(DEST_DIRECTORY / logfile).open('w') as f:
            f.write(f"# FILES: {len(filelist)} of {total_files} files\n")
            f.write(f"Filename{csv_delim}Date Created{csv_delim}Filesize (kB)\n")  # CSV Header
            for file in filelist:
                filename = f'"{file.filename.strip()}"'
                # date_time is (year, month, day, h, m, s) -> M/D/YYYY
                date = f"{file.date_time[1]}/{file.date_time[2]}/{file.date_time[0]}"
                size = f"{file.file_size / 1000.}"
                # BUG FIX: the original wrote the literal string "(unknown)"
                # in the filename column instead of the quoted name above.
                f.write(f"{filename}{csv_delim}{date}{csv_delim}{size}\n")
    extract_files(zipfile, kept_files, destination_dir=DEST_DIRECTORY)
# Script entry point: open the source archive and run the filtered
# extraction. The context variable is named ``archive`` rather than ``zip``
# so the builtin ``zip`` is not shadowed.
with ZipFile(SRC_ZIPFILE) as archive:
    archive.debug = 3  # maximum zipfile diagnostic output
    filtered_extract(archive)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I wrote this to deal with a large (155 GB compressed) archive, from which I only wanted to extract photos and videos.
I was able to extract 50,547 files totaling 151.63 GB uncompressed in 5 minutes and 28 seconds on a 2021 MacBook Pro Max.
Quite fast for me, but the limiting factor for extraction time will be the read/write speed of your hard disk.