A script to extract images from .zip, .docx, .xlsx, and .pptx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import os | |
import shutil | |
import logging | |
import zipfile | |
import argparse | |
import tempfile | |
from pathlib import Path | |
def extract_images(filepath, destination):
    """Extract the image files stored in a zip-based archive.

    Works on any file that ``zipfile.is_zipfile`` accepts (plain .zip as
    well as .docx, .xlsx and .pptx, which are zip containers).  Matching
    members are unpacked into a throw-away temporary directory and then
    copied, flat, into ``destination``.

    Args:
        filepath: Path (or path string) of the archive to read.
        destination: Existing directory that receives the images.

    Returns:
        ``[image_count, total_uncompressed_bytes]`` on success, or a
        one-element list ``[suffix]`` holding the file's extension when
        the file is not a zip archive.
    """
    filepath = Path(filepath)
    destination_path = Path(destination)

    temp_path = tempfile.gettempdir()
    logging.info('The default path for temporary directories and files is {}'.format(temp_path))

    if not zipfile.is_zipfile(filepath):
        logging.info('File is a {}'.format(filepath.suffix))
        logging.error('File cannot be accessed, must have the following extensions: zip, docx, xlsx, pptx')
        return [filepath.suffix]

    # Compared against the lower-cased member name so "PHOTO.JPG" matches too.
    image_suffixes = ('.png', '.jpeg', '.jpg', '.gif', '.bmp', '.tif', '.tiff')
    overall_size = 0

    # The archive is opened once and closed by the context manager; the
    # temporary directory (and everything extracted into it) is removed
    # when the block exits, so nothing is left behind in the working dir.
    with tempfile.TemporaryDirectory() as working_dir, zipfile.ZipFile(filepath) as archive:
        logging.info('Created temporal directory "{}"'.format(Path(working_dir).name))

        image_entries = [
            info for info in archive.infolist()
            if not info.is_dir() and info.filename.lower().endswith(image_suffixes)
        ]

        for info in image_entries:
            # extract() returns the real on-disk path (member names may be
            # sanitised), so copy from that rather than from the raw name.
            extracted = archive.extract(info, path=working_dir)
            logging.info('Extracted {}'.format(info.filename))
            # Copying into a directory keeps only the base name, so members
            # that share a base name overwrite one another in destination.
            shutil.copy(extracted, destination_path)
            logging.info('Copied {}'.format(info.filename))
            overall_size += info.file_size

        logging.info('Extracted all image files')
        logging.info('Copied all image files to {}'.format(destination_path.resolve()))

    return [len(image_entries), overall_size]
def main():
    """Command-line entry point: parse arguments and run the extraction.

    Takes a positional ``filepath`` and an optional ``-d/--destination``
    (defaulting to the current directory), checks that the file and the
    directory exist, calls ``extract_images`` and prints a one-line
    summary of the outcome.
    """
    parser = argparse.ArgumentParser(
        prog='image_mod.py',
        description='Extracts images from zip, docx, xlsx, and pptx files',
    )
    parser.add_argument('filepath')
    parser.add_argument('-d', '--destination', default='.')
    args = parser.parse_args()

    # An explicitly empty destination means there is nothing to do.
    if not args.destination:
        return

    source = Path(args.filepath)
    target = Path(args.destination)

    if not (source.is_file() and target.is_dir()):
        print('Operation failed. Either the file you provided or the destination directory doesn\'t exist.')
        logging.info('Filename or destination directory doesn\'t exist')
        logging.error('File: {}; Directory: {}'.format(source.exists(), target.exists()))
        return

    outcome = extract_images(source, target)

    # A two-element result is [count, bytes]; anything else is [suffix].
    if len(outcome) != 2:
        print('Operation failed. File type {} not supported.'.format(outcome[0]))
        logging.info('Operation failed')
        return

    image_count = outcome[0]
    size_in_kb = outcome[1] / 1024
    print('Operation completed successfully. {} images were extracted ({:,.2f}KB total)'.format(image_count, size_in_kb))
    logging.info('Operation successful')
if __name__ == "__main__":
    # Logging is configured only when run as a script, so importing this
    # module never creates or writes to the log file.
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        filename='common_tasks.log',
    )
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment