Last active
July 15, 2019 19:09
-
-
Save MathiasGruber/2a2da1465ce5bc7021577afe806d4cfa to your computer and use it in GitHub Desktop.
Iterates through rootdir, finds corrupt images, and deletes them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gc
import multiprocessing
import os
import warnings

import numpy as np
from joblib import Parallel, delayed
from PIL import Image
from skimage import io
from tqdm import tqdm_notebook
# Directory tree that is scanned recursively; every file below it is checked.
rootdir = '/home/gruber/datasets/'

# Full path of every file found under rootdir, in os.walk order.
# No filtering by extension happens here, so non-image files are included.
all_files = []
for subdir, _dirs, files in os.walk(rootdir):
    all_files.extend(os.path.join(subdir, name) for name in files)
def check_file(fn):
    """Return True if the file at ``fn`` cannot be loaded as a clean RGB image.

    The file is checked in three stages: PIL's ``verify()``, a full decode to
    an RGB NumPy array whose shape is validated, and a read through
    ``skimage.io.imread``. Any exception raised along the way, including any
    warning (warnings are promoted to errors for the duration of the check),
    marks the file as corrupt.

    Args:
        fn: Path of the file to check.

    Returns:
        False if every stage succeeded, True otherwise. On failure a line of
        the form ``Issue with <fn> - <error>`` is printed.
    """
    try:
        with warnings.catch_warnings():
            # Turn warnings into exceptions so that a file that only triggers
            # a warning while loading is still reported as corrupt.
            warnings.filterwarnings('error')

            with Image.open(fn) as img:
                img.verify()

            # Opened a second time: per the Pillow docs the image must be
            # reopened after verify() before it can be loaded.
            with Image.open(fn) as img:
                shape = np.array(img.convert('RGB')).shape

            # Explicit raises instead of assert: assert statements are removed
            # under ``python -O``, which would silently disable these checks.
            if len(shape) != 3:
                raise ValueError('Image must have three channels')
            if shape[0] <= 1:
                raise ValueError('First dimension should be above 1')
            if shape[1] <= 1:
                raise ValueError('Second dimension should be above 1')
            if shape[2] != 3:
                raise ValueError('Third dimension should have three channels')

            # Read once more through scikit-image; only success matters, the
            # decoded array is discarded.
            io.imread(fn)
            gc.collect()
        return False
    except Exception as e:
        # Deliberately broad: any failure at all means the file is unusable.
        print('Issue with {} - {}'.format(fn, e))
        return True
# Use one joblib worker per CPU core reported by the OS.
num_cores = multiprocessing.cpu_count()

# is_corrupt[i] is check_file's verdict for all_files[i]; the list is built
# from all_files in order, with tqdm_notebook showing progress.
is_corrupt = Parallel(n_jobs=num_cores)(
    delayed(check_file)(fn) for fn in tqdm_notebook(all_files)
)

# Delete every file flagged as corrupt. The two lists are walked in lockstep
# rather than converting all paths to a NumPy array just to apply a mask.
for fn, corrupt in zip(all_files, is_corrupt):
    if not corrupt:
        continue
    if os.path.exists(fn):
        print("Removing ", fn)
        os.remove(fn)
    else:
        print("already removed", fn)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment