Skip to content

Instantly share code, notes, and snippets.

@MathiasGruber
Last active July 15, 2019 19:09
Show Gist options
  • Save MathiasGruber/2a2da1465ce5bc7021577afe806d4cfa to your computer and use it in GitHub Desktop.
Save MathiasGruber/2a2da1465ce5bc7021577afe806d4cfa to your computer and use it in GitHub Desktop.
Iterates through rootdir, finds corrupt images, and deletes them
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm_notebook
from skimage import io
from PIL import Image
import os
import gc
import numpy as np
import warnings
# Root directory that is scanned recursively.
rootdir = '/home/gruber/datasets/'

# Full path of every file found anywhere below rootdir. os.walk yields one
# (directory, sub-directory names, file names) triple per directory; only the
# directory and its file names are needed to build the paths.
all_files = [
    os.path.join(subdir, name)
    for subdir, _dirs, files in os.walk(rootdir)
    for name in files
]
def check_file(fn):
    """Return True when the file at ``fn`` fails any image validation step.

    The file is checked in three stages while every warning is promoted to
    an exception, so a file that merely triggers a warning is flagged too:

    1. ``Image.verify()`` on a freshly opened PIL image.
    2. A decode to RGB through PIL, converted to a numpy array whose shape
       is then validated.
    3. A read through ``skimage.io.imread``.

    Args:
        fn: Path of the file to check.

    Returns:
        False if every stage succeeded; True if any exception (or warning)
        was raised, in which case the reason is printed.

    Note:
        Every ``Exception`` is treated as "problematic file", including
        errors that are unrelated to the file's content. Keep that in mind
        before acting destructively on the result.
    """
    try:
        with warnings.catch_warnings():
            # Turn warnings into exceptions for the duration of the checks.
            warnings.filterwarnings('error')

            with Image.open(fn) as img:
                img.verify()

            # Pillow documents that an image must be reopened after
            # verify() before it can be loaded, hence the second open.
            with Image.open(fn) as img:
                pixels = np.array(img.convert('RGB'))

            # Explicit raises instead of assert: assert statements are
            # stripped under ``python -O`` and would silently skip the checks.
            shape = pixels.shape
            if len(shape) != 3:
                raise ValueError('Image must have three channels')
            if not shape[0] > 1:
                raise ValueError('First dimension should be above 1')
            if not shape[1] > 1:
                raise ValueError('Second dimension should be above 1')
            if shape[2] != 3:
                raise ValueError('Third dimension should have three channels')

            # Second opinion from scikit-image; only success/failure matters,
            # so the decoded array is discarded.
            io.imread(fn)

        # NOTE(review): presumably here to keep per-worker memory down when
        # many large images are decoded back to back - confirm it is needed.
        gc.collect()
        return False
    except Exception as e:
        print('Issue with {} - {}'.format(fn, e))
        return True
# Check every collected file in parallel, one worker per CPU core.
# is_corrupt[i] is the check_file() result for all_files[i].
num_cores = multiprocessing.cpu_count()
is_corrupt = Parallel(n_jobs=num_cores)(
    delayed(check_file)(fn) for fn in tqdm_notebook(all_files)
)

# Delete every flagged file. Paths and flags are paired directly instead of
# going through a numpy boolean mask, and the removal is attempted first
# (rather than testing os.path.exists beforehand) so a file that vanishes in
# between is reported instead of raising.
for fn, corrupt in zip(all_files, is_corrupt):
    if not corrupt:
        continue
    try:
        os.remove(fn)
    except FileNotFoundError:
        print("already removed", fn)
    else:
        print("Removing ", fn)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment