Skip to content

Instantly share code, notes, and snippets.

@dalgu90
Last active August 3, 2016 11:31
Show Gist options
  • Save dalgu90/6a0460412f41ebddc9251a709fa989ba to your computer and use it in GitHub Desktop.
Save dalgu90/6a0460412f41ebddc9251a709fa989ba to your computer and use it in GitHub Desktop.
Traverse the dataset directories, check every JPEG file, and print the names of files that have an invalid JPEG header.
from __future__ import print_function
import sys
import os
import re
# Matches file names ending in .jpg or .jpeg, case-insensitively.
jpeg_pattern = re.compile(r".+(\.jpg|\.jpeg)$", flags=re.IGNORECASE)

# Every JPEG stream starts with the two-byte Start Of Image marker.
JPEG_SOI_MARKER = b'\xff\xd8'

# Files shorter than this many bytes are reported as invalid.
MIN_HEADER_BYTES = 10


def has_valid_jpeg_header(fpath):
    """Return True if the file at *fpath* starts like a JPEG file.

    The file must contain at least ``MIN_HEADER_BYTES`` bytes and begin
    with the JPEG Start Of Image marker (0xFF 0xD8).

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    # Binary mode is required: text mode would try to decode the image
    # bytes (Python 3) or translate line endings (Windows).
    with open(fpath, 'rb') as fd:
        data = fd.read(MIN_HEADER_BYTES)
    return len(data) >= MIN_HEADER_BYTES and data[:2] == JPEG_SOI_MARKER


def check_directory_tree(image_root, output_fd):
    """Walk *image_root* and report every JPEG file with a bad header.

    For each directory a progress line is printed.  The path of each
    invalid (or unreadable) JPEG file is printed and written to
    *output_fd*, one path per line.

    Returns:
        The number of files reported.
    """
    bad_count = 0
    for root, _dnames, fnames in os.walk(image_root):
        jpeg_fnames = [f for f in fnames if jpeg_pattern.match(f)]
        print('Inspecting %s (%d images)' % (root, len(jpeg_fnames)))
        for f in jpeg_fnames:
            fpath = os.path.join(root, f)
            try:
                valid = has_valid_jpeg_header(fpath)
            except (IOError, OSError) as err:
                # A file that cannot be read (broken symlink, permission
                # problem, ...) is unusable too; report it instead of
                # aborting the whole scan.
                print('\tcannot read file (%s)' % err)
                valid = False
            if not valid:
                print('\twrong JPG file!: %s' % fpath)
                output_fd.write(fpath + '\n')
                bad_count += 1
    return bad_count


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument list ``[IMAGE_ROOT, OUTPUT_FNAME]``; defaults to
            ``sys.argv[1:]``.

    Returns:
        The process exit status: 0 on success, 1 on a usage error.
    """
    if argv is None:
        argv = sys.argv[1:]
    # Both positional arguments are required; checking only for an empty
    # list would crash with IndexError when OUTPUT_FNAME is missing.
    if len(argv) < 2:
        sys.stderr.write('Usage: python check_dataset_valid.py [IMAGE_ROOT] [OUTPUT_FNAME]\n')
        return 1
    image_root = argv[0]
    output_fname = argv[1]
    # The context manager closes the report file even if the scan fails.
    with open(output_fname, 'w') as output_fd:
        check_directory_tree(image_root, output_fd)
    print('done')
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment