Last active
August 3, 2016 11:31
-
-
Save dalgu90/6a0460412f41ebddc9251a709fa989ba to your computer and use it in GitHub Desktop.
Traverse dataset directories and check JPEG files, and print out filenames with invalid JPEG header.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
import sys | |
import os | |
import re | |
argv = sys.argv[1:] | |
if not argv: | |
sys.stderr.write('Usage: python check_dataset_valid.py [IMAGE_ROOT] [OUTPUT_FNAME]\n') | |
sys.exit(1) | |
image_root = argv[0] | |
output_fname = argv[1] | |
output_fd = open(output_fname, 'w') | |
jpeg_pattern = re.compile(".+(\\.jpg|\\.jpeg)$", flags=re.IGNORECASE) | |
for root, dnames, fnames in os.walk(image_root): | |
jpeg_fnames = [f for f in fnames if jpeg_pattern.match(f)] | |
print('Inspecting %s (%d images)' % (root, len(jpeg_fnames))) | |
for f in jpeg_fnames: | |
fpath = os.path.join(root, f) | |
with open(fpath, 'r') as fd: | |
data = fd.read(10) # Read the first 10 bytes | |
if len(data) < 10 or data[0] != '\xff' or data[1] != '\xd8': | |
print('\twrong JPG file!: %s' % fpath) | |
output_fd.write(fpath + '\n') | |
print('done') | |
output_fd.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment