Skip to content

Instantly share code, notes, and snippets.

@georgy7
Last active November 27, 2015 02:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save georgy7/82738ca1c98650fa9725 to your computer and use it in GitHub Desktop.
Save georgy7/82738ca1c98650fa9725 to your computer and use it in GitHub Desktop.
Scripts for training https://github.com/karpathy/neuraltalk2 on my own data. Just run them from a folder of tagged JPEG images; by "tagged" I mean the keywords stored in the file properties (the EXIF XPKeywords field).
# pip install exifread
import exifread
import array
import string
import glob
import json
# Key under which tags() looks up the keywords in the dict returned by
# exifread.process_file().
KEYWORDS = 'Image XPKeywords'
# Name of the JSON file written by save(); the training script passes the
# same name to prepro.py as --input_json.
OUTPUT = 'neuraltalk.json'
def filter_str(str):
    """Reduce a raw tag to printable ASCII and trim surrounding spaces.

    Every character outside ``string.printable`` is dropped, and so are the
    whitespace control characters (tab, newline, carriage return, vertical
    tab, form feed). The ordinary space is kept inside the text.

    Args:
        str: The text to clean. The name shadows the builtin, but it is kept
            so that existing keyword-argument callers keep working.

    Returns:
        The cleaned text without leading or trailing spaces; may be empty.
    """
    # '\x0b' and '\x0c' are part of string.printable, so they have to be
    # excluded explicitly or they would end up inside a caption.
    allowed = set(string.printable) - set('\t\n\r\x0b\x0c')
    return ''.join(ch for ch in str if ch in allowed).strip()
def tags(filename):
    """Read the keywords stored in the EXIF data of an image file.

    The values of the ``KEYWORDS`` tag are treated as the bytes of one
    UTF-16 string in which the individual keywords are separated by ';'.
    Each keyword is cleaned with ``filter_str``.

    Args:
        filename: Path of the image file to inspect.

    Returns:
        A list of non-empty keyword strings. The list is empty when the file
        carries no keywords or cannot be processed; a message is printed in
        both cases and no exception is propagated.
    """
    try:
        # The context manager closes the handle even when parsing fails.
        with open(filename, 'rb') as f:
            exif = exifread.process_file(f)
        # bytearray() replaces array.array('B', ...).tostring(), which no
        # longer exists since Python 3.9. Windows writes the XP* tags
        # little-endian, so the byte order is named instead of relying on
        # the host's native order.
        raw = bytearray(exif[KEYWORDS].values).decode('utf-16-le')
    except KeyError:
        print(filename, 'has no keywords.')
        return []
    except Exception as e:
        # Deliberately best-effort: one unreadable image must not abort the
        # whole run, but the real cause is reported instead of being hidden
        # behind the "no keywords" message.
        print(filename, 'could not be read:', e)
        return []
    cleaned = (filter_str(part) for part in raw.split(';'))
    # Doubled or trailing separators and the string terminator would
    # otherwise yield empty captions.
    return [keyword for keyword in cleaned if keyword]
def save(arr):
    """Write *arr* as pretty-printed JSON to the ``OUTPUT`` file.

    Args:
        arr: A JSON-serializable object; ``main`` passes a list of dicts
            with the keys ``file_path`` and ``captions``.

    Raises:
        OSError: If ``OUTPUT`` cannot be opened for writing.
        TypeError: If *arr* contains something ``json`` cannot serialize.
    """
    # "with" guarantees the file is closed even if serialization fails
    # half-way through.
    with open(OUTPUT, 'w') as f:
        json.dump(arr, f, sort_keys=True, indent=2, separators=(',', ': '))
    print('Finished.')
def main():
    """Collect the keywords of every JPEG in the current directory and save them.

    Files matching ``*.jpg`` or ``*.jpeg`` (in any letter case) are
    inspected with ``tags``; files without keywords are left out of the
    result that is handed to ``save``.
    """
    jpg_names = glob.glob('*.[jJ][pP][gG]')
    jpeg_names = glob.glob('*.[jJ][pP][eE][gG]')
    entries = []
    for name in jpg_names + jpeg_names:
        captions = tags(name)
        if not captions:
            # Nothing to caption this image with.
            continue
        entries.append({'file_path': name, 'captions': captions})
    save(entries)


if __name__ == "__main__":
    main()
# Preprocesses neuraltalk.json with neuraltalk2's prepro.py (unless both
# outputs already exist) and then starts train.lua on the result.
# Run from the folder that holds the images and neuraltalk.json.
# Requires a readlink that supports -f (e.g. GNU coreutils).
NEUROTALK=../neuraltalk2
JSON_OUTPUT=neuraltalk_output.json
H5_OUTPUT=neuraltalk.h5
CHECKPOINTS_OUTPUT=./checkpoints

# Making the paths absolute, because train.lua is started from $NEUROTALK.
# All expansions are quoted so that paths containing spaces survive.
JSON_OUTPUT=$(readlink -f "$JSON_OUTPUT")
H5_OUTPUT=$(readlink -f "$H5_OUTPUT")
CHECKPOINTS_OUTPUT=$(readlink -f "$CHECKPOINTS_OUTPUT")
START_FOLDER=$(readlink -f .)
CNN_MODEL=$(readlink -f "$NEUROTALK/model/VGG_ILSVRC_16_layers.caffemodel")
CNN_PROTO=$(readlink -f "$NEUROTALK/model/VGG_ILSVRC_16_layers_deploy.prototxt")

# Both outputs belong together: if either one is missing, rebuild both.
if [ ! -f "$H5_OUTPUT" ] || [ ! -f "$JSON_OUTPUT" ]; then
    rm -f "$H5_OUTPUT"
    rm -f "$JSON_OUTPUT"
    # Stop here on failure instead of training on missing input files.
    python "$NEUROTALK/prepro.py" \
        --input_json neuraltalk.json \
        --num_val 5000 --num_test 5000 \
        --images_root . \
        --word_count_threshold 5 \
        --output_json "$JSON_OUTPUT" \
        --output_h5 "$H5_OUTPUT" || exit 1
fi

mkdir -p "$CHECKPOINTS_OUTPUT"

# Without this check a failed cd would run "th train.lua" in the wrong folder.
cd "$NEUROTALK" || exit 1
th train.lua -input_h5 "$H5_OUTPUT" -input_json "$JSON_OUTPUT" -checkpoint_path "$CHECKPOINTS_OUTPUT" \
    -gpuid -1 \
    -cnn_model "$CNN_MODEL" \
    -cnn_proto "$CNN_PROTO"
cd "$START_FOLDER"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment