Skip to content

Instantly share code, notes, and snippets.

@Coderx7
Created January 12, 2017 18:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Coderx7/b43a206b20ff4765a1b28fe997f1137c to your computer and use it in GitHub Desktop.
Save Coderx7/b43a206b20ff4765a1b28fe997f1137c to your computer and use it in GitHub Desktop.
CIFAR10-lmdb-zeropad-normalize script for caffe
#in the name of God, the most compassionate the most merciful
#Seyyed Hossein Hasanpour
#Coderx7@Gmail.com
#script for zeropadding and normalizing CIFAR10 dataset (can also be used for CIFAR100)
import math
import caffe
import lmdb
import numpy as np
from caffe.proto import caffe_pb2
import cv2
import sys
########## mean-std code ############
#use this for large datasets, where numpy.mean() fails
#since all of the dataset can not fit in the memory
def mean_semi_vectorized(a):
    """Per-channel mean of a 4D (batch, channel, H, W) array.

    Accumulates one channel-slice at a time instead of calling
    np.mean over the whole array, keeping peak memory low on
    datasets too large to reduce in one shot.

    Args:
        a: numpy array of shape (batch, channel, H, W).

    Returns:
        1D numpy array of length `channel` holding each channel's mean.
    """
    batch, channel, dim_h, dim_w = a.shape
    # Avoid shadowing the builtin `sum`; one accumulator per channel.
    totals = np.zeros(channel)
    for img in range(batch):
        for ch in range(channel):
            totals[ch] += np.sum(a[img, ch, :, :])
    return totals / (dim_h * dim_w * batch)
#semi-vectorized, very fast.
#use this if you face memory shortage errors because of your dataset being
#too big for your memory
def std_semi_vectorized(a):
    """Per-channel population std of a 4D (batch, channel, H, W) array.

    Two semi-vectorized passes (mean, then squared deviations) so the
    whole array never has to be reduced in memory at once. Uses the
    population formula (ddof=0), matching np.std's default.

    Args:
        a: numpy array of shape (batch, channel, H, W).

    Returns:
        List of per-channel standard deviations, each rounded to
        8 decimal places (kept from the original behavior).
    """
    batch, channel, dim_h, dim_w = a.shape
    mean = mean_semi_vectorized(a)
    # Avoid shadowing the builtin `sum`; accumulate squared deviations.
    sq_dev = np.zeros(channel)
    for img in range(batch):
        for ch in range(channel):
            # values are real, so abs() before squaring is unnecessary
            sq_dev[ch] += np.sum((a[img, ch, :, :] - mean[ch]) ** 2)
    variance = sq_dev / (dim_h * dim_w * batch)
    return [round(math.sqrt(v), 8) for v in variance]
########## Actual Code ############
# Open the source LMDBs produced by caffe's CIFAR10 conversion tool.
# NOTE(review): these transactions are only ever read from below, so a
# read-only txn (the default, write=False) would suffice; kept as-is.
db_train = lmdb.open('cifar10_train_lmdb')
db_train_txn = db_train.begin(write=True)
db_test = lmdb.open('cifar10_test_lmdb')
db_test_txn = db_test.begin(write=True)

datum = caffe_pb2.Datum()

# Dataset sizes for CIFAR10 (adjust here for CIFAR100 etc.).
size_train = 50000
size_test = 10000

# Preallocate NCHW buffers: 3-channel 32x32 images, integer labels.
# (The unused `index = sys.argv[0]` line was removed: argv[0] is the
# script path, not a command-line argument, and it was never read.)
data_train = np.zeros((size_train, 3, 32, 32))
label_train = np.zeros(size_train, dtype=int)
data_test = np.zeros((size_test, 3, 32, 32))
label_test = np.zeros(size_test, dtype=int)
# Read up to size_train records out of the training LMDB into the
# preallocated arrays. print(...) is used so the script is valid on
# both Python 2 and 3.
print('Reading training data...')
for i, (key, value) in enumerate(db_train_txn.cursor()):
    if i % 1000 == 0:
        print(i)  # progress indicator
    if i == size_train:
        # stop if the DB holds more records than expected
        break
    datum.ParseFromString(value)
    data_train[i] = caffe.io.datum_to_array(datum)
    label_train[i] = datum.label
# Read up to size_test records out of the test LMDB, mirroring the
# training-set loop above.
print('Reading test data...')
for i, (key, value) in enumerate(db_test_txn.cursor()):
    if i % 1000 == 0:
        print(i)  # progress indicator
    if i == size_test:
        # stop if the DB holds more records than expected
        break
    datum.ParseFromString(value)
    data_test[i] = caffe.io.datum_to_array(datum)
    label_test[i] = datum.label
print('Computing statistics...')
# Per-channel mean/std over all training images and pixels
# (reduce over batch, H, W — leaving the channel axis).
mean = np.mean(data_train, axis=(0, 2, 3))
std = np.std(data_train, axis=(0, 2, 3))
print(mean)
print(std)
print(mean.shape)
print(std.shape)
#np.savetxt('mean_mnist.txt', mean)
#np.savetxt('std_mnist.txt', std)

print('Normalizing')
# Broadcast the per-channel statistics over (batch, H, W) instead of
# looping channel by channel. The test set is deliberately normalized
# with the *training* statistics.
mean_bc = mean.reshape(1, 3, 1, 1)
std_bc = std.reshape(1, 3, 1, 1)
data_train = (data_train - mean_bc) / std_bc
data_test = (data_test - mean_bc) / std_bc

# Zero-pad 4 pixels on each side of H and W (32x32 -> 40x40), the
# usual padding setup for CIFAR random-crop augmentation.
npad = ((0, 0), (0, 0), (4, 4), (4, 4))
data_train = np.pad(data_train, pad_width=npad, mode='constant', constant_values=0)
data_test = np.pad(data_test, pad_width=npad, mode='constant', constant_values=0)
print('Outputting training data')
lmdb_file = 'cifar10_train_lmdb_norm2'
batch_size = size_train  # one commit for the whole set
# BUGFIX: map_size equal to nbytes leaves no headroom for LMDB keys and
# B-tree page overhead and can abort with MDB_MAP_FULL. map_size is only
# an upper bound (sparse file), so over-allocating is safe.
db = lmdb.open(lmdb_file, map_size=int(data_train.nbytes * 10))
batch = db.begin(write=True)
datum = caffe_pb2.Datum()
for i in range(size_train):
    if i % 1000 == 0:
        print(i)  # progress indicator
    # serialize one (image, label) pair; cast the numpy label to int for
    # the protobuf setter
    datum = caffe.io.array_to_datum(data_train[i], int(label_train[i]))
    keystr = '{:0>5d}'.format(i)
    # ascii-encoded bytes key: required on Python 3, identical on Python 2
    batch.put(keystr.encode('ascii'), datum.SerializeToString())
    # commit every batch_size records (with batch_size == size_train this
    # fires exactly once, at the end)
    if (i + 1) % batch_size == 0:
        batch.commit()
        batch = db.begin(write=True)
        print(i + 1)
# commit whatever is left if the loop ended mid-batch
if (i + 1) % batch_size != 0:
    batch.commit()
    print('last batch')
    print(i + 1)
print('Outputting test data')
lmdb_file = 'cifar10_test_lmdb_norm2'
batch_size = size_test  # one commit for the whole set
# BUGFIX: as for the training DB, give map_size headroom beyond nbytes
# so LMDB key/page overhead cannot trigger MDB_MAP_FULL.
db = lmdb.open(lmdb_file, map_size=int(data_test.nbytes * 10))
batch = db.begin(write=True)
datum = caffe_pb2.Datum()
for i in range(size_test):
    # serialize one (image, label) pair; cast the numpy label to int for
    # the protobuf setter
    datum = caffe.io.array_to_datum(data_test[i], int(label_test[i]))
    keystr = '{:0>5d}'.format(i)
    # ascii-encoded bytes key: required on Python 3, identical on Python 2
    batch.put(keystr.encode('ascii'), datum.SerializeToString())
    # commit every batch_size records (with batch_size == size_test this
    # fires exactly once, at the end)
    if (i + 1) % batch_size == 0:
        batch.commit()
        batch = db.begin(write=True)
        print(i + 1)
# commit whatever is left if the loop ended mid-batch
if (i + 1) % batch_size != 0:
    batch.commit()
    print('last batch')
    print(i + 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment