Data generator function for the comma.ai dataset
from __future__ import absolute_import
from __future__ import print_function
# from keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import numpy as np
from scipy import misc
import matplotlib.pyplot as plt
import os, math, time
import csv
plt.rcParams['figure.figsize'] = (20.0, 16.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
class DataGenerator():
    """
    A data generator object that flows data from a selected source.
    """
    def __init__(self, log_file, img_dir,
                 batch_size=2, sample_size=10,
                 file_type='csv', img_ext='.png',
                 target_size=(480, 640), starting_row=0):
        """
        args
        ----
        log_file: <str> path to the log file
        img_dir: <str> path to the image directory
        batch_size: <int> how many samples to generate each time
        sample_size: <int> how many total samples to generate
        file_type: ['csv', 'h5'] log file type (.h5 not implemented)
        img_ext: <str> image file extension ('.png' or '.npy')
        target_size: <tuple> desired (height, width) of each image
        starting_row: <int> log row to start reading from
        """
        self.target_size = target_size
        self.batch_size = batch_size
        self.sample_size = sample_size
        self.img_dir = img_dir
        self.img_ext = img_ext

        # Check the requested sample size against the log row count
        # (minus one for the header row)
        with open(log_file, "r") as f:
            log = list(csv.reader(f, delimiter=","))
        rows = len(log)

        if self.sample_size > rows - 1:
            print('Sample size larger than available data. '
                  'Setting sample size to {}'.format(rows - 1))
            self.sample_size = rows - 1

        if file_type == 'csv':
            self.reader = pd.read_csv(log_file,
                                      chunksize=batch_size,
                                      header=0,
                                      skiprows=range(1, starting_row))
        else:
            raise ValueError('file type not implemented')
    def __iter__(self):
        for _ in range(0, self.sample_size, self.batch_size):
            batch = self.reader.get_chunk()
            images = self._process_images(batch.filename, self.img_ext)
            yield images, batch

    def __next__(self):
        batch = self.reader.get_chunk()
        images = self._process_images(batch.filename, self.img_ext)
        return images, batch
    def _process_images(self, dir_list, ext):
        """
        Loads images from file, performs image processing (if any)

        inputs
        ------
        dir_list: list of image file paths

        returns
        -------
        images: np array of images
        """
        images = []  # np.zeros(shape=(self.batch_size, *self.target_size, 3))
        for i, line in enumerate(dir_list):
            full_path = '/'.join([self.img_dir, line])
            full_path = os.path.splitext(full_path)[0] + ext
            if ext == '.npy':
                images.append(np.load(full_path))
            else:
                # Note: scipy.misc.imread requires Pillow and was removed
                # in SciPy 1.2; imageio.imread is the modern replacement.
                images.append(misc.imread(full_path, mode='RGB'))
            # TODO: Resize image to target size
            # TODO: Figure out how to use the image processing features
            #       of the inherited DataGenerator on the loaded image
        return np.array(images)
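
# Usage sketch (not in the original gist): stream a couple of batches and
# inspect their shapes. The paths here are hypothetical placeholders; the
# log CSV is assumed to have a header row and a `filename` column, as
# `_process_images` expects.
def _demo_data_generator():
    gen = DataGenerator(log_file='driving_log.csv', img_dir='IMG',
                        batch_size=2, sample_size=4)
    for images, batch in gen:
        print(images.shape, list(batch.columns))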
class DataWriter(object):
    """
    Writes numpy arrays to disk, either as a single .h5 file or as a folder
    tree with one subfolder per camera view.

    args
    ----
    output_dir: <str> directory the output is written into
    name: <str> dataset name (e.g. name of the model used to process data)
    output_type: ['hdf', 'folder'] storage format
    """
    def __init__(self, output_dir, name, output_type='folder'):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        self.type = output_type
        self.path = '/'.join([output_dir, name])

        # Create subfolders
        if self.type == 'hdf':
            self.path = self.path + '.h5'
        elif self.type == 'folder':
            for camera_view in ['left', 'center', 'right']:
                sub_folder = '/'.join([self.path, camera_view])
                if not os.path.exists(sub_folder):
                    os.makedirs(sub_folder)
        else:
            raise ValueError("output type must be either <hdf> or <folder>")
    def __call__(self, x, y):
        if self.type == 'hdf':
            # NOTE: HDFStore.put expects a pandas object; raw image arrays
            # would need to be wrapped (or stored via h5py) for this branch.
            with pd.HDFStore(self.path) as hdf:
                for i, file in enumerate(y.filename):
                    hdf.put(file, x[i])
        else:
            for i, file in enumerate(y.filename):
                # Drop the image extension; np.save appends '.npy'
                path = os.path.splitext('/'.join([self.path, file]))[0]
                np.save(path, x[i], allow_pickle=False)
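
# Usage sketch (not in the original gist): write one dummy frame through the
# `folder` backend. The array shape and filename are hypothetical; filenames
# are assumed to carry a camera-view prefix, e.g. 'center/0001.png'.
def _demo_data_writer():
    writer = DataWriter('demo-output', 'sample_data', output_type='folder')
    frames = np.zeros((1, 480, 640, 3), dtype=np.float32)
    labels = pd.DataFrame({'filename': ['center/0001.png']})
    writer(frames, labels)  # -> demo-output/sample_data/center/0001.npy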
def load_udacity_data(file_path='',
                      img_dir='',
                      batch=100, val_percent=.2,
                      shuffle=False, rescale=True):
    """
    Loads in images as features, steering angle as label.

    inputs
    ------
    file_path  : path to the interpolated log .csv (defaults to HMB_1)
    img_dir    : path to the image directory (defaults to HMB_1)
    batch      : total number of samples to be read in
    val_percent: percent of batch to be assigned to validation (i.e. 0.2)
    shuffle    : TODO -> shuffle dataset before returning
    rescale    : divide pixel values by 255

    returns
    -------
    X_train : (num_train, height, width, channels) array
    Y_train : (num_train,) array of steering angles
    X_valid : (num_valid, height, width, channels) array
    Y_valid : (num_valid,) array of steering angles
    """
    # Dataset folder
    if not file_path:
        file_path = "../Car/datasets/HMB_1/output/interpolated.csv"
    assert os.path.isfile(file_path), 'interpolated dataset not found'

    if not img_dir:
        img_dir = '../Car/datasets/HMB_1/output/'
    assert os.path.isdir(img_dir), 'image directory not found'

    # Starting with just the center camera
    dataset = pd.read_csv(file_path)
    dataset = dataset[dataset['frame_id'] == 'center_camera']

    # Add directory path to dataset
    dataset['filename'] = img_dir + dataset['filename']

    # Setup data placeholders (all frames must share one resolution)
    assert max(dataset['width']) == min(dataset['width'])
    assert max(dataset['height']) == min(dataset['height'])

    width = max(dataset['width'])
    height = max(dataset['height'])
    channels = 3

    if batch > dataset.shape[0]:
        batch = dataset.shape[0]

    X = np.zeros((batch, height, width, channels))
    Y = np.zeros((batch,))

    num_train = int(batch * (1 - val_percent))
    num_valid = batch - num_train  # avoids losing a sample to rounding

    mask = range(num_train, num_train + num_valid)
    X_valid = X[mask]
    Y_valid = Y[mask]

    mask = range(num_train)
    X_train = X[mask]
    Y_train = Y[mask]

    del X
    del Y

    # Read in file data
    count = 0
    for rw in range(0, batch):
        angle = dataset['angle'].iloc[rw]
        ipath = dataset['filename'].iloc[rw]
        image = misc.imread(ipath)

        if count < num_train:
            X_train[count] = image
            Y_train[count] = angle
        else:
            X_valid[count - num_train] = image
            Y_valid[count - num_train] = angle
        count += 1

    data = {'X_train': X_train,
            'Y_train': Y_train,
            'X_valid': X_valid,
            'Y_valid': Y_valid}

    if rescale:
        data['X_train'] /= 255
        data['X_valid'] /= 255

    return data
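
# Usage sketch (not in the original gist), assuming the default HMB_1 paths
# exist locally: pull a small train/validation split and check the shapes.
def _demo_load_udacity():
    data = load_udacity_data(batch=10, val_percent=0.2)
    print(data['X_train'].shape, data['Y_train'].shape)
    print(data['X_valid'].shape, data['Y_valid'].shape)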
def load_commai_data(log_file, cam_file):
    """
    Loads .h5 files from comma.ai's car dataset.

    inputs
    ------
    log_file: file path for the sensor log .h5 file
    cam_file: file path for the camera frames .h5 file

    returns
    -------
    log: pandas DataFrame of the log file, indexed by cam1_ptr
    cam: PyTables CArray of shape (frame, height, width, channels)
    """
    log_store = pd.HDFStore(log_file)
    cam_store = pd.HDFStore(cam_file)

    samples = len(log_store.root.cam1_ptr)
    data_dic = {}

    # Read datasets into a dictionary, splitting multi-dimensional
    # sensors into one column per dimension
    for d in log_store.root:
        if d.shape[0] == samples:
            if d.ndim == 1:
                data_dic[d.name] = d[:]
            else:
                for dim in range(d.shape[1]):
                    data_dic['{}-{}'.format(d.name, dim + 1)] = d[:, dim]

    # Average the log sensors per frame in a DataFrame, create cam 4D array
    log = pd.DataFrame(data_dic).groupby('cam1_ptr').mean()
    cam = cam_store.root.X[:]
    return log, cam
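
# Usage sketch (not in the original gist): the comma.ai dataset ships matching
# log/camera pairs; these file names are illustrative, not verified.
def _demo_load_commai():
    log, cam = load_commai_data('log/2016-01-30--11-24-51.h5',
                                'camera/2016-01-30--11-24-51.h5')
    print(log.shape, cam.shape)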
if __name__ == "__main__":
    from config import get_user_settings

    # Get model settings from config file
    config = get_user_settings()['USER']

    dfeed = DataGenerator(log_file=config['log file path'],
                          img_dir=config['image dir'],
                          batch_size=int(config['batch size']),
                          sample_size=100,
                          starting_row=int(config['starting row']))

    store = DataWriter('data-utils-output', 'sample_data')

    for imgs, labels in dfeed:
        # Cast to float first: dividing in place on the original uint8
        # array would truncate every pixel to 0 or 1.
        imgs = imgs.astype(np.float32)
        # Normalize each channel of each image to [0, 1] by its max value
        for i in range(imgs.shape[0]):
            for c in range(imgs.shape[3]):
                imgs[i, :, :, c] /= np.max(imgs[i, :, :, c])
        store(imgs, labels)
def stopwatch(start, comment):
    lap = math.floor(time.time() - start)
    print('{}: {} min {} sec'.format(comment, lap // 60, lap % 60))
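
# Usage sketch (not in the original gist): time any block of work.
#   start = time.time()
#   ... do work ...
#   stopwatch(start, 'processing time')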