Yuri-M-Dias/gera_h5_reshape_incremental.py

## gera_h5_reshape_incremental.py
import matplotlib.pyplot as plt
from scipy.io import wavfile
from glob import glob
import cv2
import pandas as pd
import numpy as np
from os.path import basename
import gc
import h5py
import os
import wave
import pylab

# Edge length, in pixels, of the square images stored in the H5 file.
max_edge = 600

# Alternative input directory: 'Dataset_5s_img_1/*.png'
data_img = 'Dataset_5s_img/*.png'
csv_file = 'train.csv'

# glob() returns paths in arbitrary, filesystem-dependent order; sort them so
# the row order of the generated datasets is reproducible between runs.
img_file = sorted(glob(data_img))
# Label table, indexed by its 'id' column.
csv_reader = pd.read_csv(csv_file, index_col='id')

# Per-batch accumulators; they are filled and flushed by the chunked loop
# further down in this script.
wavs_imgs = []
genders = []
scores = []

gc.collect()

n = 0

# Start from a fresh file holding three empty datasets that can grow along
# their first axis:
#   'x' - images, one (max_edge, max_edge) uint8 array per row
#   'y' - values of the 'score' column
#   'z' - values of the 'gender' column
# No initial data is passed: the datasets start with zero rows and are
# extended with resize() when each batch is written.
with h5py.File('Depression_dataset_600_5s_incremental.h5', 'w') as hf:
    dsetZ = hf.create_dataset(
        'z', (0,), dtype='uint8', chunks=True, maxshape=(None,)
    )
    print('dsetZ: ', dsetZ.dtype, ' ', dsetZ.shape)

    dsetX = hf.create_dataset(
        'x', (0, max_edge, max_edge), dtype='uint8', chunks=True,
        maxshape=(None, None, None)
    )
    print('dsetX: ', dsetX.dtype, ' ', dsetX.shape)

    dsetY = hf.create_dataset(
        'y', (0,), dtype='uint8', chunks=True, maxshape=(None,)
    )
    print('dsetY: ', dsetY.dtype, ' ', dsetY.shape)
    print(list(hf.keys()))

print('Created empty H5 file')

# The images are converted and appended in `chunks` batches so that only one
# batch of decoded images has to be held in memory at a time.
chunks = 5
# Ceiling division: every image is assigned to a batch even when the image
# count is not a multiple of `chunks`; trailing batches may then be shorter
# (or empty) instead of the remainder being silently dropped.
step = -(-len(img_file) // chunks)
arrayStart = 0
arrayEnd = step
for i in range(chunks):
    # `n` is the 1-based position of the image in `img_file`, for progress
    # output only.
    for n, x in enumerate(img_file[arrayStart:arrayEnd], arrayStart + 1):
        name = basename(x)
        # File names are expected to look like '<id>_<rest>'; <id> is the row
        # label used to look up the labels in the CSV.
        img_id, _ = name.split('_')
        idx = int(img_id)

        img = cv2.imread(x, 0)  # flag 0: load as single-channel grayscale
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise IOError('Could not read image: ' + x)

        # Scale the longer side to max_edge while keeping the aspect ratio,
        # then zero-pad the shorter side symmetrically up to max_edge.
        aspect_ratio = img.shape[0] / img.shape[1]
        if np.argmax(img.shape) == 0:
            # Height >= width: height becomes max_edge, columns get padded.
            c = max(1, int(max_edge / aspect_ratio))
            new_shape = (c, max_edge)  # cv2 expects (width, height)
            pw = (max_edge - c) // 2
            pad_width = ((0, 0), (pw, max_edge - c - pw))
        else:
            # Width > height: width becomes max_edge, rows get padded.
            r = max(1, int(max_edge * aspect_ratio))
            new_shape = (max_edge, r)  # cv2 expects (width, height)
            pw = (max_edge - r) // 2
            pad_width = ((pw, max_edge - r - pw), (0, 0))
        # The interpolation flag must be passed by keyword: the third
        # positional parameter of cv2.resize is `dst`.
        img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA)
        img = np.pad(img, pad_width, 'constant')

        score = csv_reader.at[idx, 'score']
        gender = csv_reader.at[idx, 'gender']

        wavs_imgs.append(img)
        scores.append(score)
        genders.append(gender)
        print('OK: ', n)

    print('Done: reshape for ', arrayStart, '-', arrayEnd)

    # Number of rows actually collected; may be smaller than `step`.
    added = len(wavs_imgs)
    with h5py.File('Depression_dataset_600_5s_incremental.h5', 'a') as hf:
        dsetZ = hf['z']
        dsetX = hf['x']
        dsetY = hf['y']
        print('dsetZ: ', dsetZ.dtype, ' ', dsetZ.shape)
        print('dsetX: ', dsetX.dtype, ' ', dsetX.shape)
        print('dsetY: ', dsetY.dtype, ' ', dsetY.shape)

        # Grow each dataset by the batch size and fill the new tail rows.
        # Skipped for an empty batch, where a [-0:] slice would address the
        # whole dataset.
        if added:
            dsetZ.resize(dsetZ.shape[0] + added, axis=0)
            dsetZ[-added:] = genders

            dsetY.resize(dsetY.shape[0] + added, axis=0)
            dsetY[-added:] = scores

            dsetX.resize(dsetX.shape[0] + added, axis=0)
            dsetX[-added:] = wavs_imgs

        print(dsetZ.shape, dsetX.shape, dsetY.shape)
        # len() reports the first-axis length without loading the data.
        print(len(dsetZ), len(dsetX), len(dsetY))

    print('Done: write to h5')
    # Drop the finished batch before the next one is decoded.
    wavs_imgs = []
    genders = []
    scores = []
    arrayStart += step
    arrayEnd = arrayStart + step
    gc.collect()

print('End to create h5')
	import matplotlib.pyplot as plt
	from scipy.io import wavfile
	from glob import glob
	import cv2
	import pandas as pd
	import numpy as np
	from os.path import basename
	import gc
	import h5py
	import os
	import wave
	import pylab

	# NOTE(review): this section repeats the script that appears earlier in
	# this file. Its block structure had been flattened to a single tab and has
	# been reconstructed here; only one copy of the script should be kept.

	# Edge length, in pixels, of the square images stored in the H5 file.
	max_edge = 600

	# Alternative input directory: 'Dataset_5s_img_1/*.png'
	data_img = 'Dataset_5s_img/*.png'
	csv_file = 'train.csv'

	# glob() returns paths in arbitrary, filesystem-dependent order; sort them
	# so the row order of the generated datasets is reproducible between runs.
	img_file = sorted(glob(data_img))
	# Label table, indexed by its 'id' column.
	csv_reader = pd.read_csv(csv_file, index_col='id')

	# Per-batch accumulators, flushed to the H5 file after every batch.
	wavs_imgs = []
	genders = []
	scores = []

	gc.collect()

	n = 0

	# Start from a fresh file holding three empty datasets that can grow along
	# their first axis:
	#   'x' - images, one (max_edge, max_edge) uint8 array per row
	#   'y' - values of the 'score' column
	#   'z' - values of the 'gender' column
	with h5py.File('Depression_dataset_600_5s_incremental.h5', 'w') as hf:
		dsetZ = hf.create_dataset(
			'z', (0,), dtype='uint8', chunks=True, maxshape=(None,)
		)
		print('dsetZ: ', dsetZ.dtype, ' ', dsetZ.shape)

		dsetX = hf.create_dataset(
			'x', (0, max_edge, max_edge), dtype='uint8', chunks=True,
			maxshape=(None, None, None)
		)
		print('dsetX: ', dsetX.dtype, ' ', dsetX.shape)

		dsetY = hf.create_dataset(
			'y', (0,), dtype='uint8', chunks=True, maxshape=(None,)
		)
		print('dsetY: ', dsetY.dtype, ' ', dsetY.shape)
		print(list(hf.keys()))

	print('Created empty H5 file')

	# The images are converted and appended in `chunks` batches so that only
	# one batch of decoded images has to be held in memory at a time.
	chunks = 5
	# Ceiling division: every image is assigned to a batch even when the image
	# count is not a multiple of `chunks`; trailing batches may then be shorter
	# (or empty) instead of the remainder being silently dropped.
	step = -(-len(img_file) // chunks)
	arrayStart = 0
	arrayEnd = step
	for i in range(chunks):
		# `n` is the 1-based position of the image in `img_file`, for progress
		# output only.
		for n, x in enumerate(img_file[arrayStart:arrayEnd], arrayStart + 1):
			name = basename(x)
			# File names are expected to look like '<id>_<rest>'; <id> is the
			# row label used to look up the labels in the CSV.
			img_id, _ = name.split('_')
			idx = int(img_id)

			img = cv2.imread(x, 0)  # flag 0: load as single-channel grayscale
			if img is None:
				# cv2.imread signals failure by returning None, not by raising.
				raise IOError('Could not read image: ' + x)

			# Scale the longer side to max_edge while keeping the aspect
			# ratio, then zero-pad the shorter side symmetrically.
			aspect_ratio = img.shape[0] / img.shape[1]
			if np.argmax(img.shape) == 0:
				# Height >= width: height becomes max_edge, columns get padded.
				c = max(1, int(max_edge / aspect_ratio))
				new_shape = (c, max_edge)  # cv2 expects (width, height)
				pw = (max_edge - c) // 2
				pad_width = ((0, 0), (pw, max_edge - c - pw))
			else:
				# Width > height: width becomes max_edge, rows get padded.
				r = max(1, int(max_edge * aspect_ratio))
				new_shape = (max_edge, r)  # cv2 expects (width, height)
				pw = (max_edge - r) // 2
				pad_width = ((pw, max_edge - r - pw), (0, 0))
			# The interpolation flag must be passed by keyword: the third
			# positional parameter of cv2.resize is `dst`.
			img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA)
			img = np.pad(img, pad_width, 'constant')

			score = csv_reader.at[idx, 'score']
			gender = csv_reader.at[idx, 'gender']

			wavs_imgs.append(img)
			scores.append(score)
			genders.append(gender)
			print('OK: ', n)

		print('Done: reshape for ', arrayStart, '-', arrayEnd)

		# Number of rows actually collected; may be smaller than `step`.
		added = len(wavs_imgs)
		with h5py.File('Depression_dataset_600_5s_incremental.h5', 'a') as hf:
			dsetZ = hf['z']
			dsetX = hf['x']
			dsetY = hf['y']
			print('dsetZ: ', dsetZ.dtype, ' ', dsetZ.shape)
			print('dsetX: ', dsetX.dtype, ' ', dsetX.shape)
			print('dsetY: ', dsetY.dtype, ' ', dsetY.shape)

			# Grow each dataset by the batch size and fill the new tail rows.
			# Skipped for an empty batch, where a [-0:] slice would address
			# the whole dataset.
			if added:
				dsetZ.resize(dsetZ.shape[0] + added, axis=0)
				dsetZ[-added:] = genders

				dsetY.resize(dsetY.shape[0] + added, axis=0)
				dsetY[-added:] = scores

				dsetX.resize(dsetX.shape[0] + added, axis=0)
				dsetX[-added:] = wavs_imgs

			print(dsetZ.shape, dsetX.shape, dsetY.shape)
			# len() reports the first-axis length without loading the data.
			print(len(dsetZ), len(dsetX), len(dsetY))

		print('Done: write to h5')
		# Drop the finished batch before the next one is decoded.
		wavs_imgs = []
		genders = []
		scores = []
		arrayStart += step
		arrayEnd = arrayStart + step
		gc.collect()

	print('End to create h5')