Created
February 22, 2018 17:25
-
-
Save Yuri-M-Dias/be44367d7fd02db80e84f6b8c3949c7c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
from scipy.io import wavfile | |
from glob import glob | |
import cv2 | |
import pandas as pd | |
import numpy as np | |
from os.path import basename | |
import gc | |
import h5py | |
import os | |
import wave | |
import pylab | |
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Edge length, in pixels, of the square images this script works with.
max_edge = 600

# Glob pattern selecting the input PNG images.
# (Alternative input folder: 'Dataset_5s_img_1/*.png')
data_img = 'Dataset_5s_img/*.png'

# CSV with the per-id labels; its 'id' column becomes the DataFrame index.
csv_file = 'train.csv'

# ---------------------------------------------------------------------------
# Inputs
# ---------------------------------------------------------------------------
# Every path matching the pattern, in whatever order glob() yields them.
img_file = glob(data_img)

# Comma-separated label table, looked up by id.
csv_reader = pd.read_csv(csv_file, sep=',', index_col='id')

# ---------------------------------------------------------------------------
# Working state
# ---------------------------------------------------------------------------
# Accumulation buffers: images, gender labels and score labels.
wavs_imgs, genders, scores = [], [], []

# Running image counter.
n = 0

# Start from a clean heap before any large allocation happens.
gc.collect()
# Create (or overwrite) the output file with three empty datasets that can
# later be grown along their first axis:
#   'z' -> gender labels, 'x' -> max_edge x max_edge images, 'y' -> scores.
with h5py.File('Depression_dataset_600_5s_incremental.h5', 'w') as hf:
    # Options shared by all three datasets: 8-bit unsigned storage and
    # automatic chunking (required for resizable datasets).
    common = dict(dtype='uint8', chunks=True)

    dsetZ = hf.create_dataset(
        'z', (0,), data=genders, maxshape=(None,), **common
    )
    print('dsetZ: ', dsetZ.dtype, ' ', dsetZ.shape)

    dsetX = hf.create_dataset(
        'x', (0, max_edge, max_edge), data=wavs_imgs,
        maxshape=(None, None, None), **common
    )
    print('dsetX: ', dsetX.dtype, ' ', dsetX.shape)

    dsetY = hf.create_dataset(
        'y', (0,), data=scores, maxshape=(None,), **common
    )
    print('dsetY: ', dsetY.dtype, ' ', dsetY.shape)

    # Show which datasets the new file now contains.
    print(list(hf.keys()))

print('Created empty H5 file')
def _letterbox(img, edge):
    """Return `img` scaled and zero-padded to an `edge` x `edge` square.

    The longer side is resized to exactly `edge` pixels, the shorter side
    is scaled by the same ratio, and the remaining space on the shorter
    axis is split between both sides with constant (zero) padding.

    Args:
        img: 2-D image array (rows, cols), as returned by a grayscale read.
        edge: target side length in pixels.

    Returns:
        A 2-D array of shape (edge, edge).
    """
    rows, cols = img.shape[:2]
    aspect_ratio = rows / float(cols)
    if rows >= cols:
        # Height is the longer (or equal) side: it becomes `edge`.
        # Clamp to 1 so extreme aspect ratios never produce a 0-pixel side.
        new_cols = max(1, int(edge / aspect_ratio))
        dsize = (new_cols, edge)  # cv2 wants (width, height)
        before = (edge - new_cols) // 2
        pad_width = ((0, 0), (before, edge - new_cols - before))
    else:
        # Width is the longer side: it becomes `edge`.
        new_rows = max(1, int(edge * aspect_ratio))
        dsize = (edge, new_rows)
        before = (edge - new_rows) // 2
        pad_width = ((before, edge - new_rows - before), (0, 0))
    # The interpolation flag must be passed by keyword: the third positional
    # parameter of cv2.resize is `dst`, not `interpolation`.
    resized = cv2.resize(img, dsize, interpolation=cv2.INTER_AREA)
    return np.pad(resized, pad_width, 'constant')


def _append_rows(dset, rows):
    """Grow `dset` along axis 0 by len(rows) and store `rows` in the new tail."""
    start = dset.shape[0]
    dset.resize(start + len(rows), axis=0)
    dset[start:] = rows


# Target number of chunks. Images are processed and flushed to disk chunk by
# chunk so that only one chunk is held in memory at a time.
chunks = 5
# Ceiling division: every image is covered even when the total is not a
# multiple of `chunks` (the last chunk is simply shorter). At least 1 so an
# empty input yields an empty loop instead of a zero step.
step = max(1, -(-len(img_file) // chunks))

for arrayStart in range(0, len(img_file), step):
    arrayEnd = min(arrayStart + step, len(img_file))

    for n, x in enumerate(img_file[arrayStart:arrayEnd], start=arrayStart + 1):
        # The part of the file name before the first underscore is the
        # integer id used to look the labels up in the CSV index.
        name = basename(x)
        idx = int(name.split('_', 1)[0])

        img = cv2.imread(x, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise IOError('Could not read image: ' + x)

        wavs_imgs.append(_letterbox(img, max_edge))
        scores.append(csv_reader.at[idx, 'score'])
        genders.append(csv_reader.at[idx, 'gender'])
        print('OK: ', n)

    print('Done: reshape for ', arrayStart, '-', arrayEnd)

    with h5py.File('Depression_dataset_600_5s_incremental.h5', 'a') as hf:
        dsetZ = hf['z']
        dsetX = hf['x']
        dsetY = hf['y']
        print('dsetZ: ', dsetZ.dtype, ' ', dsetZ.shape)
        print('dsetX: ', dsetX.dtype, ' ', dsetX.shape)
        print('dsetY: ', dsetY.dtype, ' ', dsetY.shape)

        # Each dataset grows by the number of rows actually collected, which
        # may be smaller than `step` for the final chunk.
        _append_rows(dsetZ, genders)
        _append_rows(dsetY, scores)
        _append_rows(dsetX, wavs_imgs)

        print(dsetZ.shape, dsetX.shape, dsetY.shape)
        print(len(dsetZ), len(dsetX), len(dsetY))
    print('Done: write to h5')

    # Drop the references to this chunk's data and ask the collector to
    # reclaim it before the next chunk is loaded.
    wavs_imgs, genders, scores = [], [], []
    gc.collect()

print('End to create h5')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment