Skip to content

Instantly share code, notes, and snippets.

@arcaduf
Created April 3, 2019 07:19
Show Gist options
  • Save arcaduf/9eedd62f3264874a8b995a3108bac218 to your computer and use it in GitHub Desktop.
Save arcaduf/9eedd62f3264874a8b995a3108bac218 to your computer and use it in GitHub Desktop.
Prepare KaggleDR dataset for DL
'''
Link image data to ground truth
'''
from __future__ import print_function
import glob , os
import pandas as pd
import numpy as np
# User input: location of the ground-truth CSV and of the image folder.
file_csv = '< path to >/trainLabels.csv'
path_imgs = '<path to downloaded images>'
SEP = ','

# Open CSV with ground truth
df_1 = pd.read_csv(file_csv, sep=SEP)
print('\nData frame shape: ', df_1.shape)

# Binarize label: levels 0 and 1 map to 0, every level above 1 maps to 1.
# The builtin ``int`` is used because the ``np.int`` alias was removed from
# NumPy (1.24+).
levels = df_1['level'].values
levels_bin = (levels > 1).astype(int)
df_1['level-binary'] = levels_bin

# Collect all '.jpeg' images found recursively under the given path.
list_imgs = []    # full file paths
list_bnames = []  # file names without extension; merge key against the CSV
list_ids = []     # part of the base name before the first underscore
for dirpath, dirnames, filenames in os.walk(path_imgs):
    for filename in filenames:
        if not filename.endswith('.jpeg'):
            continue
        list_imgs.append(os.path.join(dirpath, filename))
        bname = os.path.splitext(os.path.basename(filename))[0]
        list_bnames.append(bname)
        # NOTE(review): presumably the prefix is a patient identifier
        # (names shaped like '<id>_<side>') -- confirm against the dataset.
        img_id = bname.split('_')[0]
        list_ids.append(img_id)
print('\nFound ', len(list_imgs), ' inside ', path_imgs)

# Create data frame for image filepaths
df_2 = pd.DataFrame({
    'id': list_ids,
    'image': list_bnames,
    'filepaths': list_imgs,
})

# Merge data frames on the image base name. pandas' default join is
# 'inner', so CSV rows without a matching file on disk (and files without
# a CSV row) are dropped from the result.
df_merge = pd.merge(df_1, df_2, on=['image'])
print('\nMerged data frame shape: ', df_merge.shape)

# Save merged data frame next to the input CSV
file_out = os.path.join(os.path.dirname(file_csv), 'labels_linked_to_imgs.csv')
df_merge.to_csv(file_out, sep=SEP, index=False)
print('\n\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment