Skip to content

Instantly share code, notes, and snippets.

@jessefreeman
Created May 30, 2019 09:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jessefreeman/da01989fdabd73f3c9f2ce54ddd59fff to your computer and use it in GitHub Desktop.
Save jessefreeman/da01989fdabd73f3c9f2ce54ddd59fff to your computer and use it in GitHub Desktop.
A Python script to convert ChestXray14 CSV labels into per-image `.metadata.json` files to use with MissingLink.ai's Data Volumes.
import pandas as pd
import os
import random
import tqdm
# Integer class id -> finding name.  The position of each name in the tuple
# is its class id, so the order here must not change.
class_mapping = {
    class_id: finding_name
    for class_id, finding_name in enumerate((
        'Atelectasis',
        'Cardiomegaly',
        'Effusion',
        'Infiltration',
        'Mass',
        'Nodule',
        'Pneumonia',
        'Pneumothorax',
        'Consolidation',
        'Edema',
        'Emphysema',
        'Fibrosis',
        'Pleural_Thickening',
        'Hernia',
    ))
}
# Original CSV header -> identifier-friendly column name (no spaces,
# brackets or '#').
_DATA_ENTRY_COLUMNS = {
    "Follow-up #": "Follow_up_Number",
    "OriginalImage[Width": "OriginalImage_Width",
    "Height]": "OriginalImage_Height",
    "OriginalImagePixelSpacing[x": "OriginalImagePixelSpacing_x",
    "y]": "OriginalImagePixelSpacing_y",
    "Patient ID": "Patient_ID",
    "Patient Age": "Patient_Age",
    "Patient Gender": "Patient_Gender",
    "View Position": "View_Position",
    "Image Index": "Image_Index",
    "Finding Labels": "Finding_Labels",
    "Unnamed: 11": "Unnamed_11",
}

# Load the per-image label table.  `index=str` turns the row labels into
# strings, which is why later code addresses rows as `str(count)`.
df = pd.read_csv('Data_Entry_2017.csv').rename(
    index=str, columns=_DATA_ENTRY_COLUMNS
)

# Extra metadata columns, initialised with the same value on every row.
for column, default in (
    ('Single_Finding', False),
    ('Paper_Split', 'train'),
    ('type', 'Image'),
):
    df[column] = default

# Starts as a copy of the file name.
df['Index_No'] = df['Image_Index']
# Snapshot of the table as a list of row dicts.  It is named `records`
# rather than `dict` so the builtin `dict` is not shadowed for the rest of
# the script.
records = df.to_dict(orient='records')

# File names in the same order as the rows of `df`.
image_names = df['Image_Index'].values.tolist()

# Split lists: one image file name per line.
# header=None is required: without it pandas treats the first file name as a
# column header and that image silently disappears from the list.
# NOTE(review): assumes the split files have no header row (the layout of
# the published ChestXray14 split lists) -- confirm against your copies.
train_val_list_pd = pd.read_csv('train_val_list.txt', header=None)
train_val_list = train_val_list_pd[0].tolist()
test_list_pd = pd.read_csv('test_list.txt', header=None)
test_list = test_list_pd[0].tolist()

# Candidate splits assigned at random to images on the train/val list.
choose = ['train', 'validation']

# Demo limit: only the first 500 images are processed from here on.
# Remove this line to process the full dataset.
image_names = image_names[:500]

class_mapping_keys = class_mapping.keys()
class_mapping_items = class_mapping.items()
# Fill in Index_No, Single_Finding and Paper_Split for every selected image.
#
# Membership sets are built once so each test inside the loop is O(1)
# instead of a scan over a list.
single_finding_names = set(class_mapping.values())
train_val_images = set(train_val_list)
test_images = set(test_list)

for count, image in enumerate(tqdm.tqdm(image_names)):
    # `image_names` is in `df` row order and the index labels are the
    # stringified row positions, so str(count) addresses this image's row
    # directly -- no need to scan the Image_Index column on every iteration.
    row = str(count)

    # File name without its extension.
    df.loc[row, 'Index_No'] = image.split(".")[0]

    # True only when the label is exactly one known finding name; combined
    # labels and labels outside class_mapping keep the default False.
    if df.loc[row, 'Finding_Labels'] in single_finding_names:
        df.loc[row, 'Single_Finding'] = True

    # Images on the train/val list get a random train/validation split;
    # images on the test list are always 'test' (checked last, so it wins).
    if image in train_val_images:
        df.loc[row, 'Paper_Split'] = random.choice(choose)
    if image in test_images:
        df.loc[row, 'Paper_Split'] = 'test'

print(df.head())
# Output files are written relative to the current working directory.
pwd = os.getcwd()

# Original bounding-box CSV header -> identifier-friendly column name.
_BBOX_COLUMNS = {
    "Image Index": "Image_Index",
    "Finding Label": "Finding_Labels_Bbox",
    "Bbox [x": "Bbox_x",
    "y": "Bbox_y",
    "w": "Bbox_w",
    "h]": "Bbox_h",
    "Unnamed: 6": "Unnamed_6",
    "Unnamed: 7": "Unnamed_7",
    "Unnamed: 8": "Unnamed_8",
}

# Only these columns are carried into the merge; the "Unnamed_*" columns
# are dropped.
_BBOX_KEEP = [
    'Image_Index',
    'Finding_Labels_Bbox',
    'Bbox_x',
    'Bbox_y',
    'Bbox_w',
    'Bbox_h',
]

# Load the bounding-box table and normalise its column names.
df_bb = pd.read_csv('BBox_List_2017.csv').rename(
    index=str, columns=_BBOX_COLUMNS
)
df_bb = df_bb[_BBOX_KEEP]

# Attach the box columns to the label table by file name.  The join is
# "outer", so rows from either table are kept even without a match.
df_merged = df.merge(df_bb, on="Image_Index", how="outer")

# Dump the merged table as JSON Lines (one record per line).
df_merged.to_json('df_merged.json', orient='records', lines=True)
# Write one "<image>.metadata.json" file per selected image.
temp_dir = os.path.join(pwd, 'meta_bbox_temp')
# Create the output directory once, up front, instead of checking on every
# iteration.
os.makedirs(temp_dir, exist_ok=True)

# Look rows up by file name rather than by position.  The merge with the
# bounding-box table emits one row per box, so row positions in `df_merged`
# do not line up with positions in `image_names`; indexing by position could
# write another image's metadata under this image's file name.
# When an image has several rows, the first one is the one written.
rows_by_image = (
    df_merged
    .drop_duplicates(subset='Image_Index')
    .set_index('Image_Index', drop=False)  # keep Image_Index in the output
)

for image in tqdm.tqdm(image_names):
    fullname = image + '.metadata.json'
    name = os.path.join(temp_dir, fullname)
    rows_by_image.loc[image].to_json(str(name))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment