Skip to content

Instantly share code, notes, and snippets.

@messefor
Last active November 7, 2022 21:45
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save messefor/e2ee5fe1c18a040c90bbf91f2ee279e3 to your computer and use it in GitHub Desktop.
Save messefor/e2ee5fe1c18a040c90bbf91f2ee279e3 to your computer and use it in GitHub Desktop.
Convert IMDB META data(matlab .mat file) to csv.
#! /usr/bin/env python
"""
Convert IMDB META data(matlab .mat) to csv.
IMDB-WIKI – 500k+ face images with age and gender labels
- https://data.vision.ee.ethz.ch/cvl/rrothe/imdb-wiki/
- https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python
"""
from datetime import datetime, timedelta
import numpy as np
import scipy
from scipy.io import loadmat
import pandas as pd
def matlab_datenum2dt(matlab_datenum):
return datetime.fromordinal(int(matlab_datenum) - 366) +\
timedelta(days=int(matlab_datenum%1))
def main():
path_save = 'wiki/wiki.pkl'
path_mat = 'wiki/wiki.mat'
mat = loadmat(path_mat)
print(mat['__header__'])
print(mat['__version__'])
# Extract values
dt = mat['wiki'][0, 0]
# Check for columns
print('columns:\n', dt.dtype.names)
# Extract values with simple format
keys_s = ('gender', 'dob', 'photo_taken',
'face_score', 'second_face_score')
values = {k: dt[k].squeeze() for k in keys_s}
# Extract values with nested format
keys_n = ('full_path', 'name')
for k in keys_n:
values[k] = np.array([x if not x else x[0] for x in dt[k][0]])
# Convert face location to DataFrame
# img(face_location(2):face_location(4),face_location(1):face_location(3),:))
values['face_location'] =\
[tuple(x[0].tolist()) for x in dt['face_location'].squeeze()]
# Check all values extracted have same length
set_nrows = {len(v) for _, v in values.items()}
assert len(set_nrows) == 1
df_values = pd.DataFrame(values)
# Convert matlab datenum to datetime
df_values['dob'] = df_values['dob'].apply(matlab_datenum2dt)
# Calc ages when photo taken
df_values['photo_taken_age'] = \
df_values.apply(lambda x: x['photo_taken'] - x['dob'].year, axis=1)
# Concat all together and save
# Do not use csv format to work around tuple to be string
df_values.to_pickle(path_save)
if __name__ == '__main__':
main()
@dan-r95
Copy link

dan-r95 commented Nov 7, 2022

👍 worked like a charm.
I just converted it to csv df_values.to_csv('wiki/wiki.csv', index=False)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment