@nulledge
Created October 1, 2018 02:11
Convert Human3.6M from MATLAB to Python
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"import cv2\n",
"import h5py\n",
"import imageio\n",
"import math\n",
"import matlab.engine\n",
"import numpy as np\n",
"import os\n",
"import pickle\n",
"import skimage\n",
"import skimage.io\n",
"import skimage.transform\n",
"from functools import lru_cache\n",
"from tqdm import tqdm as tqdm\n",
"from vectormath import Vector2, Vector3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"root = 'D:/data/Human3.6M/Release-v1.1/'\n",
"script_paths = [subdir for subdir, _, _ in os.walk(root) if '.git' not in subdir]\n",
"additional_script_paths = [\n",
" # empty\n",
"]\n",
"subjects = [\n",
" 1, 5, 6, 7, 8, # training\n",
" 9, 11, # validation\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"core = matlab.engine.start_matlab()\n",
"for script_path in script_paths + additional_script_paths:\n",
" core.addpath(script_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"core.workspace['DB'] = core.H36MDataBase.instance()\n",
"core.workspace['feature_RGB'] = core.H36MRGBVideoFeature()\n",
"core.workspace['feature_BB'] = core.H36MMyBBMask()\n",
"core.workspace['feature_BG'] = core.H36MMyBGMask()\n",
"core.workspace['features'] = [\n",
" core.H36MPose2DPositionsFeature(),\n",
" core.H36MPose3DPositionsFeature('Monocular', True),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def valid_sequence(subject, action, sub_action, camera):\n",
" return subject in [1, 5, 6, 7, 8, 9, 11] and\\\n",
" 1 <= action <= 16 and\\\n",
" 1 <= sub_action <= 2 and\\\n",
" 1 <= camera <= 4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_max_frame(subject, action, sub_action):\n",
" return int(core.getNumFrames(core.workspace['DB'], subject, action, sub_action))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_sequence(subject, action, sub_action, camera):\n",
" core.workspace['sequence'] = core.H36MSequence(subject, action, sub_action, camera, -1)\n",
" return core.workspace['sequence']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_intrinsics(subject, action, sub_action, camera):\n",
" if not valid_sequence(subject, action, sub_action, camera):\n",
" raise IndexError()\n",
" \n",
" sequence = get_sequence(subject, action, sub_action, camera)\n",
" core.workspace['camera'] = core.getCamera(sequence)\n",
" \n",
" f, c, k, p = [core.eval('camera.%s' % attrib)[0] for attrib in ['f', 'c', 'k', 'p']]\n",
" \n",
" return f, c, k, p"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_RGB(subject, action, sub_action, camera, frame):\n",
" if not valid_sequence(subject, action, sub_action, camera):\n",
" raise IndexError()\n",
" \n",
" max_frame = get_max_frame(subject, action, sub_action)\n",
" if not (1 <= frame <= max_frame):\n",
" raise IndexError()\n",
" \n",
" sequence = get_sequence(subject, action, sub_action, camera)\n",
" core.workspace['metadata'] = core.serializer(core.workspace['feature_RGB'], sequence)\n",
" \n",
" image = core.getFrame(core.workspace['metadata'], core.double(frame))\n",
" image = np.reshape(np.asarray(image._data, dtype=np.float), newshape=(image._size[2], image._size[1], image._size[0])).transpose(2, 1, 0)\n",
" \n",
" video_name = core.eval('metadata.Reader.VideoName')\n",
" \n",
" return image, video_name"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_video_name(subject, action, sub_action, camera):\n",
" if not valid_sequence(subject, action, sub_action, camera):\n",
" raise IndexError()\n",
" \n",
" sequence = get_sequence(subject, action, sub_action, camera)\n",
" core.workspace['metadata'] = core.serializer(core.workspace['feature_RGB'], sequence)\n",
" \n",
" video_name = core.eval('metadata.Reader.VideoName')\n",
" \n",
" return video_name"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_pose(subject, action, sub_action, camera, frame):\n",
" if not valid_sequence(subject, action, sub_action, camera):\n",
" raise IndexError()\n",
" \n",
" max_frame = get_max_frame(subject, action, sub_action)\n",
" if not (1 <= frame <= max_frame):\n",
" raise IndexError()\n",
" \n",
" sequence = get_sequence(subject, action, sub_action, camera)\n",
" core.eval('sequence.IdxFrames = %d;' % frame, nargout=0)\n",
" \n",
" pose = core.H36MComputeFeatures(sequence, core.workspace['features'])\n",
" \n",
" return np.reshape(np.asarray(pose[0]), newshape=(32, 2)),\\\n",
" np.reshape(np.asarray(pose[1]), newshape=(32, 3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_center_scale(subject, action, sub_action, camera, frame):\n",
" if not valid_sequence(subject, action, sub_action, camera):\n",
" raise IndexError()\n",
" \n",
" max_frame = get_max_frame(subject, action, sub_action)\n",
" if not (1 <= frame <= max_frame):\n",
" raise IndexError()\n",
" \n",
" sequence = get_sequence(subject, action, sub_action, camera)\n",
" core.workspace['metadata'] = core.serializer(core.workspace['feature_BB'], sequence)\n",
" \n",
" mask = core.getFrame(core.workspace['metadata'], core.double(frame))\n",
" mask = np.reshape(np.asarray(mask._data, dtype=np.float), newshape=(mask._size[1], mask._size[0])).transpose(1, 0)\n",
" \n",
" flatten = mask.flatten()\n",
" flatten = np.nonzero(flatten)[0]\n",
" ul, br = [flatten[where] for where in [0, -1]]\n",
" ul = Vector2(ul % mask.shape[1], ul // mask.shape[1])\n",
" br = Vector2(br % mask.shape[1], br // mask.shape[1])\n",
"\n",
" center = (ul + br) / 2\n",
" height = (br - ul).y\n",
" width = (br - ul).x\n",
" scale = max(height, width) / 200\n",
" \n",
" return center, scale"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_center_scale_directly(video_name, frame):\n",
" \n",
" sub = video_name.split('/')[-3].split('\\\\')[0]\n",
" act, cam = video_name.split('/')[-1].split('.mp4')[0].split('.')\n",
" #act = act.replace(' ', '_')\n",
" \n",
" data_root = 'D:/data/Human3.6M/downloaded/'\n",
" bb_path = os.path.join(data_root, sub, 'MySegmentsMat', 'ground_truth_bb', '%s.%s.mat' % (act, cam))\n",
" with h5py.File(bb_path, 'r') as file:\n",
" mask = np.asarray(file[file['Masks'][frame][0]]).transpose(1, 0)\n",
"\n",
" flatten = mask.flatten()\n",
" flatten = np.nonzero(flatten)[0]\n",
" ul, br = [flatten[where] for where in [0, -1]]\n",
" ul = Vector2(ul % mask.shape[1], ul // mask.shape[1])\n",
" br = Vector2(br % mask.shape[1], br // mask.shape[1])\n",
"\n",
" center = (ul + br) / 2\n",
" height = (br - ul).y\n",
" width = (br - ul).x\n",
" scale = max(height, width) / 200\n",
" \n",
" return center, scale"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def project(keypoints, f, c, k, p):\n",
" X = keypoints.transpose(1, 0) # Already in 3D pose\n",
" XX = np.divide(X[0:2, :], X[2, :])\n",
" r2 = np.power(XX[0, :], 2) + np.power(XX[1, :], 2)\n",
" radial = np.dot(k, np.asarray([r2, np.power(r2, 2), np.power(r2, 3)])) + 1\n",
" tan = p[0] * XX[1, :] + p[1] * XX[0, :]\n",
" temp = radial + tan\n",
" first = XX * np.stack([temp, temp])\n",
" second = np.expand_dims(np.asarray([p[1], p[0]]), axis=1) * np.expand_dims(r2, axis=0)\n",
" XXX = first + second\n",
" XXX = XXX.transpose(1, 0)\n",
" proj = f * XXX + c\n",
" \n",
" return proj"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def crop_image(image, center, scale, rotate, resolution):\n",
" center = Vector2(center) # assign new array\n",
" height, width, channel = image.shape\n",
" crop_ratio = 200 * scale / resolution\n",
" if crop_ratio >= 2: # if box size is greater than two time of resolution px\n",
" # scale down image\n",
" height = math.floor(height / crop_ratio)\n",
" width = math.floor(width / crop_ratio)\n",
"\n",
" if max([height, width]) < 2:\n",
" # Zoomed out so much that the image is now a single pixel or less\n",
" raise ValueError(\"Width or height is invalid!\")\n",
"\n",
" image = skimage.transform.resize(image, (height, width), mode='constant')\n",
"# image = image.resize(image, (height, width), mode='constant')\n",
" center /= crop_ratio\n",
" scale /= crop_ratio\n",
"\n",
" ul = (center - 200 * scale / 2).astype(int)\n",
" br = (center + 200 * scale / 2).astype(int) # Vector2\n",
"\n",
" if crop_ratio >= 2: # force image size 256 x 256\n",
" br -= (br - ul - resolution)\n",
"\n",
" pad_length = math.ceil((ul - br).length - (br.x - ul.x) / 2)\n",
"\n",
" if rotate != 0:\n",
" ul -= pad_length\n",
" br += pad_length\n",
"\n",
" src = [max(0, ul.y), min(height, br.y), max(0, ul.x), min(width, br.x)]\n",
" dst = [max(0, -ul.y), min(height, br.y) - ul.y, max(0, -ul.x), min(width, br.x) - ul.x]\n",
"\n",
" new_image = np.zeros([br.y - ul.y, br.x - ul.x, channel], dtype=np.float32)\n",
" new_image[dst[0]:dst[1], dst[2]:dst[3], :] = image[src[0]:src[1], src[2]:src[3], :]\n",
"\n",
" if rotate != 0:\n",
" new_image = skimage.transform.rotate(new_image, rotate)\n",
" new_height, new_width, _ = new_image.shape\n",
" new_image = new_image[pad_length:new_height - pad_length, pad_length:new_width - pad_length, :]\n",
"\n",
" if crop_ratio < 2:\n",
" new_image = skimage.transform.resize(new_image, (resolution, resolution), mode='constant')\n",
"# new_image = Image.resize(new_image, (resolution, resolution), mode='constant')\n",
"\n",
" return new_image\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pelvis = [1]\n",
"left_leg = [7, 8, 9]\n",
"right_leg = [2, 3, 4]\n",
"spine = [13, 14, 15, 16]\n",
"left_arm = [18, 19, 20]\n",
"right_arm = [26, 27, 28]\n",
"keypoints = pelvis + left_leg + right_leg + spine + left_arm + right_arm\n",
"\n",
"converted = dict()\n",
"converted ['S'] = list()\n",
"converted ['part'] = list()\n",
"converted ['center'] = list()\n",
"converted ['scale'] = list()\n",
"converted ['image'] = list()\n",
"\n",
"total = 0\n",
"\n",
"for subject in [1, 5, 6, 7, 8, ]:\n",
" for action in range(2, 16 + 1):\n",
" for sub_action in [1, 2]:\n",
" for camera in [1, 2, 3, 4]:\n",
"\n",
" # Data corrupted.\n",
" if subject == 11 and action == 2 and sub_action == 2 and camera == 1:\n",
" continue\n",
" \n",
" max_frame = get_max_frame(subject, action, sub_action)\n",
" total = total + max_frame//5\n",
" \n",
" \n",
"with tqdm(total=total) as progress:\n",
"\n",
" for subject in [1, 5, 6, 7, 8, ]:\n",
" for action in range(2, 16 + 1):\n",
" for sub_action in [1, 2]:\n",
" for camera in [1, 2, 3, 4]:\n",
"\n",
" progress.set_description('subject(%d) action(%d-%d) camera(%d)' % (subject, action, sub_action, camera))\n",
"\n",
" # Data corrupted.\n",
" if subject == 11 and action == 2 and sub_action == 2 and camera == 1:\n",
" continue\n",
"\n",
" max_frame = get_max_frame(subject, action, sub_action)\n",
"\n",
" video_name = get_video_name(subject, action, sub_action, camera)\n",
" sub = video_name.split('/')[-3].split('\\\\')[0]\n",
" act, cam = video_name.split('/')[-1].split('.mp4')[0].split('.')\n",
" \n",
" data_root = 'D:/data/Human3.6M/downloaded/'\n",
" bb_path = os.path.join(data_root, sub, 'MySegmentsMat', 'ground_truth_bb', '%s.%s.mat' % (act, cam))\n",
" \n",
" act = act.replace(' ', '_')\n",
" video_name = '%s_%s.%s' % (sub, act, cam)\n",
" \n",
" with h5py.File(bb_path, 'r') as file:\n",
"\n",
" for frame in range(1, max_frame+1, 5):\n",
" mask = np.asarray(file[file['Masks'][frame-1][0]]).transpose(1, 0)\n",
"\n",
" flatten = mask.flatten()\n",
" flatten = np.nonzero(flatten)[0]\n",
" ul, br = [flatten[where] for where in [0, -1]]\n",
" ul = Vector2(ul % mask.shape[1], ul // mask.shape[1])\n",
" br = Vector2(br % mask.shape[1], br // mask.shape[1])\n",
"\n",
" center = (ul + br) / 2 # center\n",
" height = (br - ul).y\n",
" width = (br - ul).x\n",
" scale = max(height, width) / 200 # scale\n",
" \n",
" # center, scale = get_center_scale(subject, action, sub_action, camera, frame) # center, scale\n",
" in_image_space, in_camera_space = get_pose(subject, action, sub_action, camera, frame) # part, S\n",
"\n",
" converted ['S'].append(np.reshape([in_camera_space[idx-1] for idx in keypoints], (-1, 3)))\n",
" converted ['part'].append(np.reshape([in_image_space[idx-1] for idx in keypoints], (-1, 2)))\n",
" converted ['center'].append(center)\n",
" converted ['scale'].append(scale)\n",
" converted ['image'].append('%s_%06d.jpg' % (video_name, frame))\n",
"\n",
" progress.update(1)\n",
"\n",
"pickle.dump(converted, open('train.bin', 'wb'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pelvis = [1]\n",
"left_leg = [7, 8, 9]\n",
"right_leg = [2, 3, 4]\n",
"spine = [13, 14, 15, 16]\n",
"left_arm = [18, 19, 20]\n",
"right_arm = [26, 27, 28]\n",
"keypoints = pelvis + left_leg + right_leg + spine + left_arm + right_arm\n",
"\n",
"converted = dict()\n",
"converted ['S'] = list()\n",
"converted ['part'] = list()\n",
"converted ['center'] = list()\n",
"converted ['scale'] = list()\n",
"converted ['image'] = list()\n",
"\n",
"total = 0\n",
"\n",
"for subject in [9. 11, ]:\n",
" for action in range(2, 16 + 1):\n",
" for sub_action in [1, 2]:\n",
" for camera in [1, 2, 3, 4]:\n",
"\n",
" # Data corrupted.\n",
" if subject == 11 and action == 2 and sub_action == 2 and camera == 1:\n",
" continue\n",
" \n",
" max_frame = get_max_frame(subject, action, sub_action)\n",
" total = total + max_frame//5\n",
" \n",
" \n",
"with tqdm(total=total) as progress:\n",
"\n",
" for subject in [9, 11, ]:\n",
" for action in range(2, 16 + 1):\n",
" for sub_action in [1, 2]:\n",
" for camera in [1, 2, 3, 4]:\n",
"\n",
" progress.set_description('subject(%d) action(%d-%d) camera(%d)' % (subject, action, sub_action, camera))\n",
"\n",
" # Data corrupted.\n",
" if subject == 11 and action == 2 and sub_action == 2 and camera == 1:\n",
" continue\n",
"\n",
" max_frame = get_max_frame(subject, action, sub_action)\n",
"\n",
" video_name = get_video_name(subject, action, sub_action, camera)\n",
" sub = video_name.split('/')[-3].split('\\\\')[0]\n",
" act, cam = video_name.split('/')[-1].split('.mp4')[0].split('.')\n",
" \n",
" data_root = 'D:/data/Human3.6M/downloaded/'\n",
" bb_path = os.path.join(data_root, sub, 'MySegmentsMat', 'ground_truth_bb', '%s.%s.mat' % (act, cam))\n",
" \n",
" act = act.replace(' ', '_')\n",
" video_name = '%s_%s.%s' % (sub, act, cam)\n",
" \n",
" with h5py.File(bb_path, 'r') as file:\n",
"\n",
" for frame in range(1, max_frame+1, 5):\n",
" mask = np.asarray(file[file['Masks'][frame-1][0]]).transpose(1, 0)\n",
"\n",
" flatten = mask.flatten()\n",
" flatten = np.nonzero(flatten)[0]\n",
" ul, br = [flatten[where] for where in [0, -1]]\n",
" ul = Vector2(ul % mask.shape[1], ul // mask.shape[1])\n",
" br = Vector2(br % mask.shape[1], br // mask.shape[1])\n",
"\n",
" center = (ul + br) / 2 # center\n",
" height = (br - ul).y\n",
" width = (br - ul).x\n",
" scale = max(height, width) / 200 # scale\n",
" \n",
" # center, scale = get_center_scale(subject, action, sub_action, camera, frame) # center, scale\n",
" in_image_space, in_camera_space = get_pose(subject, action, sub_action, camera, frame) # part, S\n",
"\n",
" converted ['S'].append(np.reshape([in_camera_space[idx-1] for idx in keypoints], (-1, 3)))\n",
" converted ['part'].append(np.reshape([in_image_space[idx-1] for idx in keypoints], (-1, 2)))\n",
" converted ['center'].append(center)\n",
" converted ['scale'].append(scale)\n",
" converted ['image'].append('%s_%06d.jpg' % (video_name, frame))\n",
"\n",
" progress.update(1)\n",
"\n",
"pickle.dump(converted, open('valid.bin', 'wb'))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@nulledge (Author):

Notes: the Human3.6M dataset is not entirely accurate.

  1. The 'Monocular' mode of the H36MCamera class returns the average intrinsics and extrinsics of the 4 cameras, so data captured from different cameras are treated as if they came from the same camera (see the sketch below).
  2. According to the README, the TOF data use the camera parameters of the RGB camera. You can calculate the parameters yourself, but they may not be accurate.
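
For reference, a minimal sketch (not part of the notebook, and assuming the MATLAB engine and the helpers above, get_intrinsics, get_pose and project, are already loaded) that reprojects the 3D camera-space pose with the per-camera intrinsics and compares it against the 2D ground truth:

```python
# Hypothetical spot check: reproject the 3D pose with the per-camera intrinsics
# and measure the per-joint distance to the 2D ground truth, in pixels.
subject, action, sub_action, camera, frame = 1, 2, 1, 1, 1  # arbitrary example indices

f, c, k, p = get_intrinsics(subject, action, sub_action, camera)
pose_2d, pose_3d = get_pose(subject, action, sub_action, camera, frame)

reprojected = project(pose_3d, f, c, k, p)             # (32, 2) pixel coordinates
error = np.linalg.norm(reprojected - pose_2d, axis=1)  # per-joint error in pixels
print('mean reprojection error: %.2f px' % error.mean())
```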
