# eldog/mediapipe_hands_world_space.py

## mediapipe_hands_world_space.py
# MIT License
#
# Copyright (c) 2023 Foxdog Studios
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


#
#  Projects hand points into 3D space. See https://github.com/google/mediapipe/issues/2199 for full discussion
#
#  Grab the ball between your index finger and thumb.
#
#  Requires a webcam.
#
#  Requirements as suggested by @Legel:
#
#  pip install opencv-python
#  pip install pygame
#  pip install mediapipe (on Macs with an M1 chip: pip install mediapipe-silicon)
#  pip install PyOpenGL
#  pip install Pillow (imported below as PIL)
#

import cv2
import pygame
import math
import mediapipe as mp
import numpy as np
from PIL import Image
from pygame.locals import *

from OpenGL.GL import *
from OpenGL.GLU import *
from OpenGL.GLUT import *

# Short aliases for the MediaPipe solution modules used below: landmark
# drawing helpers, their default styles, and the hand-tracking solution.
mp_drawing = mp.solutions.drawing_utils # type:ignore
mp_drawing_styles = mp.solutions.drawing_styles #type:ignore
mp_hands = mp.solutions.hands # type:ignore


# Pairs of hand-landmark indices that draw_hand() joins with a line segment.
# Every undirected edge appears exactly once: the table used to contain both
# (0, 1) and (1, 0), which only made the same segment get drawn twice.
# Finger grouping follows the MediaPipe hand landmark numbering (0 = wrist,
# 4 = thumb tip, 8 = index finger tip, ...).
hand_edges = (
    # Thumb
    (0, 1),
    (1, 2),
    (2, 3),
    (3, 4),
    # Index finger
    (0, 5),
    (5, 6),
    (6, 7),
    (7, 8),
    # Middle finger
    (5, 9),
    (9, 10),
    (10, 11),
    (11, 12),
    # Ring finger
    (9, 13),
    (13, 14),
    (14, 15),
    (15, 16),
    # Little finger
    (13, 17),
    (17, 18),
    (18, 19),
    (19, 20),
    # Outer edge of the palm
    (0, 17),
)

def draw_hand(world_points):
    """Draw one hand as a line skeleton with a small sphere on each point.

    Args:
        world_points: Indexable collection of points, one per hand landmark;
            only components 0, 1 and 2 (x, y, z) of each point are read.
            The x component is negated before it is handed to OpenGL.

    The modelview matrix is reset to identity before the skeleton is drawn
    and again before every sphere, so whatever transform was active on entry
    is discarded.
    """
    glLineWidth(5)
    glLoadIdentity()

    # Skeleton: each entry of hand_edges contributes one GL_LINES segment.
    glBegin(GL_LINES)
    for start, end in hand_edges:
        for joint in (world_points[start], world_points[end]):
            glVertex3fv((-joint[0], joint[1], joint[2]))
    glEnd()

    # Joints: a green sphere of diameter 0.01 centred on every point. The
    # material change is scoped with push/pop of the lighting attributes.
    joint_radius = 0.01 / 2
    for joint in world_points:
        glPushAttrib(GL_LIGHTING_BIT)
        glMaterialfv(GL_FRONT, GL_DIFFUSE, [0, 1, 0, 0.5])
        glLoadIdentity()
        glTranslatef(-joint[0], joint[1], joint[2])
        glutSolidSphere(joint_radius, 16, 16)
        glPopAttrib()

class ImageLoader:
    """Uploads an image into an OpenGL texture and draws it as a quad.

    Attributes set by this class:
        x, y: Offset applied with glTranslate before the quad is drawn.
        width, height: Pixel size of the most recently loaded image
            (0 until load() has been called).
        img_data: Raw bytes last uploaded to the texture (0 until load()).
        Texture: Texture name obtained from glGenTextures.
    """

    def __init__(self, x: float, y: float):
        """Store the draw offset and allocate one texture name.

        Calls glGenTextures, so an OpenGL context must already exist.
        """
        self.x = x
        self.y = y
        self.width = 0
        self.height = 0
        self.img_data = 0
        self.Texture = glGenTextures(1)

    def load(self, image: cv2.Mat):
        """Upload ``image`` as the contents of this loader's texture.

        The array is flipped with ``cv2.flip(image, 0)``, wrapped in a PIL
        image, and serialised with Pillow's raw encoder using the 'BGRX'
        layout (stride 0, orientation -1) before being passed to
        glTexImage2D as GL_RGBA / GL_UNSIGNED_BYTE data.
        NOTE(review): presumably ``image`` is an 8-bit, 3-channel BGR frame
        as produced by OpenCV - confirm against the caller.
        """
        tx_image = Image.fromarray(cv2.flip(image, 0))
        self.width = tx_image.size[0]
        self.height = tx_image.size[1]
        self.img_data = tx_image.tobytes('raw', 'BGRX', 0, -1)

        glBindTexture(GL_TEXTURE_2D, self.Texture)
        # Repeat wrapping on both axes, linear filtering both ways.
        for parameter, value in (
            (GL_TEXTURE_WRAP_S, GL_REPEAT),
            (GL_TEXTURE_WRAP_T, GL_REPEAT),
            (GL_TEXTURE_MIN_FILTER, GL_LINEAR),
            (GL_TEXTURE_MAG_FILTER, GL_LINEAR),
        ):
            glTexParameteri(GL_TEXTURE_2D, parameter, value)
        glTexImage2D(
            GL_TEXTURE_2D, 0, GL_RGBA, self.width, self.height, 0,
            GL_RGBA, GL_UNSIGNED_BYTE, self.img_data,
        )

    def draw(self):
        """Draw a textured quad of size (width, height) at offset (x, y).

        Switches to the modelview matrix and resets it to identity first;
        texturing is enabled for the quad and disabled again afterwards.
        """
        glMatrixMode(GL_MODELVIEW)
        glLoadIdentity()
        glTranslate(self.x, self.y, 0)
        glEnable(GL_TEXTURE_2D)
        glBegin(GL_QUADS)
        # Texture corner (u, v) is pinned to quad corner (u*width, v*height).
        for u, v in ((0, 0), (1, 0), (1, 1), (0, 1)):
            glTexCoord2f(u, v)
            glVertex2f(u * self.width, v * self.height)
        glEnd()
        glDisable(GL_TEXTURE_2D)

# Ball state shared across frames. The ball starts 0.4 units in front of the
# camera (OpenGL's camera looks down -Z); its position is compared directly
# with the transformed hand points computed in the loop below.
ball_pos_start = [0, 0, -0.4]
ball_pos = list(ball_pos_start)
ball_grabbed = False

# Main program: read webcam frames, track hands with MediaPipe, place each
# hand in camera space with solvePnP, and render the camera image, the hands
# and a grabbable ball with OpenGL.
#
# Keys: "m" toggles MediaPipe's own 2D landmark overlay, "b" puts the ball
# back at its start position.
with mp_hands.Hands(
        model_complexity=0,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5) as hands:
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        # Without a camera every read fails and the loop below would spin
        # forever printing "Ignoring empty camera frame."
        raise SystemExit('Could not open webcam 0.')

    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        pygame.init()
        display = (width, height)
        pygame.display.set_mode(display, DOUBLEBUF | OPENGL | RESIZABLE)
        glutInit()

        im_loader = ImageLoader(0, 0)

        draw_mediapipe = False

        while True:
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    # Cleanup happens in the finally clause below.
                    raise SystemExit
                if event.type == pygame.KEYDOWN:
                    if event.key == pygame.K_m:
                        draw_mediapipe = not draw_mediapipe
                        print(f'toggling draw media pipe now: {draw_mediapipe}')
                    if event.key == pygame.K_b:
                        ball_pos = list(ball_pos_start)
                        # Also let go, otherwise a pinching hand would pull
                        # the ball straight back on the next frame.
                        ball_grabbed = False

            glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)

            # Pixel-space orthographic projection (origin at the top-left)
            # used for the camera image background.
            glMatrixMode(GL_PROJECTION)
            glLoadIdentity()
            gluOrtho2D(0, width, height, 0)
            glMatrixMode(GL_MODELVIEW)
            glLoadIdentity()
            glEnable(GL_DEPTH_TEST)
            glEnable(GL_TEXTURE_2D)
            glEnable(GL_LIGHTING)
            glLightfv(GL_LIGHT0, GL_DIFFUSE, [1, 1, 1, 1])
            glEnable(GL_LIGHT0)

            success, image = cap.read()
            if not success:
                # Check before touching `image`: on a failed read there is no
                # frame to flip or convert.
                print("Ignoring empty camera frame.")
                # If loading a video, use 'break' instead of 'continue'.
                continue

            # Mirror the frame horizontally and convert to RGB for MediaPipe.
            image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)

            # To improve performance, mark the image as not writeable so it
            # can be passed by reference. This has to be done on the array
            # that is actually handed to process().
            image.flags.writeable = False
            results = hands.process(image) # type:ignore

            # Back to a writeable BGR image for annotation and display.
            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

            frame_height, frame_width = image.shape[:2]

            # Fiddle with this number to get the camera image
            # hands to align with the mediapipe points. Unless
            # you know your camera's focal length, then put in
            # here.
            focal_length = frame_width * 0.75
            center = (frame_width / 2, frame_height / 2)
            camera_matrix = np.array(
                [[focal_length, 0, center[0]],
                 [0, focal_length, center[1]],
                 [0, 0, 1]],
                dtype="double",
            )
            distortion = np.zeros((4, 1))
            # gluPerspective expects the *vertical* field of view, which for
            # a pinhole camera is 2 * atan(frame_height / (2 * f)). The old
            # expression 2 * atan(f / (2 * f)) was a constant ~53.13 degrees
            # and only matched this for 4:3 frames.
            fov_y = np.rad2deg(2 * np.arctan2(frame_height, 2 * focal_length))

            world_points_total = []
            if results.multi_hand_landmarks:
                hand_pairs = zip(results.multi_hand_landmarks,
                                 results.multi_hand_world_landmarks)
                for hand_landmarks, world_landmarks in hand_pairs:
                    if draw_mediapipe:
                        mp_drawing.draw_landmarks(
                            image,
                            hand_landmarks,
                            mp_hands.HAND_CONNECTIONS,
                            mp_drawing_styles.get_default_hand_landmarks_style(),
                            mp_drawing_styles.get_default_hand_connections_style())

                    model_points = np.float32(
                        [[-lm.x, -lm.y, -lm.z]
                         for lm in world_landmarks.landmark])
                    image_points = np.float32(
                        [[lm.x * frame_width, lm.y * frame_height]
                         for lm in hand_landmarks.landmark])
                    solved, _rvecs, tvecs = cv2.solvePnP(
                        model_points,
                        image_points,
                        camera_matrix,
                        distortion,
                        flags=cv2.SOLVEPNP_SQPNP,
                    )
                    if not solved:
                        # No pose estimate for this hand in this frame.
                        continue

                    # needs to be 4x4 because you have to use homogeneous
                    # coordinates
                    transformation = np.eye(4)
                    transformation[0:3, 3] = tvecs.squeeze()
                    # the transformation consists only of the translation,
                    # because the rotation is accounted for in the model
                    # coordinates. Take a look at
                    # https://codepen.io/mediapipe/pen/RwGWYJw to see how the
                    # model coordinates behave - the hand rotates, but
                    # doesn't translate

                    # transform model coordinates into homogeneous coordinates
                    model_points_hom = np.concatenate(
                        (model_points, np.ones((len(model_points), 1))),
                        axis=1)

                    # apply the transformation
                    world_points = model_points_hom.dot(
                        np.linalg.inv(transformation).T)
                    world_points_total.append(world_points)

            # Camera image as background; depth writes are off so it never
            # hides the 3D geometry drawn afterwards.
            glDepthMask(GL_FALSE)
            im_loader.load(image)
            glColor3f(1, 1, 1)
            im_loader.draw()
            glDepthMask(GL_TRUE)

            # Perspective projection for the ball and the hands.
            glMatrixMode(GL_PROJECTION)
            glLoadIdentity()
            gluPerspective(fov_y, (display[0] / display[1]), 0.1, 50.0)
            glMatrixMode(GL_MODELVIEW)
            glLoadIdentity()
            glTranslatef(*ball_pos)
            glPushAttrib(GL_LIGHTING_BIT)
            glMaterialfv(GL_FRONT, GL_DIFFUSE, [1, 0, 0, 0.5])
            glutSolidSphere(0.07 / 2, 16, 16)
            glPopAttrib()

            if world_points_total:
                glLoadIdentity()
                grab_distance_closest = math.inf
                thumb_ball_distance_closest = math.inf
                thumb_closest = None
                for world_points in world_points_total:
                    draw_hand(world_points)

                    # Landmark 4 is used as the thumb, 8 as the index finger.
                    thumb = world_points[4]
                    index = world_points[8]
                    grab_distance = math.hypot(
                        thumb[0] - index[0],
                        thumb[1] - index[1],
                        thumb[2] - index[2],
                    )
                    # x is negated to match how draw_hand() renders points.
                    thumb_ball_distance = math.hypot(
                        ball_pos[0] + thumb[0],
                        ball_pos[1] - thumb[1],
                        ball_pos[2] - thumb[2],
                    )
                    # Only the hand whose thumb is nearest the ball can grab.
                    if thumb_ball_distance < thumb_ball_distance_closest:
                        thumb_closest = thumb
                        thumb_ball_distance_closest = thumb_ball_distance
                        grab_distance_closest = grab_distance

                if thumb_closest is not None:
                    pinching = grab_distance_closest < 0.08
                    near_ball = thumb_ball_distance_closest < 0.1
                    # A grab starts with a pinch near the ball and lasts for
                    # as long as the pinch is held.
                    if pinching and (near_ball or ball_grabbed):
                        ball_grabbed = True
                        ball_pos[0] = -thumb_closest[0]
                        ball_pos[1] = thumb_closest[1]
                        ball_pos[2] = thumb_closest[2]
                    else:
                        ball_grabbed = False
            else:
                # No hand in view: drop the ball rather than keeping a stale
                # grab that would teleport it to the next pinch anywhere.
                ball_grabbed = False

            pygame.display.flip()
    finally:
        cap.release()
        pygame.quit()
	# MIT License
	#
	# Copyright (c) 2023 Foxdog Studios
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.


	#
	# Projects hand points into 3D space. See https://github.com/google/mediapipe/issues/2199 for full discussion
	#
	# Grab the ball between your index finger and thumb.
	#
	# Requires a webcam.
	#
	# Requirements as suggested by @Legel:
	#
	# pip install opencv-python
	# pip install pygame
	# pip install mediapipe or for Mac's with M1 pip install mediapipe-silicon
	# pip install PyOpenGL
	#

	import cv2
	import pygame
	import math
	import mediapipe as mp
	import numpy as np
	from PIL import Image
	from pygame.locals import *

	from OpenGL.GL import *
	from OpenGL.GLU import *
	from OpenGL.GLUT import *

	# Short aliases for the MediaPipe solution modules used below: landmark
	# drawing helpers, their default styles, and the hand-tracking solution.
	mp_drawing = mp.solutions.drawing_utils # type:ignore
	mp_drawing_styles = mp.solutions.drawing_styles #type:ignore
	mp_hands = mp.solutions.hands # type:ignore


	# Pairs of hand-landmark indices that draw_hand() joins with a line segment.
	# Every undirected edge appears exactly once: the table used to contain both
	# (0, 1) and (1, 0), which only made the same segment get drawn twice.
	# Finger grouping follows the MediaPipe hand landmark numbering (0 = wrist,
	# 4 = thumb tip, 8 = index finger tip, ...).
	hand_edges = (
		# Thumb
		(0, 1),
		(1, 2),
		(2, 3),
		(3, 4),
		# Index finger
		(0, 5),
		(5, 6),
		(6, 7),
		(7, 8),
		# Middle finger
		(5, 9),
		(9, 10),
		(10, 11),
		(11, 12),
		# Ring finger
		(9, 13),
		(13, 14),
		(14, 15),
		(15, 16),
		# Little finger
		(13, 17),
		(17, 18),
		(18, 19),
		(19, 20),
		# Outer edge of the palm
		(0, 17),
	)

	def draw_hand(world_points):
		"""Draw one hand as a line skeleton with a small sphere on each point.

		Args:
			world_points: Indexable collection of points, one per hand
				landmark; only components 0, 1 and 2 (x, y, z) of each point
				are read. The x component is negated before it is handed to
				OpenGL.

		The modelview matrix is reset to identity before the skeleton is
		drawn and again before every sphere, so whatever transform was
		active on entry is discarded.
		"""
		glLineWidth(5)
		glLoadIdentity()

		# Skeleton: each entry of hand_edges contributes one GL_LINES segment.
		glBegin(GL_LINES)
		for edge in hand_edges:
			for vertex in edge:
				p = world_points[vertex]
				glVertex3fv((-p[0], p[1], p[2]))
		glEnd()

		# Joints: a green sphere of diameter 0.01 centred on every point. The
		# material change is scoped with push/pop of the lighting attributes.
		for p in world_points:
			glPushAttrib(GL_LIGHTING_BIT)
			glMaterialfv(GL_FRONT, GL_DIFFUSE, [0, 1, 0, 0.5])
			glLoadIdentity()
			glTranslatef(-p[0], p[1], p[2])
			glutSolidSphere(0.01 / 2, 16, 16)
			glPopAttrib()

	class ImageLoader:
		"""Uploads an image into an OpenGL texture and draws it as a quad.

		Attributes set by this class:
			x, y: Offset applied with glTranslate before the quad is drawn.
			width, height: Pixel size of the most recently loaded image
				(0 until load() has been called).
			img_data: Raw bytes last uploaded to the texture (0 until load()).
			Texture: Texture name obtained from glGenTextures.
		"""

		def __init__(self, x: float, y: float):
			"""Store the draw offset and allocate one texture name.

			Calls glGenTextures, so an OpenGL context must already exist.
			"""
			self.x = x
			self.y = y
			self.width = 0
			self.height = 0
			self.img_data = 0
			self.Texture = glGenTextures(1)

		def load(self, image: cv2.Mat):
			"""Upload ``image`` as the contents of this loader's texture.

			The array is flipped with ``cv2.flip(image, 0)``, wrapped in a
			PIL image, and serialised with Pillow's raw encoder using the
			'BGRX' layout (stride 0, orientation -1) before being passed to
			glTexImage2D as GL_RGBA / GL_UNSIGNED_BYTE data.
			NOTE(review): presumably ``image`` is an 8-bit, 3-channel BGR
			frame as produced by OpenCV - confirm against the caller.
			"""
			im = image
			tx_image = cv2.flip(im, 0)
			tx_image = Image.fromarray(tx_image)
			self.width = tx_image.size[0]
			self.height = tx_image.size[1]
			self.img_data = tx_image.tobytes('raw', 'BGRX', 0, -1)

			glBindTexture(GL_TEXTURE_2D, self.Texture)
			# Repeat wrapping on both axes, linear filtering both ways.
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT)
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT)
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR)
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR)
			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, self.width, self.height, 0, GL_RGBA, GL_UNSIGNED_BYTE, self.img_data)

		def draw(self):
			"""Draw a textured quad of size (width, height) at offset (x, y).

			Switches to the modelview matrix and resets it to identity
			first; texturing is enabled for the quad and disabled again
			afterwards.
			"""
			glMatrixMode(GL_MODELVIEW)
			glLoadIdentity()
			glTranslate(self.x, self.y, 0)
			glEnable(GL_TEXTURE_2D)
			glBegin(GL_QUADS)
			glTexCoord2f(0, 0)
			glVertex2f(0, 0)
			glTexCoord2f(1, 0)
			glVertex2f(self.width, 0)
			glTexCoord2f(1, 1)
			glVertex2f(self.width, self.height)
			glTexCoord2f(0, 1)
			glVertex2f(0, self.height)
			glEnd()
			glDisable(GL_TEXTURE_2D)

	# Ball state shared across frames. The ball starts 0.4 units in front of the
	# camera (OpenGL's camera looks down -Z); its position is compared directly
	# with the transformed hand points computed in the loop below.
	ball_pos_start = [0, 0, -0.4]
	ball_pos = list(ball_pos_start)
	ball_grabbed = False

	# Main program: read webcam frames, track hands with MediaPipe, place each
	# hand in camera space with solvePnP, and render the camera image, the hands
	# and a grabbable ball with OpenGL.
	#
	# Keys: "m" toggles MediaPipe's own 2D landmark overlay, "b" puts the ball
	# back at its start position.
	with mp_hands.Hands(
			model_complexity=0,
			min_detection_confidence=0.5,
			min_tracking_confidence=0.5) as hands:
		cap = cv2.VideoCapture(0)
		if not cap.isOpened():
			# Without a camera every read fails and the loop below would spin
			# forever printing "Ignoring empty camera frame."
			raise SystemExit('Could not open webcam 0.')

		try:
			width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
			height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
			pygame.init()
			display = (width, height)
			pygame.display.set_mode(display, DOUBLEBUF | OPENGL | RESIZABLE)
			glutInit()

			im_loader = ImageLoader(0, 0)

			draw_mediapipe = False

			while True:
				for event in pygame.event.get():
					if event.type == pygame.QUIT:
						# Cleanup happens in the finally clause below.
						raise SystemExit
					if event.type == pygame.KEYDOWN:
						if event.key == pygame.K_m:
							draw_mediapipe = not draw_mediapipe
							print(f'toggling draw media pipe now: {draw_mediapipe}')
						if event.key == pygame.K_b:
							ball_pos = list(ball_pos_start)
							# Also let go, otherwise a pinching hand would
							# pull the ball straight back on the next frame.
							ball_grabbed = False

				glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)

				# Pixel-space orthographic projection (origin at the top-left)
				# used for the camera image background.
				glMatrixMode(GL_PROJECTION)
				glLoadIdentity()
				gluOrtho2D(0, width, height, 0)
				glMatrixMode(GL_MODELVIEW)
				glLoadIdentity()
				glEnable(GL_DEPTH_TEST)
				glEnable(GL_TEXTURE_2D)
				glEnable(GL_LIGHTING)
				glLightfv(GL_LIGHT0, GL_DIFFUSE, [1, 1, 1, 1])
				glEnable(GL_LIGHT0)

				success, image = cap.read()
				if not success:
					# Check before touching `image`: on a failed read there is
					# no frame to flip or convert.
					print("Ignoring empty camera frame.")
					# If loading a video, use 'break' instead of 'continue'.
					continue

				# Mirror the frame horizontally and convert to RGB for MediaPipe.
				image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)

				# To improve performance, mark the image as not writeable so it
				# can be passed by reference. This has to be done on the array
				# that is actually handed to process().
				image.flags.writeable = False
				results = hands.process(image) # type:ignore

				# Back to a writeable BGR image for annotation and display.
				image.flags.writeable = True
				image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

				frame_height, frame_width = image.shape[:2]

				# Fiddle with this number to get the camera image
				# hands to align with the mediapipe points. Unless
				# you know your camera's focal length, then put in
				# here.
				focal_length = frame_width * 0.75
				center = (frame_width / 2, frame_height / 2)
				camera_matrix = np.array(
					[[focal_length, 0, center[0]],
					 [0, focal_length, center[1]],
					 [0, 0, 1]],
					dtype="double",
				)
				distortion = np.zeros((4, 1))
				# gluPerspective expects the *vertical* field of view, which
				# for a pinhole camera is 2 * atan(frame_height / (2 * f)).
				# The old expression 2 * atan(f / (2 * f)) was a constant
				# ~53.13 degrees and only matched this for 4:3 frames.
				fov_y = np.rad2deg(2 * np.arctan2(frame_height, 2 * focal_length))

				world_points_total = []
				if results.multi_hand_landmarks:
					hand_pairs = zip(results.multi_hand_landmarks,
							results.multi_hand_world_landmarks)
					for hand_landmarks, world_landmarks in hand_pairs:
						if draw_mediapipe:
							mp_drawing.draw_landmarks(
								image,
								hand_landmarks,
								mp_hands.HAND_CONNECTIONS,
								mp_drawing_styles.get_default_hand_landmarks_style(),
								mp_drawing_styles.get_default_hand_connections_style())

						model_points = np.float32(
							[[-lm.x, -lm.y, -lm.z]
							 for lm in world_landmarks.landmark])
						image_points = np.float32(
							[[lm.x * frame_width, lm.y * frame_height]
							 for lm in hand_landmarks.landmark])
						solved, _rvecs, tvecs = cv2.solvePnP(
							model_points,
							image_points,
							camera_matrix,
							distortion,
							flags=cv2.SOLVEPNP_SQPNP,
						)
						if not solved:
							# No pose estimate for this hand in this frame.
							continue

						# needs to be 4x4 because you have to use homogeneous
						# coordinates
						transformation = np.eye(4)
						transformation[0:3, 3] = tvecs.squeeze()
						# the transformation consists only of the translation,
						# because the rotation is accounted for in the model
						# coordinates. Take a look at
						# https://codepen.io/mediapipe/pen/RwGWYJw to see how
						# the model coordinates behave - the hand rotates, but
						# doesn't translate

						# transform model coordinates into homogeneous coordinates
						model_points_hom = np.concatenate(
							(model_points, np.ones((len(model_points), 1))),
							axis=1)

						# apply the transformation
						world_points = model_points_hom.dot(
							np.linalg.inv(transformation).T)
						world_points_total.append(world_points)

				# Camera image as background; depth writes are off so it never
				# hides the 3D geometry drawn afterwards.
				glDepthMask(GL_FALSE)
				im_loader.load(image)
				glColor3f(1, 1, 1)
				im_loader.draw()
				glDepthMask(GL_TRUE)

				# Perspective projection for the ball and the hands.
				glMatrixMode(GL_PROJECTION)
				glLoadIdentity()
				gluPerspective(fov_y, (display[0] / display[1]), 0.1, 50.0)
				glMatrixMode(GL_MODELVIEW)
				glLoadIdentity()
				glTranslatef(*ball_pos)
				glPushAttrib(GL_LIGHTING_BIT)
				glMaterialfv(GL_FRONT, GL_DIFFUSE, [1, 0, 0, 0.5])
				glutSolidSphere(0.07 / 2, 16, 16)
				glPopAttrib()

				if world_points_total:
					glLoadIdentity()
					grab_distance_closest = math.inf
					thumb_ball_distance_closest = math.inf
					thumb_closest = None
					for world_points in world_points_total:
						draw_hand(world_points)

						# Landmark 4 is used as the thumb, 8 as the index finger.
						thumb = world_points[4]
						index = world_points[8]
						grab_distance = math.hypot(
							thumb[0] - index[0],
							thumb[1] - index[1],
							thumb[2] - index[2],
						)
						# x is negated to match how draw_hand() renders points.
						thumb_ball_distance = math.hypot(
							ball_pos[0] + thumb[0],
							ball_pos[1] - thumb[1],
							ball_pos[2] - thumb[2],
						)
						# Only the hand whose thumb is nearest the ball can grab.
						if thumb_ball_distance < thumb_ball_distance_closest:
							thumb_closest = thumb
							thumb_ball_distance_closest = thumb_ball_distance
							grab_distance_closest = grab_distance

					if thumb_closest is not None:
						pinching = grab_distance_closest < 0.08
						near_ball = thumb_ball_distance_closest < 0.1
						# A grab starts with a pinch near the ball and lasts
						# for as long as the pinch is held.
						if pinching and (near_ball or ball_grabbed):
							ball_grabbed = True
							ball_pos[0] = -thumb_closest[0]
							ball_pos[1] = thumb_closest[1]
							ball_pos[2] = thumb_closest[2]
						else:
							ball_grabbed = False
				else:
					# No hand in view: drop the ball rather than keeping a
					# stale grab that would teleport it to the next pinch.
					ball_grabbed = False

				pygame.display.flip()
		finally:
			cap.release()
			pygame.quit()