Skip to content

Instantly share code, notes, and snippets.

@TheJLifeX
Last active August 15, 2024 10:33
Simple Hand Gesture Recognition Code - Hand tracking - Mediapipe

Simple Hand Gesture Recognition Code - Hand tracking - Mediapipe

The goal of this gist is to recognize ONE, TWO, THREE, FOUR, FIVE, SIX, YEAH, ROCK, SPIDERMAN and OK. We use the LANDMARKS output of the LandmarkLetterboxRemovalCalculator. This output is a landmark list that contains 21 landmarks. In the 02-landmarks.jpg picture below you can see the index of each landmark. Each landmark has x, y and z values, but only the x and y values are needed for our goal. If you don't want to copy/paste all the code in this gist, you can clone my forked version of mediapipe here: https://github.com/TheJLifeX/mediapipe. I have already committed all the code in that repository.

We have five finger states.

  1. thumbIsOpen
  2. firstFingerIsOpen
  3. secondFingerIsOpen
  4. thirdFingerIsOpen
  5. fourthFingerIsOpen

For example: the thumb is open if the x values of landmark 3 and landmark 4 are both less than the x value of landmark 2; otherwise it is closed.

PS: the thumb open/closed detection works only for the right hand, because we cannot yet determine whether you are showing your left or your right hand. For more info see this issue: Can palm_detection distinguish between right and left hand?

Prerequisite: You know how to run the hand tracking example.

  1. Get Started with mediapipe
  2. Hand Tracking on Desktop

If you want to know how to recognize some simple hand movements like scrolling, zooming in/out and sliding left/right (see comment below), you can read this gist: Simple Hand Mouvement Recognition Code.

#include <cmath>
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/landmark.pb.h"
#include "mediapipe/framework/formats/rect.pb.h"
namespace mediapipe
{
namespace
{
// Tag of the input stream that carries the hand landmarks
// (a NormalizedLandmarkList, as declared in GetContract).
constexpr char normalizedLandmarkListTag[] = "NORM_LANDMARKS";

// Tag of the input stream that carries the hand rectangle
// (a NormalizedRect, as declared in GetContract).
constexpr char normRectTag[] = "NORM_RECT";
} // namespace
// Graph config:
//
// node {
// calculator: "HandGestureRecognitionCalculator"
// input_stream: "NORM_LANDMARKS:scaled_landmarks"
// input_stream: "NORM_RECT:hand_rect_for_next_frame"
// }
class HandGestureRecognitionCalculator : public CalculatorBase
{
public:
static ::mediapipe::Status GetContract(CalculatorContract *cc);
::mediapipe::Status Open(CalculatorContext *cc) override;
::mediapipe::Status Process(CalculatorContext *cc) override;
private:
float get_Euclidean_DistanceAB(float a_x, float a_y, float b_x, float b_y)
{
float dist = std::pow(a_x - b_x, 2) + pow(a_y - b_y, 2);
return std::sqrt(dist);
}
bool isThumbNearFirstFinger(NormalizedLandmark point1, NormalizedLandmark point2)
{
float distance = this->get_Euclidean_DistanceAB(point1.x(), point1.y(), point2.x(), point2.y());
return distance < 0.1;
}
};
REGISTER_CALCULATOR(HandGestureRecognitionCalculator);
// Declares the calculator's contract: both tagged input streams must be
// present. The landmark stream is typed as NormalizedLandmarkList and the
// rectangle stream as NormalizedRect. No output streams are declared.
::mediapipe::Status HandGestureRecognitionCalculator::GetContract(
    CalculatorContract *cc)
{
    // Hand landmarks input.
    RET_CHECK(cc->Inputs().HasTag(normalizedLandmarkListTag));
    auto &landmarksInput = cc->Inputs().Tag(normalizedLandmarkListTag);
    landmarksInput.Set<mediapipe::NormalizedLandmarkList>();

    // Hand rectangle input.
    RET_CHECK(cc->Inputs().HasTag(normRectTag));
    auto &rectInput = cc->Inputs().Tag(normRectTag);
    rectInput.Set<NormalizedRect>();

    return ::mediapipe::OkStatus();
}
// Called once before processing starts; sets the calculator's timestamp
// offset to zero and reports success.
::mediapipe::Status HandGestureRecognitionCalculator::Open(
    CalculatorContext *cc)
{
    const TimestampDiff zeroOffset(0);
    cc->SetOffset(zeroOffset);
    return ::mediapipe::OkStatus();
}
// Classifies the hand pose for the current input packets and logs the result.
//
// Steps:
//   1. Skip the frame if either input packet is empty, or if the hand
//      rectangle is smaller than 0.01 in width or height ("No Hand Detected").
//   2. Derive an open/closed state for the thumb and the four fingers from
//      the landmark coordinates.
//   3. Map the five states (plus a thumb/index-tip distance test for OK) to a
//      gesture name and log it; unrecognized combinations log the raw states.
//
// Returns OkStatus on success, or a RET_CHECK failure status when the landmark
// list holds fewer than 21 landmarks.
//
// NOTE(review): the thumb test compares x coordinates in one direction only,
// so per the accompanying notes it is expected to work for a right hand only.
::mediapipe::Status HandGestureRecognitionCalculator::Process(
    CalculatorContext *cc)
{
    // The highest landmark index read below is 20, so 21 landmarks are needed.
    constexpr int kRequiredLandmarkCount = 21;

    // An input stream may have no packet at this timestamp; Get<>() must not
    // be called on an empty packet.
    const auto &rectInput = cc->Inputs().Tag(normRectTag);
    if (rectInput.IsEmpty())
    {
        return ::mediapipe::OkStatus();
    }

    // hand closed (red) rectangle
    const auto &rect = rectInput.Get<NormalizedRect>();
    if (rect.width() < 0.01 || rect.height() < 0.01)
    {
        LOG(INFO) << "No Hand Detected";
        return ::mediapipe::OkStatus();
    }

    const auto &landmarksInput = cc->Inputs().Tag(normalizedLandmarkListTag);
    if (landmarksInput.IsEmpty())
    {
        return ::mediapipe::OkStatus();
    }
    const auto &landmarkList = landmarksInput.Get<mediapipe::NormalizedLandmarkList>();

    // Guard every landmark(i) access below (i <= 20) against short lists.
    RET_CHECK_GE(landmarkList.landmark_size(), kRequiredLandmarkCount)
        << "Expected at least " << kRequiredLandmarkCount
        << " hand landmarks, got " << landmarkList.landmark_size() << ".";

    // Thumb: open when landmarks 3 and 4 both lie at a smaller x than
    // landmark 2.
    const float thumbPivotX = landmarkList.landmark(2).x();
    const bool thumbIsOpen = landmarkList.landmark(3).x() < thumbPivotX &&
                             landmarkList.landmark(4).x() < thumbPivotX;

    // Fingers: open when the two outer landmarks both have a smaller y than
    // the finger's pivot landmark.
    const auto fingerIsOpen = [&landmarkList](int pivot, int middle, int tip) {
        const float pivotY = landmarkList.landmark(pivot).y();
        return landmarkList.landmark(middle).y() < pivotY &&
               landmarkList.landmark(tip).y() < pivotY;
    };
    const bool firstFingerIsOpen = fingerIsOpen(6, 7, 8);
    const bool secondFingerIsOpen = fingerIsOpen(10, 11, 12);
    const bool thirdFingerIsOpen = fingerIsOpen(14, 15, 16);
    const bool fourthFingerIsOpen = fingerIsOpen(18, 19, 20);

    // Hand gesture recognition. The order matters: the first matching
    // combination wins.
    if (thumbIsOpen && firstFingerIsOpen && secondFingerIsOpen && thirdFingerIsOpen && fourthFingerIsOpen)
    {
        LOG(INFO) << "FIVE!";
    }
    else if (!thumbIsOpen && firstFingerIsOpen && secondFingerIsOpen && thirdFingerIsOpen && fourthFingerIsOpen)
    {
        LOG(INFO) << "FOUR!";
    }
    else if (thumbIsOpen && firstFingerIsOpen && secondFingerIsOpen && !thirdFingerIsOpen && !fourthFingerIsOpen)
    {
        LOG(INFO) << "THREE!";
    }
    else if (thumbIsOpen && firstFingerIsOpen && !secondFingerIsOpen && !thirdFingerIsOpen && !fourthFingerIsOpen)
    {
        LOG(INFO) << "TWO!";
    }
    else if (!thumbIsOpen && firstFingerIsOpen && !secondFingerIsOpen && !thirdFingerIsOpen && !fourthFingerIsOpen)
    {
        LOG(INFO) << "ONE!";
    }
    else if (!thumbIsOpen && firstFingerIsOpen && secondFingerIsOpen && !thirdFingerIsOpen && !fourthFingerIsOpen)
    {
        LOG(INFO) << "YEAH!";
    }
    else if (!thumbIsOpen && firstFingerIsOpen && !secondFingerIsOpen && !thirdFingerIsOpen && fourthFingerIsOpen)
    {
        LOG(INFO) << "ROCK!";
    }
    else if (thumbIsOpen && firstFingerIsOpen && !secondFingerIsOpen && !thirdFingerIsOpen && fourthFingerIsOpen)
    {
        LOG(INFO) << "SPIDERMAN!";
    }
    else if (!thumbIsOpen && !firstFingerIsOpen && !secondFingerIsOpen && !thirdFingerIsOpen && !fourthFingerIsOpen)
    {
        LOG(INFO) << "FIST!";
    }
    else if (!firstFingerIsOpen && secondFingerIsOpen && thirdFingerIsOpen && fourthFingerIsOpen &&
             this->isThumbNearFirstFinger(landmarkList.landmark(4), landmarkList.landmark(8)))
    {
        // OK: landmarks 4 and 8 are close together while the other three
        // fingers are open.
        LOG(INFO) << "OK!";
    }
    else
    {
        // Unrecognized combination: dump the five states (thumb first) as 0/1.
        LOG(INFO) << "Finger States: " << thumbIsOpen << firstFingerIsOpen << secondFingerIsOpen << thirdFingerIsOpen << fourthFingerIsOpen;
        LOG(INFO) << "___";
    }
    return ::mediapipe::OkStatus();
} // HandGestureRecognitionCalculator::Process
} // namespace mediapipe

We have to add the HandGestureRecognitionCalculator node config to the hand_landmark_cpu.pbtxt or hand_landmark_gpu.pbtxt graph file.

  # Feeds the hand landmarks and the hand rectangle to the gesture recognition
  # calculator; the calculator declares no output stream.
  node {
      calculator: "HandGestureRecognitionCalculator"
      input_stream: "NORM_LANDMARKS:scaled_landmarks"
      input_stream: "NORM_RECT:hand_rect_for_next_frame"
    }

For example:

  1. in the hand_landmark_cpu.pbtxt see here: https://github.com/TheJLifeX/mediapipe/blob/master/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_cpu.pbtxt#L187-L191
  2. in the hand_landmark_gpu.pbtxt see here: https://github.com/TheJLifeX/mediapipe/blob/master/mediapipe/graphs/hand_tracking/subgraphs/hand_landmark_gpu.pbtxt#L182-L186

We have to create a bazel build config for our Calculator.

# Build target for the hand gesture recognition calculator.
cc_library(
    name = "hand-gesture-recognition-calculator",
    srcs = ["hand-gesture-recognition-calculator.cc"],
    visibility = ["//visibility:public"],
    deps = [
        "//mediapipe/framework:calculator_framework",
        "//mediapipe/framework/formats:landmark_cc_proto",
        "//mediapipe/framework/formats:rect_cc_proto",
        "//mediapipe/framework/port:ret_check",
        "//mediapipe/framework/port:status",
    ],
    # The calculator is only reached through its REGISTER_CALCULATOR
    # registration, so it must be linked in even when nothing references its
    # symbols directly.
    alwayslink = 1,
)

We have to add the path to the "hand-gesture-recognition-calculator" bazel build config in the hand_landmark_cpu or hand_landmark_gpu bazel build config.

For example: "//hand-gesture-recognition:hand-gesture-recognition-calculator"

  1. in the hand_landmark_cpu see here: https://github.com/TheJLifeX/mediapipe/blob/a069e5b6e1097f3f69c161a11f336e9e3b9751dd/mediapipe/graphs/hand_tracking/subgraphs/BUILD#L88
  2. in the hand_landmark_gpu see here: https://github.com/TheJLifeX/mediapipe/blob/a069e5b6e1097f3f69c161a11f336e9e3b9751dd/mediapipe/graphs/hand_tracking/subgraphs/BUILD#L192

You can now build the project and run it.

@SaddamBInSyed
Copy link

SaddamBInSyed commented Jun 9, 2020

@TheJLifeX

Thank you very much. you save my time a lot.

I have tried the same and working fine but finger landmark detection is NOT good.

image

Here I am trying to detect the index finger is touching the thumb finger or not. but since the index finger landmark point is not always correct my prediction going wrong.

could you please advise in this part?

@OrangeSmoothie
Copy link

@TheJLifeX

I tried to figure hand orientation like you suggested on the top. But this "int ang_in_degree = radianToDegree(ang_in_radian);" always gives result about 90 degrees because axis x rotates just liket axis y. Any advice would be helpful, thank you.

@SaddamBInSyed
Copy link

Now the recognized hand gesture (text) is displayed on the webcam on my forked version of mediapipe (https://github.com/TheJLifeX/mediapipe)
You can find all changes I made to do that on this commit: Render recognized hand gesture (text).

hand-gesture

@TheJLifeX,

Seems your output is more accurate.

which model you used whether .tflite.model or (desktop) Tensorflow .pb model file.
If we use the TFLITE model in the desktop app then the accuracy will be less?

please advise.

@TheJLifeX
Copy link
Author

Hi @SaddamBInSyed, it just a visual effect. This GIF has not been created with a video but with the combination of images/screenshots (each image showing one hand gesture). I used this tool: Images to GIF to create that.

@TheJLifeX
Copy link
Author

Hi @OrangeSmoothie, make sure that you use the landmarks from the output stream (hand_landmarks) of the LandmarkProjectionCalculator as input stream (see: here).

  • Before the LandmarkProjectionCalculator, the landmarks (scaled_landmarks) are in the hand enclosed rectangle (the red rectangle) coordinate system with origin the top left corner of the hand enclosed rectangle.
  • After the LandmarkProjectionCalculator, the landmarks are projected to the full image coordinate system with origin the top left corner of the full image.

@OrangeSmoothie
Copy link

@TheJLifeX Thank you!

@SaddamBInSyed
Copy link

@TheJLifeX

Thank you very much. you save my time a lot.

I have tried the same and working fine but finger landmark detection is NOT good.

image

Here I am trying to detect the index finger is touching the thumb finger or not. but since the index finger landmark point is not always correct my prediction going wrong.

could you please advise in this part?

Could you please advise on this ?

Thanks

@TheJLifeX
Copy link
Author

TheJLifeX commented Jun 15, 2020

Hi @SaddamBInSyed, the Hand Tracking with mediapipe is not perfect. See “Mean regression error” in the On-Device, Real-Time Hand Tracking with MediaPipe article. So you have to deal with this percent of error if you build something on top of the Hand Tracking. It means your results will also be in some way erroneous due to this percent of error.

@mmm2016
Copy link

mmm2016 commented Jun 20, 2020

@risi7
Copy link

risi7 commented Jul 10, 2020

It will be very helpful if you can guide in how to proceed with this mediapipe hand recognition in windows. I cant find much resources for how to use mediapipe in windows and use it for hand gesture recognition. And also what do you think that using the windows subsystem for Linux will be better than windows for this medaipipe hand guesture.

@TheJLifeX
Copy link
Author

Hi @risi7, you can find information about how you can use MediaPipe on Windows here.

@risi7
Copy link

risi7 commented Jul 13, 2020

How can we get this hand recognition in an App? Can you please help with the steps.

@TheJLifeX
Copy link
Author

Hi @risi7, I haven't built an app in which MediaPipe is used right now. Feel free to search about that to find a way to build one.

@nyanmn
Copy link

nyanmn commented Jul 15, 2020

When I do docker build --tag=mediapipe . at my pc.
I have error as

Reading package lists...
Reading package lists...
Building dependency tree...
Reading state information...
E: Unable to locate package python-pip

Actually I have pip2 and pip3 installed.

@nyanmn
Copy link

nyanmn commented Jul 15, 2020

How can I detect the hand move end to end horizontally?

@risi7
Copy link

risi7 commented Jul 15, 2020

How can we test this hand recognition by giving an input video (like the path of video) and getting an output video with hand gesture recognition?
And also can you please share the build and run commands to build and run project.

@miltonhuang
Copy link

miltonhuang commented Jul 18, 2020

--input_video_path /home/user/Desktop/videos/input.mp4 --output_video_path /home/user/Desktop/videos/outputvideofile.mp4
execute either one of those two scripts. run-hand-tracking-desktop-cpu.sh run-hand-tracking-desktop-gpu.sh, you could check the scripts for details.

@wonjongbot
Copy link

wonjongbot commented Jul 20, 2020

Hi @TheJLifeX,
First of all, thank you so much for this awesome method for gesture recognition.
I've been trying to implement this to multi hand tracking example, but the calculators are very different from the single handed one so I'm not sure of how to approach this.
I succeeded to print out some gestures in the command line by modifying the demo_run_graph_main_gpu.cc file, but failed to do so on a separate calculator like yours.
Any insights? Thank you so much :)

@nyanmn
Copy link

nyanmn commented Jul 21, 2020

@TheJLifeX
Your forkout lib is very useful.

How can I add in third party library like https://github.com/felHR85/UsbSerial to mediapipe?
For normal Android Studio project, I just need to add in

allprojects {
	repositories {
		jcenter()
		maven { url "https://jitpack.io" }
	}
}

to build.gradle.
And add the dependency to module's build.gradle:

implementation 'com.github.felHR85:UsbSerial:6.1.0'

But for mediapipe, how can I add in to interface usbserial?

@risi7
Copy link

risi7 commented Jul 21, 2020

@TheJLifeX, Why does it show the wrong gesture when hand is flipped. Like it recognizes rock correctly when the palm is facing the camera but as the hand is flipped(palm is not facing the camera) it shows the incorrect output or recognition. And also can you please suggest how to recognize thumbs up and thumbs down.

@methomas98
Copy link

methomas98 commented Jul 23, 2020

@TheJLifeX, thanks for your work on the gesture recognition. It came in handy in my project!

If you're interested, I modified it so that it can recognize both hands, facing toward and away from the camera. You can find the changes in this commit. I just determined the hand orientation and took it into account when checking if the thumb was in or out.

@FabricioGuimaraesOliveira

Hello. How can I integrate media pipe pose and a multitracking hands in real time? Its possible?

@johntranz
Copy link

@TheJLifeX,
Can you instruct me to train the sign language recognition model ?
Or steps taken to let the model recognize a new gesture.
I would be very grateful for that

@tonywang531
Copy link

Since mediapipe can now detect left or right hand, how to implement this feature? I got this code working in IOS, if anyone is interested I will upload it in the mediapipe repository.

@happyCodingSusan
Copy link

@TheJLifeX,
Thank you for sharing your ideas and codes.
I found you simple gestures work well when I use my right hand but not work when I use my left hand. When I use my left hand, so many mistakes. For example, when my left hand shows 5 fingers, your detection result is 4. Do you encounter this issue before? Any ideas why this happens?
Many thanks.

@tonywang531
Copy link

tonywang531 commented Nov 4, 2020

@happyCodingSusan,
The detection algorithm is very basic in the example. It does not take into account of left or right hand. In order to properly recognize both hands, the left or right hand information need to be retrieved from mediapipe landmark data. So the logic would be something like if the detected hand is left, flip the positions of the fingers then the detection algorithm will work. I am still trying to work out how to access this information.
At the worst case, we could identify left or right hand by using landmark data alone. Take for example, landmark position 4 and position 10, if 4 is on the left of 10 then it is like the palm in the example picture and vice versa. By checking the positions of the 5 fingers we should be able to detect if it is a left hand or right hand. It's just that I prefer not to reinvent the wheel when Mediapipe already can detect left or right hand in the current version.
I also give a bit thought on the front of the hand or the back of the hand. It seems to be impossible to detect if it is the front of the hand and the back of the hand by using landmark data alone. What I mean is that left hand with finger lines facing your face would be detected the same as right hand facing your face, sort like when you clap your hands. I don't feel you can distinguish this by using mediapipe. Therefore you would need to specify which side of the hand in your application.

@adarshx11
Copy link

@TheJLifeX Help me,
Can you Do it with Python .py

@adarshx11
Copy link

@TheJLifeX

Thank you very much. you save my time a lot.

I have tried the same and working fine but finger landmark detection is NOT good.

image

Here I am trying to detect the index finger is touching the thumb finger or not. but since the index finger landmark point is not always correct my prediction going wrong.

could you please advise in this part?

@TheJLifeX

Thank you very much. you save my time a lot.

I have tried the same and working fine but finger landmark detection is NOT good.

image

Here I am trying to detect the index finger is touching the thumb finger or not. but since the index finger landmark point is not always correct my prediction going wrong.

could you please advise in this part?

Saddam share Code Please

@denzero13
Copy link

Hi @TheJLifeX, I have a problem, I want to use the MediaPipe Hands library for python, I display a list of positions, but I don't understand which position corresponds to the point I need, and some positions are repeated, I used your code from above, but it doesn't work, I created a function that generates the required list. I output the frozen set mp_hands.HAND_CONNECTIONS and get the result
How to decipher correctly to understand which position belongs to which point?
sorry for bad english, i use google translator)))

frozenset({(<HandLandmark.THUMB_IP: 3>, <HandLandmark.THUMB_TIP: 4>), (<HandLandmark.WRIST: 0>, <HandLandmark.INDEX_FINGER_MCP: 5>), (<HandLandmark.PINKY_MCP: 17>, <HandLandmark.PINKY_PIP: 18>), (<HandLandmark.WRIST: 0>, <HandLandmark.PINKY_MCP: 17>), (<HandLandmark.RING_FINGER_MCP: 13>, <HandLandmark.RING_FINGER_PIP: 14>), (<HandLandmark.RING_FINGER_MCP: 13>, <HandLandmark.PINKY_MCP: 17>), (<HandLandmark.PINKY_PIP: 18>, <HandLandmark.PINKY_DIP: 19>), (<HandLandmark.INDEX_FINGER_MCP: 5>, <HandLandmark.INDEX_FINGER_PIP: 6>), (<HandLandmark.INDEX_FINGER_MCP: 5>, <HandLandmark.MIDDLE_FINGER_MCP: 9>), (<HandLandmark.RING_FINGER_PIP: 14>, <HandLandmark.RING_FINGER_DIP: 15>), (<HandLandmark.WRIST: 0>, <HandLandmark.THUMB_CMC: 1>), (<HandLandmark.MIDDLE_FINGER_MCP: 9>, <HandLandmark.MIDDLE_FINGER_PIP: 10>), (<HandLandmark.THUMB_CMC: 1>, <HandLandmark.THUMB_MCP: 2>), (<HandLandmark.MIDDLE_FINGER_PIP: 10>, <HandLandmark.MIDDLE_FINGER_DIP: 11>), (<HandLandmark.MIDDLE_FINGER_MCP: 9>, <HandLandmark.RING_FINGER_MCP: 13>), (<HandLandmark.PINKY_DIP: 19>, <HandLandmark.PINKY_TIP: 20>), (<HandLandmark.INDEX_FINGER_PIP: 6>, <HandLandmark.INDEX_FINGER_DIP: 7>), (<HandLandmark.RING_FINGER_DIP: 15>, <HandLandmark.RING_FINGER_TIP: 16>), (<HandLandmark.THUMB_MCP: 2>, <HandLandmark.THUMB_IP: 3>), (<HandLandmark.MIDDLE_FINGER_DIP: 11>, <HandLandmark.MIDDLE_FINGER_TIP: 12>), (<HandLandmark.INDEX_FINGER_DIP: 7>, <HandLandmark.INDEX_FINGER_TIP: 8>)})

@Jaguaribe21
Copy link

Jaguaribe21 commented Dec 8, 2020

Hi,

Has anyone managed to implement in mediapipe 0.8.0?

I tried to modify the files, but when I run, it displays this error:

AnnotationOverlayCalculator :: GetContract failed to validate:
For input streams ValidatePacketTypeSet failed:
"INPUT_FRAME" tag index 0 was not expected.
For output streams ValidatePacketTypeSet failed:
"OUTPUT_FRAME" tag index 0 was not expected.

Any tips or solutions?

Note: I am using Bazel 3.4.1, it runs normally, without the codes to be deployed.

Thanks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment