Hand landmarks and gesture recognition in Julia
### A Pluto.jl notebook ###
# v0.17.3
using Markdown
using InteractiveUtils
# This Pluto notebook uses @bind for interactivity. When running this notebook outside of Pluto, the following 'mock version' of @bind gives bound variables a default value (instead of an error).
macro bind(def, element)
quote
local iv = try Base.loaded_modules[Base.PkgId(Base.UUID("6e696c72-6542-2067-7265-42206c756150"), "AbstractPlutoDingetjes")].Bonds.initial_value catch; b -> missing; end
local el = $(esc(element))
global $(esc(def)) = Core.applicable(Base.get, el) ? Base.get(el) : iv(el)
el
end
end
# ╔═╡ 4934e4de-b03d-419f-a076-9a8116f5ddf5
begin
using Pkg;
Pkg.activate(".") ;
end
# ╔═╡ 14519106-d4cf-4a77-acca-a22b7c426334
using Cairo, Images, ImageDraw, Luxor, LinearAlgebra, LazySets, StaticArrays
# ╔═╡ 0da24d63-180f-4913-a8d6-3ba54a28ef04
md"""
### Basic Hand Gesture Recognition using convexity defects
We want to detect gestures from hands using classical image processing methods at this moment
###### Steps
- We recieve real time image from Javascript
- Define region of interest in the image and get that part of image
- Convert ROI image to HSV space, then threshold it to skin range, we recieve a binary mask
- We use map window and dilation on the mask to reduce noise(quite robust actually)
- Find contours from the mask now(suzuki and abe algorithm)
- Find convexhull on the binary mask to find the convex hull(from ImageMorphology.jl)
- Now we have contour points and convexhull of the mask, we find the convexity defects
###### Convexity Defects wasn't available in julia so wrote my own
Explanation and Interesting ideas on convexity defects will be done later
- After we find num and location of convexity defects, we plot the points and num+1 as the number in image using Luxor.jl
- This happens every 100ms
"""
# ╔═╡ 841cd0d1-c5d4-41fe-949f-b2ddc9144634
md"""
![](https://i.imgur.com/KQ5V3hJ.png)
"""
# ╔═╡ 43f08085-b9b3-4e9b-b2ff-a0907b48a897
begin
### Important
### contour related utils
# rotate direction clockwise
function clockwise(dir)
return (dir)%8 + 1
end
# rotate direction counterclockwise
function counterclockwise(dir)
return (dir+6)%8 + 1
end
# move from current pixel to next in given direction
function move(pixel, image, dir, dir_delta)
newp = pixel + dir_delta[dir]
height, width = size(image)
if (0 < newp[1] <= height) && (0 < newp[2] <= width)
if image[newp]!=0
return newp
end
end
return CartesianIndex(0, 0)
end
# finds direction between two given pixels
function from_to(from, to, dir_delta)
delta = to-from
return findall(x->x == delta, dir_delta)[1]
end
function detect_move(image, p0, p2, nbd, border, done, dir_delta)
dir = from_to(p0, p2, dir_delta)
moved = clockwise(dir)
p1 = CartesianIndex(0, 0)
while moved != dir ## 3.1
newp = move(p0, image, moved, dir_delta)
if newp[1]!=0
p1 = newp
break
end
moved = clockwise(moved)
end
if p1 == CartesianIndex(0, 0)
return
end
p2 = p1 ## 3.2
p3 = p0 ## 3.2
done .= false
while true
dir = from_to(p3, p2, dir_delta)
moved = counterclockwise(dir)
p4 = CartesianIndex(0, 0)
done .= false
while true ## 3.3
p4 = move(p3, image, moved, dir_delta)
if p4[1] != 0
break
end
done[moved] = true
moved = counterclockwise(moved)
end
push!(border, p3) ## 3.4
if p3[1] == size(image, 1) || done[3]
image[p3] = -nbd
elseif image[p3] == 1
image[p3] = nbd
end
if (p4 == p0 && p3 == p1) ## 3.5
break
end
p2 = p3
p3 = p4
end
end
function find_contours(image)
nbd = 1
lnbd = 1
image = Float64.(image)
contour_list = Vector{typeof(CartesianIndex[])}()
done = [false, false, false, false, false, false, false, false]
# Clockwise Moore neighborhood.
dir_delta = [CartesianIndex(-1, 0) , CartesianIndex(-1, 1), CartesianIndex(0, 1), CartesianIndex(1, 1), CartesianIndex(1, 0), CartesianIndex(1, -1), CartesianIndex(0, -1), CartesianIndex(-1,-1)]
height, width = size(image)
for i=1:height
lnbd = 1
for j=1:width
fji = image[i, j]
is_outer = (image[i, j] == 1 && (j == 1 || image[i, j-1] == 0)) ## 1 (a)
is_hole = (image[i, j] >= 1 && (j == width || image[i, j+1] == 0))
if is_outer || is_hole
# 2
border = CartesianIndex[]
from = CartesianIndex(i, j)
if is_outer
nbd += 1
from -= CartesianIndex(0, 1)
else
nbd += 1
if fji > 1
lnbd = fji
end
from += CartesianIndex(0, 1)
end
p0 = CartesianIndex(i,j)
detect_move(image, p0, from, nbd, border, done, dir_delta) ## 3
if isempty(border) ##TODO
push!(border, p0)
image[p0] = -nbd
end
push!(contour_list, border)
end
if fji != 0 && fji != 1
lnbd = abs(fji)
end
end
end
return contour_list
end
# a contour is a vector of CartesianIndex{2} points
function draw_contour(image, color, contour)
for ind in contour
image[ind] = color
end
end
function draw_contours(image, color, contours)
for cnt in contours
draw_contour(image, color, cnt)
end
end
end
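# A quick usage sketch for the contour utilities above (illustrative only):
# trace the border of a small synthetic blob and draw it onto a white canvas.
let
    blob = zeros(Int, 9, 9)
    blob[3:7, 3:7] .= 1                        # a 5x5 square "object"
    cnts = find_contours(blob)                 # Suzuki-Abe border following
    canvas = RGB{N0f8}.(ones(9, 9))            # white RGB canvas
    draw_contours(canvas, RGB(0, 0, 0), cnts)  # contours drawn in black
    canvas
end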
# ╔═╡ f5642319-05ee-4731-ad26-80bcd4f6aa7b
begin
### Important
### Webcam related utils
function camera_input(;max_size=200, default_url="https://i.imgur.com/SUmi94P.png")
"""
<span class="pl-image waiting-for-permission">
<style>
.pl-image.popped-out {
position: fixed;
top: 0;
right: 0;
z-index: 5;
}
.pl-image #video-container {
width: 250px;
}
.pl-image video {
/* border-radius: 1rem 1rem 0 0; */
}
.pl-image.waiting-for-permission #video-container {
display: none;
}
.pl-image #prompt {
display: none;
}
.pl-image.waiting-for-permission #prompt {
width: 250px;
height: 200px;
display: grid;
place-items: center;
font-family: monospace;
font-weight: bold;
text-decoration: underline;
cursor: pointer;
border: 5px dashed rgba(0,0,0,.5);
}
.pl-image video {
display: block;
}
.pl-image .bar {
width: inherit;
display: flex;
z-index: 6;
}
.pl-image .bar#top {
position: absolute;
flex-direction: column;
}
.pl-image .bar#bottom {
background: black;
/* border-radius: 0 0 1rem 1rem; */
}
.pl-image .bar button {
flex: 0 0 auto;
background: rgba(255,255,255,.8);
border: none;
width: 2rem;
height: 2rem;
border-radius: 100%;
cursor: pointer;
z-index: 7;
}
.pl-image .bar button#shutter {
width: 3rem;
height: 3rem;
margin: -1.5rem auto .2rem auto;
}
.pl-image video.takepicture {
animation: pictureflash 0ms linear;
}
@keyframes pictureflash {
0% {
filter: grayscale(1.0) contrast(2.0);
}
100% {
filter: grayscale(0.0) contrast(1.0);
}
}
</style>
<div id="video-container">
<div id="top" class="bar">
<button id="stop" title="Stop video">✖</button>
<button id="pop-out" title="Pop out/pop in">⏏</button>
</div>
<video playsinline autoplay></video>
<div id="bottom" class="bar">
<button id="shutter" title="Click to take a picture">📷</button>
</div>
</div>
<div id="prompt">
<span>
Enable webcam
</span>
</div>
<script>
// based on https://github.com/fonsp/printi-static (by the same author)
const span = currentScript.parentElement
const video = span.querySelector("video")
const popout = span.querySelector("button#pop-out")
const stop = span.querySelector("button#stop")
const shutter = span.querySelector("button#shutter")
const prompt = span.querySelector(".pl-image #prompt")
const maxsize = $(max_size)
const send_source = (source, src_width, src_height) => {
const scale = Math.min(1.0, maxsize / src_width, maxsize / src_height)
const width = Math.floor(src_width * scale)
const height = Math.floor(src_height * scale)
const canvas = html`<canvas width=\${width} height=\${height}>`
const ctx = canvas.getContext("2d")
ctx.drawImage(source, 0, 0, width, height)
span.value = {
width: width,
height: height,
data: ctx.getImageData(0, 0, width, height).data,
}
span.dispatchEvent(new CustomEvent("input"))
}
const clear_camera = () => {
window.stream.getTracks().forEach(s => s.stop());
video.srcObject = null;
span.classList.add("waiting-for-permission");
}
prompt.onclick = () => {
navigator.mediaDevices.getUserMedia({
audio: false,
video: {
facingMode: "environment",
},
}).then(function(stream) {
stream.onend = console.log
window.stream = stream
video.srcObject = stream
window.cameraConnected = true
video.controls = false
video.play()
video.controls = false
span.classList.remove("waiting-for-permission");
}).catch(function(error) {
console.log(error)
});
}
stop.onclick = () => {
clear_camera()
}
popout.onclick = () => {
span.classList.toggle("popped-out")
}
var intervalId = window.setInterval(function(){
const cl = video.classList
cl.remove("takepicture")
void video.offsetHeight
cl.add("takepicture")
video.play()
video.controls = false
send_source(video, video.videoWidth, video.videoHeight)
}, 150);
shutter.onclick = () => {
const cl = video.classList
cl.remove("takepicture")
void video.offsetHeight
cl.add("takepicture")
video.play()
video.controls = false
send_source(video, video.videoWidth, video.videoHeight)
}
document.addEventListener("visibilitychange", () => {
if (document.visibilityState != "visible") {
clear_camera()
}
})
// Set a default image
const img = html`<img crossOrigin="anonymous">`
img.onload = () => {
console.log("helloo")
send_source(img, img.width, img.height)
}
img.src = "$(default_url)"
console.log(img)
</script>
</span>
""" |> HTML
end
function process_raw_camera_data(raw_camera_data)
# the raw image data is a long byte array, we need to transform it into something
# more "Julian" - something with more _structure_.
# The encoding of the raw byte stream is:
# every 4 bytes is a single pixel
# every pixel has 4 values: Red, Green, Blue, Alpha
# (we ignore alpha for this notebook)
# So to get the red values for each pixel, we take every 4th value, starting at
# the 1st:
reds_flat = UInt8.(raw_camera_data["data"][1:4:end])
greens_flat = UInt8.(raw_camera_data["data"][2:4:end])
blues_flat = UInt8.(raw_camera_data["data"][3:4:end])
# but these are still 1-dimensional arrays, nicknamed 'flat' arrays
# We will 'reshape' this into 2D arrays:
width = raw_camera_data["width"]
height = raw_camera_data["height"]
# reshape to (width, height) and transpose to get the usual image orientation
reds = reshape(reds_flat, (width, height))' / 255.0
greens = reshape(greens_flat, (width, height))' / 255.0
blues = reshape(blues_flat, (width, height))' / 255.0
# we have our 2D array for each color
# Let's create a single 2D array, where each value contains the R, G and B value of
# that pixel
RGB.(reds, greens, blues)
end
end
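# Sanity sketch of the RGBA decoding above with a fake 2x2 "camera" payload
# (hypothetical data, 4 bytes per pixel, alpha ignored), run through the same
# reshape/transpose path as the real webcam frames.
let
    fake = Dict(
        "width"  => 2,
        "height" => 2,
        # four pixels: red, green, blue, white (R, G, B, A each)
        "data"   => UInt8[255, 0, 0, 255,  0, 255, 0, 255,
                          0, 0, 255, 255,  255, 255, 255, 255],
    )
    process_raw_camera_data(fake)   # -> 2x2 matrix of RGB values
end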
# ╔═╡ 1a0324de-ee19-11ea-1d4d-db37f4136ad3
@bind raw_camera_data camera_input(;max_size=100)
# ╔═╡ 6f80e4ff-99bc-4c77-aebe-5e7f21f0d328
begin
function drawdots!(img, res, color )
for i in res
img[i[1]-1:i[1]+1, i[2]-1:i[2]+1] .= color
end
end
function dist2p(p1, p2)
sqrt((p1[1]-p2[1])^2 + (p1[2]-p2[2])^2)
end
function findmyangle(a1, a2; center)
acos((dist2p(a1,center)^2 + dist2p(a2,center)^2 - dist2p(a1,a2)^2) / (2 * dist2p(center, a1) * dist2p(center, a2)))
end
end
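# `findmyangle` is the law of cosines: the angle at `center` subtended by a1 and
# a2 is acos((|c-a1|² + |c-a2|² - |a1-a2|²) / (2 |c-a1| |c-a2|)).
# A quick sanity check (illustrative): a right angle at the origin.
findmyangle((1, 0), (0, 1); center = (0, 0)) ≈ π / 2   # true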
# ╔═╡ ca92aa75-50c5-4720-a0d5-6993c21ea0b1
"""
findconvexitydefects(contour, convhull; dist = 1.1, absdiff = 10, mindist = 0, currsize = 50, d1 = 2, d2 = 2, anglemax = π/2)
return the convexity defects using contour points and convexhull
###### Arguments
- contour -> contour points using suzuki and abe algorithm
- convexhull -> convexhull boundaries found from ImageMorphology.jl
- dist and absdiff -> helps in edge cases when trying to synchronize contour points and convexhull
- mindist -> to control min distance of a defect from a convex hull region line
- currsize -> to avoid small regions of contours
- d1 and d2 -> to control of defect from pair of convexhull points forming line individually
- anglemax -> helps to control max angle between convex hull line points and the defect
Idea | For one region
:-------------------------:|:-------------------------:
![](https://i1.wp.com/theailearner.com/wp-content/uploads/2020/11/hand_hull1.jpg?resize=624%2C438&ssl=1) | ![](https://i0.wp.com/theailearner.com/wp-content/uploads/2020/11/conv_def1.jpg?w=489&ssl=1)
"""
function findconvexitydefects(
contour,
convhull;
dist=1.1,
absdiff=10,
mindist=0,
currsize= 50,
d1 = 2,
d2 = 2,
anglemax = π/2
)
# first we need to match our contour points to our convexhull regions
numindices = []
previous = 0
for i in convhull
for (num,j) in enumerate(contour)
if norm(Tuple(i) .- Tuple(j)) < dist && abs(previous-num) > absdiff # to avoid small and very close regions
push!(numindices, num)
previous = num
break
end
end
end
# we want numindices to line up with our convexhull points,
# to define a region of interest for each convexhull line
defects = Vector{CartesianIndex{2}}([]) # indices with defects
# in case numindices < convexhull indices,
# meaning we don't have regions for all lines
if size(numindices)[1] < size(convhull)[1]
throw(error("Raise the range dist: fewer numindices points than convexhull points, $(size(numindices)[1]) vs $(size(convhull)[1])"))
end
# iterate over each consecutive pair of convhull points to form line
for i in 1:size(convhull)[1]-1
# to handle the wrap-around case where numindices go e.g. 1256, then 1
if numindices[i] > numindices[i+1]
curr = vcat(contour[numindices[i]: end], contour[1: numindices[i+1]])
else
# general case: take the contour points for this convexhull region
curr = contour[numindices[i]:numindices[i+1]]
end
# to remove minor regions of contours, we can set currsize
if size(curr)[1] < currsize
continue
end
# Defining the line
p1 = Float64.(Tuple(convhull[i])) # point 1
p2 = Float64.(Tuple(convhull[i+1])) # point 2
line = Line(;from=[p1[1], p1[2]], to=[p2[1], p2[2]])
maxdef = 0 # max distance from our convhull line
defloc = CartesianIndex(0,0) # location of that point
# for each contour point in this convexhull region, compute its
# distance to the line and track the maximum
for j in curr
p = SA[j[1], j[2]]
lpdist = LazySets.distance(p, line) # find distance
# update if we find new max
if lpdist > maxdef
maxdef = lpdist
defloc = j
end
end
def1 = norm(Tuple(defloc) .- Tuple(p1))
def2 = norm(Tuple(defloc) .- Tuple(p2))
# compute the angle at the defect and apply the mindist/d1/d2/anglemax filters
angle = findmyangle(p1,p2; center=defloc)
if maxdef > mindist && def1 > d1 && def2 > d2 && angle < anglemax
push!(defects, defloc)
end
end
defects
end
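# Hedged usage sketch: a hypothetical helper (not part of the notebook) showing
# how the pieces compose, mirroring the call made inside `objecttracker` below;
# `mask` is a binary hand mask and the parameter values are illustrative.
function count_fingers(mask)
    cnts = find_contours(mask)                    # Suzuki-Abe contours
    hull = convexhull(mask)
    push!(hull, hull[1])                          # close the hull polygon
    defects = findconvexitydefects(cnts[1], hull;
                                   dist = 3, absdiff = 2, currsize = 30, mindist = 6)
    return length(defects) + 1                    # defects + 1 ≈ finger count
end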
# ╔═╡ dfb7c6be-ee0d-11ea-194e-9758857f7b20
begin
### Important
### Object tracker is here
function objecttracker(
img,
h = 50,
s = 255,
v = 255,
lh = 0,
ls = 20,
lv = 70,
boundingboxvar = 10
)
hsv_img = HSV.(img)
channels = channelview(float.(hsv_img))
hue_img = channels[1, :, :]
val_img = channels[3, :, :]
satur_img = channels[2, :, :]
mask = zeros(size(hue_img))
h1, s1, v1 = lh, ls, lv
ex = boundingboxvar
for ind in eachindex(hue_img)
if hue_img[ind] <= h && satur_img[ind] <= s / 255 && val_img[ind] <= v / 255
if hue_img[ind] >= h1 && satur_img[ind] >= s1 / 255 && val_img[ind] >= v1 / 255
mask[ind] = 1
end
end
end
img = mapwindow(ImageFiltering.median, dilate(mask), (3, 3))
contours = find_contours(img)
try
convhull = convexhull(img .> 0.5)
push!(convhull, convhull[1])
res = findconvexitydefects(contours[1], convhull; dist=3, absdiff = 2, currsize= 30, mindist =6)
img_convex1 = RGB{N0f8}.(ones(size(img)))
drawdots!(img_convex1, res, RGB(0,0,1))
draw!(img_convex1, ImageDraw.Path(convhull), RGB(0))
draw_contours(img_convex1, RGB(0), contours)
return img_convex1, size(res)[1]
catch e
img_convex1 = RGB{N0f8}.(ones(size(img)))
draw_contours(img_convex1, RGB(0), contours)
return img_convex1 , -1
end
end;
end
# ╔═╡ 594acafd-01d4-4eee-b9e6-5b886953b5b1
begin
image = process_raw_camera_data(raw_camera_data);
img, num = objecttracker(image[:,1:70])
z = convert(Array{RGB24},img')
img = CairoImageSurface(z)
Drawing(img.width, img.height, :png)
placeimage(img, 0, 0)
sethue("red")
fontsize(10)
if num != -1
Luxor.text("$(num + 1)", Luxor.Point(10, 10), halign=:center)
end
image_as_matrix()
end
# ╔═╡ 0814234d-459a-404b-9253-7f7665ea6a38
# begin
# Pkg.add(PackageSpec(url="https://github.com/Pocket-titan/DarkMode"))
# import DarkMode
# DarkMode.enable()
# end
# ╔═╡ f2236406-af64-403d-84bd-e3afe395b791
html"""<style>
main {
max-width: 900px;
}
"""
# ╔═╡ Cell order:
# ╟─0da24d63-180f-4913-a8d6-3ba54a28ef04
# ╟─ca92aa75-50c5-4720-a0d5-6993c21ea0b1
# ╠═14519106-d4cf-4a77-acca-a22b7c426334
# ╟─dfb7c6be-ee0d-11ea-194e-9758857f7b20
# ╟─1a0324de-ee19-11ea-1d4d-db37f4136ad3
# ╠═594acafd-01d4-4eee-b9e6-5b886953b5b1
# ╟─841cd0d1-c5d4-41fe-949f-b2ddc9144634
# ╟─43f08085-b9b3-4e9b-b2ff-a0907b48a897
# ╟─f5642319-05ee-4731-ad26-80bcd4f6aa7b
# ╟─6f80e4ff-99bc-4c77-aebe-5e7f21f0d328
# ╟─0814234d-459a-404b-9253-7f7665ea6a38
# ╟─f2236406-af64-403d-84bd-e3afe395b791
# ╟─4934e4de-b03d-419f-a076-9a8116f5ddf5
# Hand landmark detection using OpenCV.jl and MediaPipe
# add OpenCV.jl and PyCall.jl using the package manager
using OpenCV # Julia's OpenCV binding
using PyCall # used to call Python from inside Julia
cap = OpenCV.VideoCapture(Int32(0)) # To open the webcam capture stream
OpenCV.namedWindow("Hand Landmarks detection") # create a window for the output
# python code to call mediapipe
py"""
# install mediapipe and numpy in your Python environment
import mediapipe  # does the processing of the image for the hand landmark detection
import numpy as np

drawingModule = mediapipe.solutions.drawing_utils  # used to draw the hand landmarks
handsModule = mediapipe.solutions.hands            # used to detect the hand

def process_image(img):
    # deep copy of the image
    vis = np.array([x for x in img])
    # to get the hand keypoints
    with handsModule.Hands() as hands:
        # processing of the image
        results = hands.process(vis)
        # if any hand landmarks were detected
        if results.multi_hand_landmarks is not None:
            # for each hand, draw the landmarks
            for handLandmarks in results.multi_hand_landmarks:
                # draw on the image
                drawingModule.draw_landmarks(vis, handLandmarks, handsModule.HAND_CONNECTIONS)
    # return the image
    return vis
"""
imgfinal = OpenCV.Mat(zeros(UInt8, (3,640,480))) # create a matrix to store the final image
process_image = py"process_image" # assigning the python function to a julian name
while true
ret, img = OpenCV.read(cap) # read the image from the webcam
# img = reverse(img, dims = 2) # flips the image horizontally, but slows things down
# to handle case if webcam stopped
if ret==false
print("Webcam stopped")
break
end
# rearrange the image for Python: OpenCV's (3, x, y) BGR layout -> (x, y, 3) RGB
input = cat(img[3,:,:], img[2,:,:], img[1,:,:]; dims=3)
# calling the python function
output = process_image(input);
# rearrange the output back to OpenCV's (3, x, y) BGR layout
imgfinal[1,:,:] = output[:,:,3]
imgfinal[2,:,:] = output[:,:,2]
imgfinal[3,:,:] = output[:,:,1]
# displaying the image
OpenCV.imshow("Hand Landmarks detection", imgfinal)
if OpenCV.waitKey(Int32(5))==27
break
end
end
# release the webcam stream
OpenCV.release(cap)
# to release all the windows and close them
OpenCV.destroyAllWindows()