Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save kirisakow/325a557d89262e8d6a4f2918917e82b4 to your computer and use it in GitHub Desktop.
Save kirisakow/325a557d89262e8d6a4f2918917e82b4 to your computer and use it in GitHub Desktop.
Real-time object detection in webcam video stream in Google Colab, using Ultralytics YOLOv8
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [
{
"file_id": "https://gist.github.com/kirisakow/325a557d89262e8d6a4f2918917e82b4",
"timestamp": 1709989009226
}
],
"private_outputs": true,
"collapsed_sections": ["r094Dy18OMsg", "RP0iY45PtSFW", "rex6xev5qP-G", "pUYfZKNlhSFn"]
},
"kernelspec": { "name": "python3", "display_name": "Python 3" }
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/kirisakow/325a557d89262e8d6a4f2918917e82b4/real-time-object-detection-in-webcam-video-stream-using-ultralytics-yolov8.ipynb\" target=\"_blank\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Real-time object detection in webcam video stream using Ultralytics YOLOv8\n",
"\n",
"This is a refactored version of an older, YOLOv3-compatible [notebook][older_notebook] which no longer works.\n",
"\n",
"Instructions:\n",
"\n",
"* Run `Runtime` > `Run all` (or press <kbd>Ctrl</kbd>+<kbd>F9</kbd>) to run all cells.\n",
"* Allow access to the webcam, if asked.\n",
"* To stop the webcam capture, click the red text or the picture.\n",
"\n",
"Resources:\n",
"* Ultralytics [documentation][ultralytics_doc] for object detection.\n",
"* Google Colab [documentation][ipython_display_doc] for executing JavaScript from Python for tasks such as webcam capture, etc.\n",
"\n",
"Author: [Kiril Isakov][kisakov_linkedin] ([kirisakow][kirisakow_github])\n",
"\n",
"[kisakov_linkedin]: https://www.linkedin.com/in/kisakov/\n",
"[kirisakow_github]: https://github.com/kirisakow\n",
"[older_notebook]: https://github.com/vindruid/yolov3-in-colab/blob/aaac930911e18826c560d3156220cedb8726b8c7/yolov3_streaming_webcam.ipynb\n",
"[ipython_display_doc]: https://colab.research.google.com/notebooks/snippets/advanced_outputs.ipynb#scrollTo=2viqYx97hPMi\n",
"[ultralytics_doc]: https://docs.ultralytics.com/tasks/detect/"
],
"metadata": { "id": "VPMjpFmrhUAc" }
},
{
"cell_type": "markdown",
"source": ["## 0. Prerequisites"],
"metadata": { "id": "r094Dy18OMsg" }
},
{
"cell_type": "markdown",
"source": ["### 0.1. Install and initialize libraries and constants"],
"metadata": { "id": "RP0iY45PtSFW" }
},
{
"cell_type": "code",
"source": ["! pip install --upgrade --quiet ultralytics"],
"metadata": { "id": "--kKX4f9WlAj" },
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from base64 import b64decode, b64encode\n",
"import io\n",
"import json\n",
"\n",
"from google.colab.output import eval_js\n",
"from IPython.display import display, Javascript\n",
"import numpy as np\n",
"from PIL import Image\n",
"from ultralytics import YOLO\n",
"from ultralytics.engine.results import Results\n",
"\n",
"# YOLOv8 checkpoint file names; the entry at index 0 is the one loaded below.\n",
"MODEL_NAMES = [f'yolov8{size}.pt' for size in 'nsmlx']\n",
"PRE_TRAINED_MODEL = YOLO(MODEL_NAMES[0])\n",
"# Capture size, read by the JavaScript as [width, height]. Keep it a list:\n",
"# start_stream() interpolates its repr verbatim into the script.\n",
"IMG_SHAPE = [640, 480]\n",
"# Quality argument handed to canvas.toDataURL('image/jpeg', ...).\n",
"IMG_QUALITY = 0.8"
],
"metadata": { "id": "XAgO_ab-roQ2" },
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": { "id": "rex6xev5qP-G" },
"source": [
"### 0.2. Define the JavaScript functions that capture the webcam stream and are called from Python"
]
},
{
"cell_type": "code",
"metadata": { "id": "7OYmjeF-edKE" },
"source": [
"def start_stream():\n",
"    \"\"\"Display the JavaScript that drives the webcam capture in the browser.\n",
"\n",
"    The script creates the preview elements on first use and defines the async\n",
"    function takePhoto(label, imgData) that take_photo() invokes through eval_js.\n",
"    IMG_SHAPE and IMG_QUALITY are interpolated into it as JavaScript constants.\n",
"\n",
"    takePhoto resolves to an empty string once the user has clicked to stop the\n",
"    demo (the preview is removed and the video track stopped); otherwise it\n",
"    resolves to an object holding three timing fields and 'img', a JPEG data URL.\n",
"    \"\"\"\n",
"    js = Javascript(f'''\n",
"        const IMG_SHAPE = {IMG_SHAPE};\n",
"        const IMG_QUALITY = {IMG_QUALITY};\n",
"    ''' + '''\n",
"        var video;\n",
"        var div = null;\n",
"        var stream;\n",
"        var captureCanvas;\n",
"        var imgElement;\n",
"        var labelElement;\n",
"\n",
"        var pendingResolve = null;\n",
"        var shutdown = false;\n",
"\n",
"        function removeDom() {\n",
"            stream.getVideoTracks()[0].stop();\n",
"            video.remove();\n",
"            div.remove();\n",
"            video = null;\n",
"            div = null;\n",
"            stream = null;\n",
"            imgElement = null;\n",
"            captureCanvas = null;\n",
"            labelElement = null;\n",
"        }\n",
"\n",
"        function onAnimationFrame() {\n",
"            if (!shutdown) {\n",
"                window.requestAnimationFrame(onAnimationFrame);\n",
"            }\n",
"            if (pendingResolve) {\n",
"                var result = \"\";\n",
"                if (!shutdown) {\n",
"                    captureCanvas.getContext('2d').drawImage(video, 0, 0, IMG_SHAPE[0], IMG_SHAPE[1]);\n",
"                    result = captureCanvas.toDataURL('image/jpeg', IMG_QUALITY)\n",
"                }\n",
"                var lp = pendingResolve;\n",
"                pendingResolve = null;\n",
"                lp(result);\n",
"            }\n",
"        }\n",
"\n",
"        async function createDom() {\n",
"            if (div !== null) {\n",
"                return stream;\n",
"            }\n",
"\n",
"            div = document.createElement('div');\n",
"            div.style.border = '2px solid black';\n",
"            div.style.padding = '3px';\n",
"            div.style.width = '100%';\n",
"            div.style.maxWidth = '600px';\n",
"            document.body.appendChild(div);\n",
"\n",
"            const modelOut = document.createElement('div');\n",
"            modelOut.innerHTML = \"<span>Status: </span>\";\n",
"            labelElement = document.createElement('span');\n",
"            labelElement.innerText = 'No data';\n",
"            labelElement.style.fontWeight = 'bold';\n",
"            modelOut.appendChild(labelElement);\n",
"            div.appendChild(modelOut);\n",
"\n",
"            video = document.createElement('video');\n",
"            video.style.display = 'block';\n",
"            video.width = div.clientWidth - 6;\n",
"            video.setAttribute('playsinline', '');\n",
"            video.onclick = () => { shutdown = true; };\n",
"            stream = await navigator.mediaDevices.getUserMedia(\n",
"                {video: { facingMode: \"environment\"}});\n",
"            div.appendChild(video);\n",
"\n",
"            imgElement = document.createElement('img');\n",
"            imgElement.style.position = 'absolute';\n",
"            imgElement.style.zIndex = 1;\n",
"            imgElement.onclick = () => { shutdown = true; };\n",
"            div.appendChild(imgElement);\n",
"\n",
"            const instruction = document.createElement('div');\n",
"            instruction.innerHTML =\n",
"                '<span style=\"color: red; font-weight: bold;\">' +\n",
"                'When finished, click here or on the video to stop this demo</span>';\n",
"            div.appendChild(instruction);\n",
"            instruction.onclick = () => { shutdown = true; };\n",
"\n",
"            video.srcObject = stream;\n",
"            await video.play();\n",
"\n",
"            captureCanvas = document.createElement('canvas');\n",
"            captureCanvas.width = IMG_SHAPE[0]; //video.videoWidth;\n",
"            captureCanvas.height = IMG_SHAPE[1]; //video.videoHeight;\n",
"            window.requestAnimationFrame(onAnimationFrame);\n",
"\n",
"            return stream;\n",
"        }\n",
"\n",
"        async function takePhoto(label, imgData) {\n",
"            if (shutdown) {\n",
"                removeDom();\n",
"                shutdown = false;\n",
"                return '';\n",
"            }\n",
"\n",
"            var preCreate = Date.now();\n",
"            stream = await createDom();\n",
"\n",
"            var preShow = Date.now();\n",
"            if (label != \"\") {\n",
"                labelElement.innerHTML = label;\n",
"            }\n",
"\n",
"            if (imgData != \"\") {\n",
"                var videoRect = video.getClientRects()[0];\n",
"                imgElement.style.top = videoRect.top + \"px\";\n",
"                imgElement.style.left = videoRect.left + \"px\";\n",
"                imgElement.style.width = videoRect.width + \"px\";\n",
"                imgElement.style.height = videoRect.height + \"px\";\n",
"                imgElement.src = imgData;\n",
"            }\n",
"\n",
"            var preCapture = Date.now();\n",
"            var result = await new Promise((resolve, reject) => pendingResolve = resolve);\n",
"            if (result === \"\") {\n",
"                // onAnimationFrame only resolves with an empty string when the user\n",
"                // clicked to stop while this frame was pending. Tear down here and\n",
"                // report the same empty reply as the early-exit branch above, so the\n",
"                // caller never receives an object without image data.\n",
"                removeDom();\n",
"                shutdown = false;\n",
"                return '';\n",
"            }\n",
"\n",
"            return {\n",
"                'create': preShow - preCreate,\n",
"                'show': preCapture - preShow,\n",
"                'capture': Date.now() - preCapture,\n",
"                'img': result,\n",
"            };\n",
"        }\n",
"    ''')\n",
"    display(js)\n",
"\n",
"def take_photo(label, img_data):\n",
"    \"\"\"Call the JavaScript takePhoto function and return its reply.\n",
"\n",
"    Args:\n",
"        label: Status text to show; an empty string leaves the current text as is.\n",
"        img_data: Data URL of the overlay image; an empty string leaves the\n",
"            current overlay as is.\n",
"\n",
"    Returns:\n",
"        Whatever takePhoto resolves to, as converted by eval_js.\n",
"    \"\"\"\n",
"    # json.dumps produces quoted, fully escaped literals that are also valid\n",
"    # JavaScript, so quotes, backslashes or newlines in the arguments cannot\n",
"    # break (or inject code into) the evaluated call.\n",
"    js_call = f'takePhoto({json.dumps(str(label))}, {json.dumps(str(img_data))})'\n",
"    return eval_js(js_call)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": ["### 0.3. Define Python functions to decode captured frames and draw detection annotations"],
"metadata": { "id": "pUYfZKNlhSFn" }
},
{
"cell_type": "code",
"metadata": { "id": "A0ZgJiQBskDt" },
"source": [
"def js_response_to_image(js_response) -> Image.Image:\n",
"    \"\"\"Decode the frame carried by a takePhoto reply into a PIL image.\n",
"\n",
"    Args:\n",
"        js_response: Mapping whose 'img' entry is a data URL of the form\n",
"            '<header>,<base64 payload>'.\n",
"\n",
"    Returns:\n",
"        The image opened from the decoded payload bytes.\n",
"\n",
"    Raises:\n",
"        ValueError: If the 'img' entry does not contain exactly one comma.\n",
"    \"\"\"\n",
"    _header, payload = js_response['img'].split(',')\n",
"    return Image.open(io.BytesIO(b64decode(payload)))\n",
"\n",
"def turn_non_black_pixels_visible(rgba_compatible_array: np.ndarray) -> np.ndarray:\n",
" rgba_compatible_array[:, :, 3] = (rgba_compatible_array.max(axis=2) > 0).astype(int) * 255\n",
" return rgba_compatible_array\n",
"\n",
"def black_transparent_rgba_canvas(w, h) -> np.ndarray:\n",
" return np.zeros([w, h, 4], dtype=np.uint8)\n",
"\n",
"def draw_annotations_on_transparent_bg(detection_result: Results) -> Image.Image:\n",
"    \"\"\"Render a result's annotations alone, on a transparent background.\n",
"\n",
"    The annotations are plotted (masks disabled) onto an all-zero four-channel\n",
"    canvas sized from `detection_result.orig_shape`; every pixel the plot\n",
"    left non-zero is then made opaque and the array is wrapped as an RGBA image.\n",
"    \"\"\"\n",
"    blank_canvas = black_transparent_rgba_canvas(*detection_result.orig_shape)\n",
"    annotated = detection_result.plot(font='verdana', masks=False, img=blank_canvas)\n",
"    return Image.fromarray(turn_non_black_pixels_visible(annotated), 'RGBA')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": { "id": "iIlKEQ3ruDCw" },
"source": ["## 1. Perform real-time object detection in webcam stream"]
},
{
"cell_type": "code",
"metadata": { "id": "hZi6NXSDyAY6" },
"source": [
"start_stream()\n",
"img_data = ''  # overlay produced by the previous iteration; empty for the first frame\n",
"while True:\n",
"    js_response = take_photo('Capturing...', img_data)\n",
"    # An empty reply, or a reply that carries no frame, means the user stopped the demo.\n",
"    if not js_response or not js_response['img']:\n",
"        break\n",
"    captured_img = js_response_to_image(js_response)\n",
"    # Pass the PIL image itself: Ultralytics reads PIL sources as RGB, whereas a\n",
"    # NumPy array is taken to be BGR, which would swap the red and blue channels.\n",
"    for detection_result in PRE_TRAINED_MODEL(source=captured_img, verbose=False):\n",
"        annotations_img = draw_annotations_on_transparent_bg(detection_result)\n",
"        with io.BytesIO() as buffer:\n",
"            annotations_img.save(buffer, format='png')\n",
"            img_as_base64_str = b64encode(buffer.getvalue()).decode('utf-8')\n",
"        # Shown over the live video by the next take_photo() call.\n",
"        img_data = f'data:image/png;base64,{img_as_base64_str}'"
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment