Why I Made This
As someone new to coding, I wanted to build something futuristic, fun, and slightly complicated. I settled on a Hand Gesture Mouse Controller built with Python, OpenCV, and MediaPipe. The idea of controlling your computer with nothing but your hand sounded like science fiction, and I was all in.
TL;DR: It wasn't flawless, but I learned a lot about image processing, Python libraries, and how hand gestures can drive real-world actions.
What I Used
- Python
- OpenCV (for video processing)
- MediaPipe (for hand detection and landmarks)
- PyAutoGUI (for mouse movement and clicking)
- pycaw (for volume adjustment; see the short sketch after this list)
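Before the full script, here's a minimal sketch of how the two output-side libraries are driven, using the same calls the program below relies on. The coordinates and the 1 dB step are only illustrative, and pycaw is Windows-only since it wraps the Core Audio API.

```python
import pyautogui
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume

# PyAutoGUI drives the OS cursor directly
pyautogui.moveTo(500, 300)   # illustrative screen coordinates
pyautogui.click()

# pycaw exposes the default speakers' volume endpoint (Windows only)
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
vol_min, vol_max = volume.GetVolumeRange()[:2]   # range in dB
# Nudge the master volume down by 1 dB, clamped to the valid range
volume.SetMasterVolumeLevel(max(vol_min, volume.GetMasterVolumeLevel() - 1.0), None)
```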
How It Works
Here's the big-picture logic:
- Capture webcam frames with OpenCV.
- Find hand landmarks with MediaPipe.
- Track finger locations, such as the thumb, index, and pinky.
- Map the index fingertip's movement to screen coordinates (sketched right after this list).
- Trigger click, scroll, volume, or screenshot gestures.
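The mapping step took the most trial and error, so here's a minimal sketch of just that part, pulled out of the full script below. The move_cursor wrapper is only for illustration; the constants match the ones the script uses. np.interp stretches the inner region of the camera frame (inside a frameR margin) onto the whole screen, and dividing the remaining distance by smoothening acts as a simple low-pass filter so the cursor doesn't jitter.

```python
import numpy as np
import pyautogui

wCam, hCam = 640, 480    # camera resolution
frameR = 100             # margin so you don't have to reach the frame edges
smoothening = 6          # higher = smoother but laggier cursor
screen_w, screen_h = pyautogui.size()
plocX, plocY = 0, 0      # previous cursor position

def move_cursor(x1, y1):
    """Map a fingertip pixel position (x1, y1) in the camera frame to the screen."""
    global plocX, plocY
    # Interpolate from the inner camera region onto the full screen
    x3 = np.interp(x1, (frameR, wCam - frameR), (0, screen_w))
    y3 = np.interp(y1, (frameR, hCam - frameR), (0, screen_h))
    # Move only a fraction of the way toward the target each frame
    clocX = plocX + (x3 - plocX) / smoothening
    clocY = plocY + (y3 - plocY) / smoothening
    pyautogui.moveTo(clocX, clocY)
    plocX, plocY = clocX, clocY
```

Raising smoothening makes the cursor calmer but laggier, and the frameR margin means you don't have to drag your hand all the way to the edge of the frame to reach the screen corners.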
And here's the full script:

```python
import cv2
import mediapipe as mp
import pyautogui
import numpy as np
import time
import math
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume

pyautogui.FAILSAFE = False

# Camera and cursor settings
wCam, hCam = 640, 480      # webcam resolution
frameR = 100               # margin so you don't have to reach the frame edges
smoothening = 6            # higher = smoother but laggier cursor
plocX, plocY = 0, 0        # previous cursor position
clocX, clocY = 0, 0        # current cursor position
click_state = False
scroll_timer = time.time()
screenshot_timer = 0

# Volume control setup (Windows Core Audio via pycaw)
devices = AudioUtilities.GetSpeakers()
interface = devices.Activate(IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
volume = cast(interface, POINTER(IAudioEndpointVolume))
vol_min, vol_max = volume.GetVolumeRange()[:2]   # volume range in dB

# Webcam capture
cap = cv2.VideoCapture(0)
cap.set(3, wCam)   # CAP_PROP_FRAME_WIDTH
cap.set(4, hCam)   # CAP_PROP_FRAME_HEIGHT
screen_w, screen_h = pyautogui.size()

# MediaPipe hand detector
mpHands = mp.solutions.hands
hands = mpHands.Hands(max_num_hands=1, min_detection_confidence=0.75)
mpDraw = mp.solutions.drawing_utils

while True:
    success, img = cap.read()
    if not success:
        continue
    img = cv2.flip(img, 1)                          # mirror so movement feels natural
    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)   # MediaPipe expects RGB
    results = hands.process(imgRGB)

    if results.multi_hand_landmarks:
        for handLms in results.multi_hand_landmarks:
            lm = handLms.landmark
            x1 = int(lm[8].x * wCam)   # index fingertip in pixel coordinates
            y1 = int(lm[8].y * hCam)
            cx, cy = int(lm[0].x * wCam), int(lm[0].y * hCam)   # wrist position

            # Which fingers are up? (index, middle, ring, pinky)
            tips = [8, 12, 16, 20]
            fingers = [1 if lm[tip].y < lm[tip - 2].y else 0 for tip in tips]

            # Only the index finger up: move the cursor
            if fingers == [1, 0, 0, 0]:
                x3 = np.interp(x1, (frameR, wCam - frameR), (0, screen_w))
                y3 = np.interp(y1, (frameR, hCam - frameR), (0, screen_h))
                clocX = plocX + (x3 - plocX) / smoothening
                clocY = plocY + (y3 - plocY) / smoothening
                pyautogui.moveTo(clocX, clocY)
                plocX, plocY = clocX, clocY

            # Pinch (thumb tip touches index tip): click once per pinch
            thumb_tip = lm[4]
            index_tip = lm[8]
            dist_click = np.linalg.norm(
                np.array([thumb_tip.x, thumb_tip.y]) - np.array([index_tip.x, index_tip.y])
            )
            if dist_click < 0.03 and not click_state:
                pyautogui.click()
                click_state = True
            elif dist_click > 0.05:
                click_state = False

            # Index and middle fingers up: scroll (throttled to every 0.25 s)
            if fingers[0] == 1 and fingers[1] == 1:
                if time.time() - scroll_timer > 0.25:
                    if lm[8].y < lm[6].y and lm[12].y < lm[10].y:
                        pyautogui.scroll(-60)
                    elif lm[8].y > lm[6].y and lm[12].y > lm[10].y:
                        pyautogui.scroll(60)
                    scroll_timer = time.time()

            # Closed fist: tilt the palm to change the volume
            if fingers == [0, 0, 0, 0]:
                x5, y5 = lm[5].x, lm[5].y       # index finger knuckle
                x17, y17 = lm[17].x, lm[17].y   # pinky knuckle
                angle = math.degrees(math.atan2(y17 - y5, x17 - x5))
                if angle > 30:
                    volume.SetMasterVolumeLevel(
                        min(vol_max, volume.GetMasterVolumeLevel() + 1.0), None)
                elif angle < -30:
                    volume.SetMasterVolumeLevel(
                        max(vol_min, volume.GetMasterVolumeLevel() - 1.0), None)

            # All four fingers up for 2 seconds: take a screenshot
            if fingers == [1, 1, 1, 1]:
                if screenshot_timer == 0:
                    screenshot_timer = time.time()
                elif time.time() - screenshot_timer > 2:
                    pyautogui.screenshot().save("screenshot.png")
                    screenshot_timer = 0
            else:
                screenshot_timer = 0

            mpDraw.draw_landmarks(img, handLms, mpHands.HAND_CONNECTIONS)

    cv2.imshow("Hand Gesture Controller", img)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()
```