Daksh Sambhare
Created August 30, 2024

Haptic-Navigator and Visual Information Scanner

A device that assists navigation using binocular vision and reads visual information aloud through audio.

Circuit Diagrams

Depth estimation

Raw format of metric Depth estimation
import argparse
import os

import cv2
import matplotlib.cm
import numpy as np
import torch

from depth_anything_v2.dpt import DepthAnythingV2

# Read the detector's class labels: the file holds one name per line, and the
# trailing newline is stripped so the split does not yield an empty last entry.
classFile = "/home/dsay/Documents/hackster/depthanything/Depth-Anything-V2/metric_depth/cocoobject/Object_Detection_Files/coco.names"
with open(classFile, "rt") as names_file:
    classNames = names_file.read().rstrip("\n").split("\n")

# Graph description (.pbtxt) and frozen weights (.pb) handed to the detection model.
configPath = "/home/dsay/Documents/hackster/depthanything/Depth-Anything-V2/metric_depth/cocoobject/Object_Detection_Files/ssd_mobilenet_v3_large_coco_2020_01_14.pbtxt"
weightsPath = "/home/dsay/Documents/hackster/depthanything/Depth-Anything-V2/metric_depth/cocoobject/Object_Detection_Files/frozen_inference_graph.pb"

# Module-level detector shared by getObjects(); inputs are resized to 320x320
# and preprocessed with a 1/127.5 scale and a 127.5 mean on each channel.
net = cv2.dnn_DetectionModel(weightsPath, configPath)
net.setInputSize(320, 320)
net.setInputScale(1.0 / 127.5)
net.setInputMean((127.5, 127.5, 127.5))

def getObjects(img, thres, nms, draw=True, objects=None):
    """Detect objects in ``img`` and optionally annotate the image in place.

    Runs the module-level detector ``net`` on ``img`` and keeps only the
    detections whose class name appears in ``objects``.

    Args:
        img: Image handed to ``net.detect``; it is drawn on when ``draw`` is true.
        thres: Confidence threshold, forwarded as ``confThreshold``.
        nms: Non-maximum-suppression threshold, forwarded as ``nmsThreshold``.
        draw: When true, draw the bounding box, the upper-cased class name and
            the confidence (as a percentage rounded to two decimals) on ``img``.
        objects: Class names to keep. ``None`` or an empty sequence keeps every
            class in ``classNames``; previously the empty default filtered out
            every detection, so the default call could never report anything.

    Returns:
        A tuple ``(img, objectInfo)`` where ``objectInfo`` is a list of
        ``[box, className]`` entries, one per kept detection.
    """
    wanted = objects if objects else classNames
    classIds, confs, bbox = net.detect(img, confThreshold=thres, nmsThreshold=nms)
    objectInfo = []
    # len() is used on purpose: the truth value of a multi-element array is ambiguous.
    if len(classIds) == 0:
        return img, objectInfo

    for classId, confidence, box in zip(classIds.flatten(), confs.flatten(), bbox):
        # The class ids are treated as 1-based, hence the -1 when indexing the label list.
        className = classNames[classId - 1]
        if className not in wanted:
            continue
        objectInfo.append([box, className])
        if draw:
            cv2.rectangle(img, box, color=(0, 255, 0), thickness=2)
            cv2.putText(img, className.upper(), (box[0]-10, box[1]-30), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)
            cv2.putText(img, str(round(confidence*100, 2)), (box[0]-200, box[1]-30), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)
    return img, objectInfo


def process_frame(raw_image, depth_anything, args, frame_count):
    """Estimate depth for one frame and return it side by side with the frame.

    Args:
        raw_image: Camera frame; passed to ``depth_anything.infer_image`` and
            used as the left-hand panel of the result.
        depth_anything: Model object exposing ``infer_image(image, input_size)``.
        args: Parsed options; ``args.input_size`` and ``args.grayscale`` are read.
        frame_count: Running frame index supplied by the caller; not used here.

    Returns:
        ``raw_image``, a 50-pixel-wide white separator and the depth
        visualisation stacked horizontally.
    """
    # Perform depth estimation on the entire frame.
    # NOTE(review): assumes the returned map has the same height and width as
    # raw_image, otherwise the hstack below fails - confirm against infer_image.
    depth = depth_anything.infer_image(raw_image, args.input_size)

    # Normalise to 0..255 for display; a perfectly flat map would otherwise
    # divide by zero and turn into NaNs before the uint8 cast.
    depth_min = depth.min()
    depth_span = depth.max() - depth_min
    if depth_span > 0:
        depth_visual = (depth - depth_min) / depth_span * 255.0
    else:
        depth_visual = np.zeros_like(depth)
    depth_visual = depth_visual.astype(np.uint8)

    # Both branches must yield a 3-channel uint8 image so it can be stacked
    # next to the colour frame.
    if args.grayscale:
        depth_visual = cv2.cvtColor(depth_visual, cv2.COLOR_GRAY2BGR)
    else:
        # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; prefer the
        # colormap registry and fall back to the old call on older releases.
        registry = getattr(matplotlib, 'colormaps', None)
        cmap = registry['Spectral'] if registry is not None else matplotlib.cm.get_cmap('Spectral')
        # NOTE(review): the colormap yields RGB(A) while OpenCV windows expect
        # BGR - confirm whether a channel flip is wanted here.
        depth_visual = (cmap(depth_visual)[:, :, :3] * 255).astype(np.uint8)

    # Combine the original frame with the depth visualisation.
    split_region = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255
    return np.hstack([raw_image, split_region, depth_visual])

if __name__ == '__main__':
    # Entry point: run metric depth estimation on two webcams and show each
    # camera's frame next to its depth map until a read fails or 'q' is pressed.
    parser = argparse.ArgumentParser(description='Depth Anything V2 Metric Depth Estimation on Webcam')
    parser.add_argument('--video-path1', type=int, default=2, help='Webcam index for video capture (camera 1)')
    parser.add_argument('--video-path2', type=int, default=1, help='Webcam index for video capture (camera 2)')
    parser.add_argument('--input-size', type=int, default=518, help='Input size for image processing')
    parser.add_argument('--outdir', type=str, default='./vis_depth', help='Output directory')
    # The options below are read later in this script but were never declared.
    # NOTE(review): the defaults mirror the upstream Depth Anything V2
    # metric-depth run script - confirm they match the checkpoint used here.
    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg'], help='Encoder variant of the depth model')
    parser.add_argument('--load-from', type=str, default='checkpoints/depth_anything_v2_metric_hypersim_vitl.pth', help='Path to the model checkpoint')
    parser.add_argument('--max-depth', type=float, default=20, help='Maximum depth passed to the model')
    parser.add_argument('--grayscale', action='store_true', help='Show the depth map in grayscale instead of a colormap')
    args = parser.parse_args()

    # Per-encoder constructor arguments.
    # NOTE(review): values taken from the upstream Depth Anything V2 repository.
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]},
    }
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Initialize DepthAnythingV2 model
    depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth})
    # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
    depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu'))
    depth_anything = depth_anything.to(DEVICE).eval()

    # Open webcam captures for both cameras
    cap1 = cv2.VideoCapture(args.video_path1)
    cap2 = cv2.VideoCapture(args.video_path2)
    frame_count = 0
    # Create output directory if not exists
    os.makedirs(args.outdir, exist_ok=True)

    try:
        while True:
            # Read frames from both cameras; stop as soon as either one fails.
            ret1, frame1 = cap1.read()
            ret2, frame2 = cap2.read()
            if not ret1 or not ret2:
                break
            frame_count += 1
            # Process frames from both cameras
            combined_result1 = process_frame(frame1, depth_anything, args, frame_count)
            combined_result2 = process_frame(frame2, depth_anything, args, frame_count)
            # Display the processed frames from both cameras
            cv2.imshow('Camera 1 Depth Estimation', combined_result1)
            cv2.imshow('Camera 2 Depth Estimation', combined_result2)
            # Press 'q' to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release webcam captures and close all windows, even if processing raised.
        cap1.release()
        cap2.release()
        cv2.destroyAllWindows()


