Mr.洋
Published © MIT

RK3576 + Hailo-8: 12x AI Performance Boost

Deploy YOLOv11n on Seeed Studio's reComputer RK3576 with the Hailo-8 module – boosting inference from 2.3 FPS to 28.1 FPS.

BeginnerFull instructions provided4 hours16
RK3576 + Hailo-8: 12x AI Performance Boost

Things used in this project

Hardware components

reComputer RK3576
Seeed Studio reComputer RK3576
×1
Hailo-8
×1
USB Webcam
×1

Software apps and online services

Seeed Studio reComputer AI Lab
VS Code
Microsoft VS Code

Story

Read more

Code

webcam_yolo11n

Python
import numpy as np
import cv2
import time
from hailo_platform import (VDevice, HEF, InferVStreams, ConfigureParams,
                            HailoStreamInterface, InputVStreamParams, OutputVStreamParams)

# ================= Configuration =================
HEF_PATH = 'yolov11n.hef'
DEVICE_ID = "/dev/video0"  # Update based on v4l2-ctl output
CONF_THRESHOLD = 0.45

# COCO Dataset 80 Class Labels
COCO_CLASSES = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
    "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush"
]
# ==================================================

def main():
    # 1. Initialize Hailo Hardware
    hef = HEF(HEF_PATH)
    input_vstream_info = hef.get_input_vstream_infos()[0]
    input_h, input_w = input_vstream_info.shape[:2]

    cap = cv2.VideoCapture(DEVICE_ID)
    if not cap.isOpened():
        print("Cannot open webcam")
        return

    # Setup inference variables
    prev_time = 0

    with VDevice() as target:
        config_params_dict = ConfigureParams.create_from_hef(hef, HailoStreamInterface.PCIe)
        network_group = target.configure(hef, config_params_dict)[0]
        with network_group.activate():
            vstream_params = (InputVStreamParams.make_from_network_group(network_group),
                              OutputVStreamParams.make_from_network_group(network_group))
            with InferVStreams(network_group, vstream_params[0], vstream_params[1]) as vstreams:
                print("[INFO] Initialization successful! Running YOLOv11n real-time detection...")
                while True:
                    start_time = time.time()  # Record start time for FPS
                    ret, frame = cap.read()
                    if not ret:
                        break

                    # Preprocessing (Convert to RGB based on previous validation)
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    resized = cv2.resize(frame_rgb, (input_w, input_h))
                    input_tensor = np.expand_dims(resized, axis=0)

                    # Inference
                    outputs = vstreams.infer(input_tensor)

                    # Parsing and Drawing
                    h, w, _ = frame.shape
                    for name, class_list in outputs.items():
                        # Iterate through 80 classes
                        for class_id, detections in enumerate(class_list[0]):
                            if len(detections) > 0:
                                for det in detections:
                                    if len(det) >= 5:
                                        ymin, xmin, ymax, xmax, confidence = det[:5]
                                        if confidence > CONF_THRESHOLD:
                                            # Coordinate Mapping
                                            left, top = int(xmin * w), int(ymin * h)
                                            right, bottom = int(xmax * w), int(ymax * h)

                                            # Get class name, display ID if out of bounds
                                            class_name = COCO_CLASSES[class_id] if class_id < len(COCO_CLASSES) else f"ID {class_id}"

                                            # Draw bounding box
                                            cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
                                            # Draw background and label text
                                            label = f"{class_name}: {confidence:.2f}"
                                            cv2.putText(frame, label, (left, top - 10),
                                                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                    # Calculate and display real-time FPS
                    curr_time = time.time()
                    fps = 1 / (curr_time - start_time)
                    # Print in the top left corner
                    cv2.putText(frame, f"FPS: {fps:.1f}", (20, 40),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

                    # Display window
                    cv2.imshow('reComputer RK3576 - Hailo YOLOv11n', frame)
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

webcam_npu_save

Python
import numpy as np
import cv2
import time
import os
from rknnlite.api import RKNNLite

# ================= Configuration =================
RKNN_MODEL_PATH = 'yolo11n.rknn'  # RKNN 模型路径
DEVICE_ID = 0  # 摄像头设备号,对应 /dev/video0
CONF_THRESHOLD = 0.45
OUTPUT_DIR = "npu_detection_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# COCO Dataset 80 Class Labels
COCO_CLASSES = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
    "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush"
]
# ==================================================

def post_process(outputs, frame, conf_threshold):
    """解码 YOLOv8 风格的原始输出(解耦头)"""
    h, w = frame.shape[:2]
    detections = []
    
    # 定义三个尺度的特征图尺寸
    scales = [(80, 80), (40, 40), (20, 20)]
    
    # 提取输出 (索引对应关系)
    # 0,3,6: 回归 (reg) -> [64, 80, 80] 等
    # 1,4,7: 分类 (cls) -> [80, 80, 80] 等
    # 2,5,8: 目标性 (obj) -> [1, 80, 80] 等
    reg_outputs = [outputs[0], outputs[3], outputs[6]]
    cls_outputs = [outputs[1], outputs[4], outputs[7]]
    obj_outputs = [outputs[2], outputs[5], outputs[8]]

    # 遍历三个尺度
    for reg, cls, obj, (h_feat, w_feat) in zip(reg_outputs, cls_outputs, obj_outputs, scales):
        # 将张量展平并调整维度顺序
        reg = reg.squeeze(0).transpose(1, 2, 0).reshape(-1, 64)  # [num_boxes, 64]
        cls = cls.squeeze(0).transpose(1, 2, 0).reshape(-1, 80)  # [num_boxes, 80]
        obj = obj.squeeze(0).transpose(1, 2, 0).reshape(-1, 1)   # [num_boxes, 1]

        # 对每个特征点进行解码
        for i in range(reg.shape[0]):
            # 1. 目标性分数
            obj_conf = obj[i][0]
            if obj_conf < conf_threshold:
                continue
            
            # 2. 分类分数
            cls_scores = cls[i] * obj_conf  # 分类分数 * 目标性分数
            class_id = np.argmax(cls_scores)
            confidence = cls_scores[class_id]
            if confidence < conf_threshold:
                continue

            # 3. 解码边界框 (YOLO 格式)
            # 获取特征图中的网格坐标
            row = i // w_feat
            col = i % w_feat
            
            # 从回归头中提取 x, y, w, h 的偏移量
            reg_vals = reg[i]
            dx, dy, dw, dh = reg_vals[0], reg_vals[1], reg_vals[2], reg_vals[3]
            
            # 计算中心点坐标和宽高 (在特征图上的归一化坐标)
            cx = (col + dx) / w_feat
            cy = (row + dy) / h_feat
            width = dw
            height = dh
            
            # 转换为原图坐标
            left = int((cx - width / 2) * w)
            top = int((cy - height / 2) * h)
            right = int((cx + width / 2) * w)
            bottom = int((cy + height / 2) * h)
            
            # 边界检查
            left = max(0, left)
            top = max(0, top)
            right = min(w, right)
            bottom = min(h, bottom)
            
            class_name = COCO_CLASSES[class_id] if class_id < len(COCO_CLASSES) else f"ID {class_id}"
            detections.append((class_name, float(confidence), left, top, right, bottom))
    
    # 移除多余代码(因为这里没有使用 NMS,但建议保留)
    # 注意:如果检测框过多,可以考虑保留 NMS 逻辑
    return detections

def main():
    # 1. 初始化 RKNN
    rknn = RKNNLite()
    
    # 加载模型
    print(f"[INFO] Loading RKNN model from {RKNN_MODEL_PATH}...")
    ret = rknn.load_rknn(RKNN_MODEL_PATH)
    if ret != 0:
        print(f"[ERROR] Load RKNN model failed: {ret}")
        return
    
    # 初始化运行时
    print("[INFO] Initializing RKNN runtime...")
    ret = rknn.init_runtime()
    if ret != 0:
        print(f"[ERROR] Init runtime failed: {ret}")
        return
    print("[INFO] RKNN model loaded successfully")
    
    # 2. 打开摄像头
    cap = cv2.VideoCapture(DEVICE_ID)
    if not cap.isOpened():
        print("Cannot open webcam")
        return
    
    print(f"[INFO] Camera opened. Model input: 640x640")
    print("[INFO] Press Ctrl+C to stop.")
    
    frame_count = 0
    save_interval = 5  # 每5帧保存一张
    
    try:
        while True:
            start_time = time.time()
            ret, frame = cap.read()
            if not ret:
                break
            
            # 预处理
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            resized = cv2.resize(frame_rgb, (640, 640))
            input_tensor = np.expand_dims(resized, axis=0).astype(np.float32) / 255.0
            
            # 推理
            outputs = rknn.inference(inputs=[input_tensor])
            
            # 后处理
            detections = post_process(outputs, frame, CONF_THRESHOLD)
            
            # 计算并显示 FPS
            fps = 1 / (time.time() - start_time)
            cv2.putText(frame, f"NPU FPS: {fps:.1f}", (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
            
            # 每 N 帧保存一张图片(带检测框和FPS)
            if frame_count % save_interval == 0:
                img_path = os.path.join(OUTPUT_DIR, f"npu_frame_{frame_count:06d}.jpg")
                cv2.imwrite(img_path, frame)
                print(f"[Frame {frame_count}] Saved -> {img_path}")
            
            # 终端打印检测信息
            if detections:
                print(f"[Frame {frame_count}] Found {len(detections)} objects, FPS: {fps:.1f}")
                for cls, conf, l, t, r, b in detections[:3]:  # 最多打印3个
                    print(f"  - {cls}: {conf:.2f} at ({l},{t})-({r},{b})")
            
            frame_count += 1
            time.sleep(0.01)
            
    except KeyboardInterrupt:
        print("\n[INFO] Stopped by user.")
    
    cap.release()
    print(f"[INFO] Done. Total frames: {frame_count}")

if __name__ == "__main__":
    main()

Credits

Mr.洋
2 projects • 0 followers
AE

Comments