import datetime
import logging

import blobconverter
import cv2
import depthai as dai
import numpy as np
import paho.mqtt.client as mqtt

import events.events_pb2

logger = logging.getLogger(__name__)

# NOTE: HostSeqSync, CTCCodec and the `east` helpers used in run() are assumed to be
# provided alongside this module (they originate from the depthai text-detection /
# text-recognition example).


def to_tensor_result(packet):
    # Map each output tensor name to its FP16 data as a numpy array
    return {
        name: np.array(packet.getLayerFp16(name))
        for name in [tensor.name for tensor in packet.getRaw().tensors]
    }


def to_planar(frame):
    # HWC (interleaved) -> CHW (planar), flattened for ImgFrame.setData()
    return frame.transpose(2, 0, 1).flatten()


class FramePublisher:
    def __init__(self, mqtt_client: mqtt.Client, frame_topic: str, img_width: int, img_height: int):
        self._mqtt_client = mqtt_client
        self._frame_topic = frame_topic
        self._img_width = img_width
        self._img_height = img_height
        self._pipeline = self._configure_pipeline()

    def _configure_pipeline(self) -> dai.Pipeline:
        logger.info("configure pipeline")
        pipeline = dai.Pipeline()

        version = "2021.2"
        pipeline.setOpenVINOVersion(version=dai.OpenVINO.Version.VERSION_2021_2)

        # Camera, control and video outputs used by the text-detection stages below.
        # Note: cam_rgb further down also targets the RGB socket; depending on the device
        # and depthai version, only one ColorCamera per socket may be usable.
        colorCam = pipeline.create(dai.node.ColorCamera)
        colorCam.setPreviewSize(256, 256)
        colorCam.setVideoSize(1024, 1024)  # 4 times larger in both axis
        colorCam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
        colorCam.setInterleaved(False)
        colorCam.setBoardSocket(dai.CameraBoardSocket.RGB)
        colorCam.setFps(10)

        controlIn = pipeline.create(dai.node.XLinkIn)
        controlIn.setStreamName('control')
        controlIn.out.link(colorCam.inputControl)

        cam_xout = pipeline.create(dai.node.XLinkOut)
        cam_xout.setStreamName('video')
        colorCam.video.link(cam_xout.input)

        # ---------------------------------------
        # 1st stage NN - text-detection
        # ---------------------------------------
        nn = pipeline.create(dai.node.NeuralNetwork)
        nn.setBlobPath(
            blobconverter.from_zoo(name="east_text_detection_256x256", zoo_type="depthai", shaves=6,
                                   version=version))
        colorCam.preview.link(nn.input)
        nn_xout = pipeline.create(dai.node.XLinkOut)
        nn_xout.setStreamName('detections')
        nn.out.link(nn_xout.input)

        # ---------------------------------------
        # 2nd stage NN - text-recognition-0012
        # ---------------------------------------
        manip = pipeline.create(dai.node.ImageManip)
        manip.setWaitForConfigInput(True)

        manip_img = pipeline.create(dai.node.XLinkIn)
        manip_img.setStreamName('manip_img')
        manip_img.out.link(manip.inputImage)

        manip_cfg = pipeline.create(dai.node.XLinkIn)
        manip_cfg.setStreamName('manip_cfg')
        manip_cfg.out.link(manip.inputConfig)

        manip_xout = pipeline.create(dai.node.XLinkOut)
        manip_xout.setStreamName('manip_out')

        nn2 = pipeline.create(dai.node.NeuralNetwork)
        nn2.setBlobPath(blobconverter.from_zoo(name="text-recognition-0012", shaves=6, version=version))
        nn2.setNumInferenceThreads(2)
        manip.out.link(nn2.input)
        manip.out.link(manip_xout.input)

        nn2_xout = pipeline.create(dai.node.XLinkOut)
        nn2_xout.setStreamName("recognitions")
        nn2.out.link(nn2_xout.input)

        # RGB camera whose preview frames are published over MQTT
        cam_rgb = pipeline.create(dai.node.ColorCamera)
        xout_rgb = pipeline.create(dai.node.XLinkOut)
        xout_rgb.setStreamName("rgb")

        # Properties
        cam_rgb.setBoardSocket(dai.CameraBoardSocket.RGB)
        cam_rgb.setPreviewSize(width=self._img_width, height=self._img_height)
        cam_rgb.setInterleaved(False)
        cam_rgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.RGB)
        cam_rgb.setFps(30)

        # Linking
        cam_rgb.preview.link(xout_rgb.input)

        logger.info("pipeline configured")
        return pipeline

    def run(self):
        # First phase: text detection/recognition preview loop (exits on 'q')
        with dai.Device(self._pipeline) as device:
            # This should be set to blocking, but that would lead to extreme queuing/latency!
            q_vid = device.getOutputQueue("video", 4, blocking=False)
            q_det = device.getOutputQueue("detections", 4, blocking=False)
            q_rec = device.getOutputQueue("recognitions", 4, blocking=True)
            q_manip_img = device.getInputQueue("manip_img")
            q_manip_cfg = device.getInputQueue("manip_cfg")
            q_manip_out = device.getOutputQueue("manip_out", 4, blocking=False)
            controlQueue = device.getInputQueue('control')

            frame = None
            cropped_stacked = None
            rotated_rectangles = []
            rec_pushed = 0
            rec_received = 0
            host_sync = HostSeqSync()

            characters = '0123456789abcdefghijklmnopqrstuvwxyz#'
            codec = CTCCodec(characters)

            # Trigger continuous autofocus once at startup
            ctrl = dai.CameraControl()
            ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.CONTINUOUS_VIDEO)
            ctrl.setAutoFocusTrigger()
            controlQueue.send(ctrl)

            while True:
                vid_in = q_vid.tryGet()
                if vid_in is not None:
                    host_sync.add_msg(vid_in)

                # Multiple recognition results may be available, read until the queue is empty
                while True:
                    in_rec = q_rec.tryGet()
                    if in_rec is None:
                        break
                    rec_data = np.array(in_rec.getFirstLayerFp16()).reshape(30, 1, 37)
                    decoded_text = codec.decode(rec_data)[0]
                    pos = rotated_rectangles[rec_received]
                    print("{:2}: {:20}".format(rec_received, decoded_text),
                          "center({:3},{:3}) size({:3},{:3}) angle{:5.1f} deg".format(
                              int(pos[0][0]), int(pos[0][1]), pos[1][0], pos[1][1], pos[2]))
                    # Draw the recognized text on the placeholder strip on the right side of 'cropped_stacked'
                    if cropped_stacked is not None:
                        cv2.putText(cropped_stacked, decoded_text, (120 + 10, 32 * rec_received + 24),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
                        cv2.imshow('cropped_stacked', cropped_stacked)
                    rec_received += 1

                if cv2.waitKey(1) == ord('q'):
                    break

                if rec_received >= rec_pushed:
                    in_det = q_det.tryGet()
                    if in_det is not None:
                        frame = host_sync.get_msg(in_det.getSequenceNum()).getCvFrame().copy()

                        scores, geom1, geom2 = to_tensor_result(in_det).values()
                        scores = np.reshape(scores, (1, 1, 64, 64))
                        geom1 = np.reshape(geom1, (1, 4, 64, 64))
                        geom2 = np.reshape(geom2, (1, 1, 64, 64))

                        bboxes, confs, angles = east.decode_predictions(scores, geom1, geom2)
                        boxes, angles = east.non_max_suppression(np.array(bboxes), probs=confs,
                                                                 angles=np.array(angles))
                        rotated_rectangles = [
                            east.get_cv_rotated_rect(bbox, angle * -1)
                            for (bbox, angle) in zip(boxes, angles)
                        ]

                        rec_received = 0
                        rec_pushed = len(rotated_rectangles)
                        if rec_pushed:
                            print("====== Pushing for recognition, count:", rec_pushed)
                        cropped_stacked = None
                        for idx, rotated_rect in enumerate(rotated_rectangles):
                            # Detections are done on 256x256 frames, but the frame sent back is 1024x1024,
                            # so center and size values are multiplied by 4
                            rotated_rect[0][0] = rotated_rect[0][0] * 4
                            rotated_rect[0][1] = rotated_rect[0][1] * 4
                            rotated_rect[1][0] = rotated_rect[1][0] * 4
                            rotated_rect[1][1] = rotated_rect[1][1] * 4

                            # Draw detection crop area on input frame
                            points = np.int0(cv2.boxPoints(rotated_rect))
                            print(rotated_rect)
                            cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1,
                                          lineType=cv2.LINE_8)

                            # TODO make it work taking args like in OpenCV:
                            # rr = ((256, 256), (128, 64), 30)
                            rr = dai.RotatedRect()
                            rr.center.x = rotated_rect[0][0]
                            rr.center.y = rotated_rect[0][1]
                            rr.size.width = rotated_rect[1][0]
                            rr.size.height = rotated_rect[1][1]
                            rr.angle = rotated_rect[2]
                            cfg = dai.ImageManipConfig()
                            cfg.setCropRotatedRect(rr, False)
                            cfg.setResize(120, 32)

                            # Send the full frame once, then only configs that reuse the previous image
                            if idx == 0:
                                h, w, _ = frame.shape
                                imgFrame = dai.ImgFrame()
                                imgFrame.setData(to_planar(frame))
                                imgFrame.setType(dai.ImgFrame.Type.BGR888p)
                                imgFrame.setWidth(w)
                                imgFrame.setHeight(h)
                                q_manip_img.send(imgFrame)
                            else:
                                cfg.setReusePreviousImage(True)
                            q_manip_cfg.send(cfg)

                            # Get manipulated (cropped and resized) image back from the device
                            transformed = q_manip_out.get().getCvFrame()

                            rec_placeholder_img = np.zeros((32, 200, 3), np.uint8)
                            transformed = np.hstack((transformed, rec_placeholder_img))
                            if cropped_stacked is None:
                                cropped_stacked = transformed
                            else:
                                cropped_stacked = np.vstack((cropped_stacked, transformed))

                if cropped_stacked is not None:
                    cv2.imshow('cropped_stacked', cropped_stacked)

                if frame is not None:
                    cv2.imshow('frame', frame)

                key = cv2.waitKey(1)
                if key == ord('q'):
                    break
                elif key == ord('t'):
                    print("Autofocus trigger (and disable continuous)")
                    ctrl = dai.CameraControl()
                    ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.AUTO)
                    ctrl.setAutoFocusTrigger()
                    controlQueue.send(ctrl)

        # Second phase: connect to the device and publish RGB frames over MQTT
        with dai.Device(self._pipeline) as device:
            logger.info('MxId: %s', device.getDeviceInfo().getMxId())
            logger.info('USB speed: %s', device.getUsbSpeed())
            logger.info('Connected cameras: %s', device.getConnectedCameras())
            logger.info("output queues found: %s", device.getOutputQueueNames())

            device.startPipeline()

            # Queues
            queue_size = 4
            q_rgb = device.getOutputQueue("rgb", maxSize=queue_size, blocking=False)

            while True:
                try:
                    logger.debug("wait for new frame")
                    in_rgb = q_rgb.get()  # blocking call, waits until new data has arrived

                    im_resize = in_rgb.getCvFrame()

                    is_success, im_buf_arr = cv2.imencode(".jpg", im_resize)
                    if not is_success:
                        logger.warning("unable to encode frame to jpeg, skipping frame")
                        continue
                    byte_im = im_buf_arr.tobytes()

                    now = datetime.datetime.now()
                    frame_msg = events.events_pb2.FrameMessage()
                    frame_msg.id.name = "robocar-oak-camera-oak"
                    frame_msg.id.id = str(int(now.timestamp() * 1000))
                    frame_msg.id.created_at.FromDatetime(now)
                    frame_msg.frame = byte_im

                    logger.debug("publish frame event to %s", self._frame_topic)
                    self._mqtt_client.publish(topic=self._frame_topic,
                                              payload=frame_msg.SerializeToString(),
                                              qos=0,
                                              retain=False)
                except Exception as e:
                    logger.exception("unexpected error: %s", str(e))
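
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): one way to wire
# FramePublisher to an MQTT broker using the paho-mqtt 1.x client API.
# Broker host/port, topic name and frame size below are assumptions chosen for
# the example only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    client = mqtt.Client()
    client.connect("localhost", 1883)  # assumed local broker
    client.loop_start()

    publisher = FramePublisher(mqtt_client=client,
                               frame_topic="robocar/oak/camera/frame",  # assumed topic
                               img_width=160,
                               img_height=120)
    try:
        publisher.run()
    finally:
        client.loop_stop()
        client.disconnect()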