WIP

fix: regenerate event struct with recent version of protobuf compiler
2022-08-10 12:28:41 +02:00 · 2022-08-02 12:27:15 +02:00
9 changed files with 1027 additions and 103 deletions
--- a/12
+++ b/12
@@ -1,12 +1,3 @@
 FROM docker.io/library/python:3.9-slim AS model
 RUN python3 -m pip install blobconverter
 RUN mkdir -p /models
 RUN blobconverter --zoo-name mobile_object_localizer_192x192 --zoo-type depthai --shaves 6 --version 2021.4 --output-dir /models || echo ""
 RUN ls /models
 #######
 FROM docker.io/library/python:3.9-slim
 # Configure piwheels repo to use pre-compiled numpy wheels for arm
@@ -16,9 +7,6 @@ RUN apt-get update && apt-get install -y libgl1 libglib2.0-0
 RUN pip3 install numpy
 RUN mkdir /models
 COPY --from=model /models/mobile_object_localizer_192x192_openvino_2021.4_6shave.blob /models/mobile_object_localizer_192x192_openvino_2021.4_6shave.blob
 ADD requirements.txt requirements.txt
 RUN pip3 install -r requirements.txt
--- a/camera/cli.py
+++ b/camera/cli.py
@@ -1,25 +1,19 @@
 """
 Publish data from oak-lite device
-Usage: rc-oak-camera [-u USERNAME | --mqtt-username=USERNAME] [--mqtt-password=PASSWORD] \
+Usage: rc-oak-camera [-u USERNAME | --mqtt-username=USERNAME] [--mqtt-password=PASSWORD] [--mqtt-broker=HOSTNAME] \
-    [--mqtt-broker-host=HOSTNAME] [--mqtt-broker-port=PORT] \
+    [--mqtt-topic-robocar-oak-camera="TOPIC_CAMERA"] [--mqtt-client-id=CLIENT_ID] \
-    [--mqtt-topic-robocar-oak-camera="TOPIC_CAMERA"] [--mqtt-topic-robocar-objects="TOPIC_OBJECTS"] \
+    [-H IMG_HEIGHT | --image-height=IMG_HEIGHT] [-W IMG_WIDTH | --image-width=IMG_width]
    [--mqtt-client-id=CLIENT_ID] \
    [-H IMG_HEIGHT | --image-height=IMG_HEIGHT] [-W IMG_WIDTH | --image-width=IMG_width] \
    [-t OBJECTS_THRESHOLD | --objects-threshold=OBJECTS_THRESHOLD]
 Options:
 -h --help                                               Show this screen.
 -u USERID --mqtt-username=USERNAME                      MQTT user
 -p PASSWORD --mqtt-password=PASSWORD                    MQTT password
-b HOSTNAME --mqtt-broker-host=HOSTNAME                 MQTT broker host
+-b HOSTNAME --mqtt-broker=HOSTNAME                      MQTT broker host
 -P HOSTNAME --mqtt-broker-port=PORT                     MQTT broker port
 -C CLIENT_ID --mqtt-client-id=CLIENT_ID                 MQTT client id
 -c TOPIC_CAMERA --mqtt-topic-robocar-oak-camera=TOPIC_CAMERA        MQTT topic where to publish robocar-oak-camera frames
 -o TOPIC_OBJECTS --mqtt-topic-robocar-objects=TOPIC_OBJECTS         MQTT topic where to publish objects detection results
 -H IMG_HEIGHT --image-height=IMG_HEIGHT                 IMG_HEIGHT image height
 -W IMG_WIDTH --image-width=IMG_width                    IMG_WIDTH image width
 -t OBJECTS_THRESHOLD --objects-threshold=OBJECTS_THRESHOLD    OBJECTS_THRESHOLD threshold to filter objects detected
 """
 import logging
 import os
@@ -33,13 +27,13 @@ logging.basicConfig(level=logging.INFO)
 default_client_id = "robocar-depthai"
-def init_mqtt_client(broker_host: str, broker_port, user: str, password: str, client_id: str) -> mqtt.Client:
+def init_mqtt_client(broker_host: str, user: str, password: str, client_id: str) -> mqtt.Client:
    logger.info("Start part.py-robocar-oak-camera")
    client = mqtt.Client(client_id=client_id, clean_session=True, userdata=None, protocol=mqtt.MQTTv311)
    client.username_pw_set(user, password)
    logger.info("Connect to mqtt broker "+ broker_host)
-    client.connect(host=broker_host, port=broker_port, keepalive=60)
+    client.connect(host=broker_host, port=1883, keepalive=60)
    logger.info("Connected to mqtt broker")
    return client
@@ -49,21 +43,16 @@ def execute_from_command_line():
    args = docopt(__doc__)
-    client = init_mqtt_client(broker_host=get_default_value(args["--mqtt-broker-host"], "MQTT_BROKER_HOST", "localhost"),
+    client = init_mqtt_client(broker_host=get_default_value(args["--mqtt-broker"], "MQTT_BROKER", "localhost"),
                              broker_port=int(get_default_value(args["--mqtt-broker-port"], "MQTT_BROKER_PORT", "1883")),
                              user=get_default_value(args["--mqtt-username"], "MQTT_USERNAME", ""),
                              password=get_default_value(args["--mqtt-password"], "MQTT_PASSWORD", ""),
                              client_id=get_default_value(args["--mqtt-client-id"], "MQTT_CLIENT_ID",
                                                          default_client_id),
                              )
    frame_topic = get_default_value(args["--mqtt-topic-robocar-oak-camera"], "MQTT_TOPIC_CAMERA", "/oak/camera_rgb")
    objects_topic = get_default_value(args["--mqtt-topic-robocar-objects"], "MQTT_TOPIC_OBJECTS", "/objects")
    frame_processor = cam.FramePublisher(mqtt_client=client,
                                         frame_topic=frame_topic,
                                         objects_topic=objects_topic,
                                         objects_threshold=float(get_default_value(args["--objects-threshold"],
                                                                                   "OBJECTS_THRESHOLD", 0.2)),
                                         img_width=int(get_default_value(args["--image-width"], "IMAGE_WIDTH", 160)),
                                         img_height=int(get_default_value(args["--image-height"], "IMAGE_HEIGHT", 120)))
    frame_processor.run()
--- a/camera/depthai.py
+++ b/camera/depthai.py
@@ -6,22 +6,25 @@ import events.events_pb2
 import depthai as dai
 import cv2
 import numpy as np
 logger = logging.getLogger(__name__)
-NN_PATH = "/models/mobile_object_localizer_192x192_openvino_2021.4_6shave.blob"
+
-NN_WIDTH = 192
+def to_tensor_result(packet):
-NN_HEIGHT = 192
+    return {
        name: np.array(packet.getLayerFp16(name))
        for name in [tensor.name for tensor in packet.getRaw().tensors]
    }
 def to_planar(frame):
    return frame.transpose(2, 0, 1).flatten()
 class FramePublisher:
-    def __init__(self, mqtt_client: mqtt.Client, frame_topic: str, objects_topic: str, objects_threshold: float,
+    def __init__(self, mqtt_client: mqtt.Client, frame_topic: str, img_width: int, img_height: int):
                 img_width: int, img_height: int):
        self._mqtt_client = mqtt_client
        self._frame_topic = frame_topic
        self._objects_topic = objects_topic
        self._objects_threshold = objects_threshold
        self._img_width = img_width
        self._img_height = img_height
        self._pipeline = self._configure_pipeline()
@@ -30,29 +33,76 @@ class FramePublisher:
        logger.info("configure pipeline")
        pipeline = dai.Pipeline()
-        pipeline.setOpenVINOVersion(version=dai.OpenVINO.VERSION_2021_4)
+        version = "2021.2"
        pipeline.setOpenVINOVersion(version=dai.OpenVINO.Version.VERSION_2021_2)
-        # Define a neural network that will make predictions based on the source frames
+        # colorCam = pipeline.create(dai.node.ColorCamera)
-        detection_nn = pipeline.create(dai.node.NeuralNetwork)
+        # colorCam.setPreviewSize(256, 256)
-        detection_nn.setBlobPath(NN_PATH)
+        # colorCam.setVideoSize(1024, 1024)  # 4 times larger in both axis
-        detection_nn.setNumPoolFrames(4)
+        # colorCam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
-        detection_nn.input.setBlocking(False)
+        # colorCam.setInterleaved(False)
-        detection_nn.setNumInferenceThreads(2)
+        # colorCam.setBoardSocket(dai.CameraBoardSocket.RGB)
        # colorCam.setFps(10)
        #
        # controlIn = pipeline.create(dai.node.XLinkIn)
        # controlIn.setStreamName('control')
        # controlIn.out.link(colorCam.inputControl)
        #
        # cam_xout = pipeline.create(dai.node.XLinkOut)
        # cam_xout.setStreamName('video')
        # colorCam.video.link(cam_xout.input)
-        xout_nn = pipeline.create(dai.node.XLinkOut)
+        # ---------------------------------------
-        xout_nn.setStreamName("nn")
+        # 1st stage NN - text-detection
-        xout_nn.input.setBlocking(False)
+        # ---------------------------------------
        nn = pipeline.create(dai.node.NeuralNetwork)
        nn.setBlobPath(
            blobconverter.from_zoo(name="east_text_detection_256x256", zoo_type="depthai", shaves=6, version=version))
        colorCam.preview.link(nn.input)
        nn_xout = pipeline.create(dai.node.XLinkOut)
        nn_xout.setStreamName('detections')
        nn.out.link(nn_xout.input)
        # ---------------------------------------
        # 2nd stage NN - text-recognition-0012
        # ---------------------------------------
        # Resize image
        manip = pipeline.create(dai.node.ImageManip)
-        manip.initialConfig.setResize(NN_WIDTH, NN_HEIGHT)
+        manip.setWaitForConfigInput(True)
-        manip.initialConfig.setFrameType(dai.ImgFrame.Type.RGB888p)
+
-        manip.initialConfig.setKeepAspectRatio(False)
+        manip_img = pipeline.create(dai.node.XLinkIn)
        manip_img.setStreamName('manip_img')
        manip_img.out.link(manip.inputImage)
        manip_cfg = pipeline.create(dai.node.XLinkIn)
        manip_cfg.setStreamName('manip_cfg')
        manip_cfg.out.link(manip.inputConfig)
        manip_xout = pipeline.create(dai.node.XLinkOut)
        manip_xout.setStreamName('manip_out')
        nn2 = pipeline.create(dai.node.NeuralNetwork)
        nn2.setBlobPath(blobconverter.from_zoo(name="text-recognition-0012", shaves=6, version=version))
        nn2.setNumInferenceThreads(2)
        manip.out.link(nn2.input)
        manip.out.link(manip_xout.input)
        nn2_xout = pipeline.create(dai.node.XLinkOut)
        nn2_xout.setStreamName("recognitions")
        nn2.out.link(nn2_xout.input)
        cam_rgb = pipeline.create(dai.node.ColorCamera)
        xout_rgb = pipeline.create(dai.node.XLinkOut)
        xout_rgb.setStreamName("rgb")
        xout_rgb.setStreamName("rgb")
        # Properties
        cam_rgb.setBoardSocket(dai.CameraBoardSocket.RGB)
@@ -61,18 +111,156 @@ class FramePublisher:
        cam_rgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.RGB)
        cam_rgb.setFps(30)
-        # Link preview to manip and manip to nn
+        # Linking
        cam_rgb.preview.link(manip.inputImage)
        manip.out.link(detection_nn.input)
        # Linking to output
        cam_rgb.preview.link(xout_rgb.input)
        detection_nn.out.link(xout_nn.input)
        logger.info("pipeline configured")
        return pipeline
    def run(self):
        with dai.Device(self._pipeline) as device:
            q_vid = device.getOutputQueue("video", 4, blocking=False)
            # This should be set to block, but would get to some extreme queuing/latency!
            q_det = device.getOutputQueue("detections", 4, blocking=False)
            q_rec = device.getOutputQueue("recognitions", 4, blocking=True)
            q_manip_img = device.getInputQueue("manip_img")
            q_manip_cfg = device.getInputQueue("manip_cfg")
            q_manip_out = device.getOutputQueue("manip_out", 4, blocking=False)
            controlQueue = device.getInputQueue('control')
            frame = None
            cropped_stacked = None
            rotated_rectangles = []
            rec_pushed = 0
            rec_received = 0
            host_sync = HostSeqSync()
            characters = '0123456789abcdefghijklmnopqrstuvwxyz#'
            codec = CTCCodec(characters)
            ctrl = dai.CameraControl()
            ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.CONTINUOUS_VIDEO)
            ctrl.setAutoFocusTrigger()
            controlQueue.send(ctrl)
            while True:
                vid_in = q_vid.tryGet()
                if vid_in is not None:
                    host_sync.add_msg(vid_in)
                # Multiple recognition results may be available, read until queue is empty
                while True:
                    in_rec = q_rec.tryGet()
                    if in_rec is None:
                        break
                    rec_data = bboxes = np.array(in_rec.getFirstLayerFp16()).reshape(30, 1, 37)
                    decoded_text = codec.decode(rec_data)[0]
                    pos = rotated_rectangles[rec_received]
                    print("{:2}: {:20}".format(rec_received, decoded_text),
                          "center({:3},{:3}) size({:3},{:3}) angle{:5.1f} deg".format(
                              int(pos[0][0]), int(pos[0][1]), pos[1][0], pos[1][1], pos[2]))
                    # Draw the text on the right side of 'cropped_stacked' - placeholder
                    if cropped_stacked is not None:
                        cv2.putText(cropped_stacked, decoded_text,
                                    (120 + 10, 32 * rec_received + 24),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
                        cv2.imshow('cropped_stacked', cropped_stacked)
                    rec_received += 1
                if cv2.waitKey(1) == ord('q'):
                    break
                if rec_received >= rec_pushed:
                    in_det = q_det.tryGet()
                    if in_det is not None:
                        frame = host_sync.get_msg(in_det.getSequenceNum()).getCvFrame().copy()
                        scores, geom1, geom2 = to_tensor_result(in_det).values()
                        scores = np.reshape(scores, (1, 1, 64, 64))
                        geom1 = np.reshape(geom1, (1, 4, 64, 64))
                        geom2 = np.reshape(geom2, (1, 1, 64, 64))
                        bboxes, confs, angles = east.decode_predictions(scores, geom1, geom2)
                        boxes, angles = east.non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles))
                        rotated_rectangles = [
                            east.get_cv_rotated_rect(bbox, angle * -1)
                            for (bbox, angle) in zip(boxes, angles)
                        ]
                        rec_received = 0
                        rec_pushed = len(rotated_rectangles)
                        if rec_pushed:
                            print("====== Pushing for recognition, count:", rec_pushed)
                        cropped_stacked = None
                        for idx, rotated_rect in enumerate(rotated_rectangles):
                            # Detections are done on 256x256 frames, we are sending back 1024x1024
                            # That's why we multiply center and size values by 4
                            rotated_rect[0][0] = rotated_rect[0][0] * 4
                            rotated_rect[0][1] = rotated_rect[0][1] * 4
                            rotated_rect[1][0] = rotated_rect[1][0] * 4
                            rotated_rect[1][1] = rotated_rect[1][1] * 4
                            # Draw detection crop area on input frame
                            points = np.int0(cv2.boxPoints(rotated_rect))
                            print(rotated_rect)
                            cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1,
                                          lineType=cv2.LINE_8)
                            # TODO make it work taking args like in OpenCV:
                            # rr = ((256, 256), (128, 64), 30)
                            rr = dai.RotatedRect()
                            rr.center.x = rotated_rect[0][0]
                            rr.center.y = rotated_rect[0][1]
                            rr.size.width = rotated_rect[1][0]
                            rr.size.height = rotated_rect[1][1]
                            rr.angle = rotated_rect[2]
                            cfg = dai.ImageManipConfig()
                            cfg.setCropRotatedRect(rr, False)
                            cfg.setResize(120, 32)
                            # Send frame and config to device
                            if idx == 0:
                                w, h, c = frame.shape
                                imgFrame = dai.ImgFrame()
                                imgFrame.setData(to_planar(frame))
                                imgFrame.setType(dai.ImgFrame.Type.BGR888p)
                                imgFrame.setWidth(w)
                                imgFrame.setHeight(h)
                                q_manip_img.send(imgFrame)
                            else:
                                cfg.setReusePreviousImage(True)
                            q_manip_cfg.send(cfg)
                            # Get manipulated image from the device
                            transformed = q_manip_out.get().getCvFrame()
                            rec_placeholder_img = np.zeros((32, 200, 3), np.uint8)
                            transformed = np.hstack((transformed, rec_placeholder_img))
                            if cropped_stacked is None:
                                cropped_stacked = transformed
                            else:
                                cropped_stacked = np.vstack((cropped_stacked, transformed))
                if cropped_stacked is not None:
                    cv2.imshow('cropped_stacked', cropped_stacked)
                if frame is not None:
                    cv2.imshow('frame', frame)
                key = cv2.waitKey(1)
                if key == ord('q'):
                    break
                elif key == ord('t'):
                    print("Autofocus trigger (and disable continuous)")
                    ctrl = dai.CameraControl()
                    ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.AUTO)
                    ctrl.setAutoFocusTrigger()
                    controlQueue.send(ctrl)
        # Connect to device and start pipeline
        with dai.Device(self._pipeline) as device:
            logger.info('MxId: %s', device.getDeviceInfo().getMxId())
@@ -84,8 +272,7 @@ class FramePublisher:
            device.startPipeline()
            # Queues
            queue_size = 4
-            q_rgb = device.getOutputQueue(name="rgb", maxSize=queue_size, blocking=False)
+            q_rgb = device.getOutputQueue("rgb", maxSize=queue_size, blocking=False)
            q_nn = device.getOutputQueue(name="nn", maxSize=queue_size, blocking=False)
            while True:
                try:
@@ -110,42 +297,5 @@ class FramePublisher:
                                              qos=0,
                                              retain=False)
                    in_nn = q_nn.get()
                    # get outputs
                    detection_boxes = np.array(in_nn.getLayerFp16("ExpandDims")).reshape((100, 4))
                    detection_scores = np.array(in_nn.getLayerFp16("ExpandDims_2")).reshape((100,))
                    # keep boxes bigger than threshold
                    mask = detection_scores >= self._objects_threshold
                    boxes = detection_boxes[mask]
                    scores = detection_scores[mask]
                    if boxes.shape[0] > 0:
                        objects_msg = events.events_pb2.ObjectsMessage()
                        objs = []
                        for i in range(boxes.shape[0]):
                            bbox = boxes[i]
                            logger.debug("new object detected: %s", str(bbox))
                            o = events.events_pb2.Object()
                            o.type = events.events_pb2.TypeObject.ANY
                            o.top = bbox[0].astype(float)
                            o.right = bbox[3].astype(float)
                            o.bottom = bbox[2].astype(float)
                            o.left = bbox[1].astype(float)
                            o.confidence = scores[i].astype(float)
                            objs.append(o)
                        objects_msg.objects.extend(objs)
                        objects_msg.frame_ref.name = frame_msg.id.name
                        objects_msg.frame_ref.id = frame_msg.id.id
                        objects_msg.frame_ref.created_at.FromDatetime(now)
                        logger.debug("publish object event to %s", self._frame_topic)
                        self._mqtt_client.publish(topic=self._objects_topic,
                                                  payload=objects_msg.SerializeToString(),
                                                  qos=0,
                                                  retain=False)
                except Exception as e:
                    logger.exception("unexpected error: %s", str(e))
--- a/camera/east.py
+++ b/camera/east.py
@@ -0,0 +1,232 @@
 import cv2
 import depthai
 import numpy as np
 _conf_threshold = 0.5
 def get_cv_rotated_rect(bbox, angle):
    x0, y0, x1, y1 = bbox
    width = abs(x0 - x1)
    height = abs(y0 - y1)
    x = x0 + width * 0.5
    y = y0 + height * 0.5
    return [x.tolist(), y.tolist()], [width.tolist(), height.tolist()], np.rad2deg(angle)
 def rotated_Rectangle(bbox, angle):
    X0, Y0, X1, Y1 = bbox
    width = abs(X0 - X1)
    height = abs(Y0 - Y1)
    x = int(X0 + width * 0.5)
    y = int(Y0 + height * 0.5)
    pt1_1 = (int(x + width / 2), int(y + height / 2))
    pt2_1 = (int(x + width / 2), int(y - height / 2))
    pt3_1 = (int(x - width / 2), int(y - height / 2))
    pt4_1 = (int(x - width / 2), int(y + height / 2))
    t = np.array([[np.cos(angle), -np.sin(angle), x - x * np.cos(angle) + y * np.sin(angle)],
                  [np.sin(angle), np.cos(angle), y - x * np.sin(angle) - y * np.cos(angle)],
                  [0, 0, 1]])
    tmp_pt1_1 = np.array([[pt1_1[0]], [pt1_1[1]], [1]])
    tmp_pt1_2 = np.dot(t, tmp_pt1_1)
    pt1_2 = (int(tmp_pt1_2[0][0]), int(tmp_pt1_2[1][0]))
    tmp_pt2_1 = np.array([[pt2_1[0]], [pt2_1[1]], [1]])
    tmp_pt2_2 = np.dot(t, tmp_pt2_1)
    pt2_2 = (int(tmp_pt2_2[0][0]), int(tmp_pt2_2[1][0]))
    tmp_pt3_1 = np.array([[pt3_1[0]], [pt3_1[1]], [1]])
    tmp_pt3_2 = np.dot(t, tmp_pt3_1)
    pt3_2 = (int(tmp_pt3_2[0][0]), int(tmp_pt3_2[1][0]))
    tmp_pt4_1 = np.array([[pt4_1[0]], [pt4_1[1]], [1]])
    tmp_pt4_2 = np.dot(t, tmp_pt4_1)
    pt4_2 = (int(tmp_pt4_2[0][0]), int(tmp_pt4_2[1][0]))
    points = np.array([pt1_2, pt2_2, pt3_2, pt4_2])
    return points
 def non_max_suppression(boxes, probs=None, angles=None, overlapThresh=0.3):
    # if there are no boxes, return an empty list
    if len(boxes) == 0:
        return [], []
    # if the bounding boxes are integers, convert them to floats -- this
    # is important since we'll be doing a bunch of divisions
    if boxes.dtype.kind == "i":
        boxes = boxes.astype("float")
    # initialize the list of picked indexes
    pick = []
    # grab the coordinates of the bounding boxes
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    # compute the area of the bounding boxes and grab the indexes to sort
    # (in the case that no probabilities are provided, simply sort on the bottom-left y-coordinate)
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    idxs = y2
    # if probabilities are provided, sort on them instead
    if probs is not None:
        idxs = probs
    # sort the indexes
    idxs = np.argsort(idxs)
    # keep looping while some indexes still remain in the indexes list
    while len(idxs) > 0:
        # grab the last index in the indexes list and add the index value to the list of picked indexes
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)
        # find the largest (x, y) coordinates for the start of the bounding box and the smallest (x, y) coordinates
        # for the end of the bounding box
        xx1 = np.maximum(x1[i], x1[idxs[:last]])
        yy1 = np.maximum(y1[i], y1[idxs[:last]])
        xx2 = np.minimum(x2[i], x2[idxs[:last]])
        yy2 = np.minimum(y2[i], y2[idxs[:last]])
        # compute the width and height of the bounding box
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)
        # compute the ratio of overlap
        overlap = (w * h) / area[idxs[:last]]
        # delete all indexes from the index list that have overlap greater than the provided overlap threshold
        idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0])))
    # return only the bounding boxes that were picked
    return boxes[pick].astype("int"), angles[pick]
 def decode_predictions(scores, geometry1, geometry2):
    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []
    angles = []
    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the
        # geometrical data used to derive potential bounding box
        # coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry1[0, 0, y]
        xData1 = geometry1[0, 1, y]
        xData2 = geometry1[0, 2, y]
        xData3 = geometry1[0, 3, y]
        anglesData = geometry2[0, 0, y]
        # loop over the number of columns
        for x in range(0, numCols):
            # if our score does not have sufficient probability,
            # ignore it
            if scoresData[x] < _conf_threshold:
                continue
            # compute the offset factor as our resulting feature
            # maps will be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)
            # extract the rotation angle for the prediction and
            # then compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)
            # use the geometry volume to derive the width and height
            # of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]
            # compute both the starting and ending (x, y)-coordinates
            # for the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)
            # add the bounding box coordinates and probability score
            # to our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])
            angles.append(angle)
    # return a tuple of the bounding boxes and associated confidences
    return (rects, confidences, angles)
 def decode_east(nnet_packet, **kwargs):
    scores = nnet_packet.get_tensor(0)
    geometry1 = nnet_packet.get_tensor(1)
    geometry2 = nnet_packet.get_tensor(2)
    bboxes, confs, angles = decode_predictions(scores, geometry1, geometry2
                                               )
    boxes, angles = non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles))
    boxesangles = (boxes, angles)
    return boxesangles
 def show_east(boxesangles, frame, **kwargs):
    bboxes = boxesangles[0]
    angles = boxesangles[1]
    for ((X0, Y0, X1, Y1), angle) in zip(bboxes, angles):
        width = abs(X0 - X1)
        height = abs(Y0 - Y1)
        cX = int(X0 + width * 0.5)
        cY = int(Y0 + height * 0.5)
        rotRect = ((cX, cY), ((X1 - X0), (Y1 - Y0)), angle * (-1))
        points = rotated_Rectangle(frame, rotRect, color=(255, 0, 0), thickness=1)
        cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, lineType=cv2.LINE_8)
    return frame
 def order_points(pts):
    rect = np.zeros((4, 2), dtype="float32")
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]
    return rect
 def four_point_transform(image, pts):
    rect = order_points(pts)
    (tl, tr, br, bl) = rect
    widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
    widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
    maxWidth = max(int(widthA), int(widthB))
    heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
    heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
    maxHeight = max(int(heightA), int(heightB))
    dst = np.array([
        [0, 0],
        [maxWidth - 1, 0],
        [maxWidth - 1, maxHeight - 1],
        [0, maxHeight - 1]], dtype="float32")
    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
    return warped
--- a/camera/text.py
+++ b/camera/text.py
@@ -0,0 +1,61 @@
 class HostSeqSync:
    def __init__(self):
        self.imfFrames = []
    def add_msg(self, msg):
        self.imfFrames.append(msg)
    def get_msg(self, target_seq):
        for i, imgFrame in enumerate(self.imfFrames):
            if target_seq == imgFrame.getSequenceNum():
                self.imfFrames = self.imfFrames[i:]
                break
        return self.imfFrames[0]
 class CTCCodec(object):
    """ Convert between text-label and text-index """
    def __init__(self, characters):
        # characters (str): set of the possible characters.
        dict_character = list(characters)
        self.dict = {}
        for i, char in enumerate(dict_character):
            self.dict[char] = i + 1
        self.characters = dict_character
        # print(self.characters)
        # input()
    def decode(self, preds):
        """ convert text-index into text-label. """
        texts = []
        index = 0
        # Select max probabilty (greedy decoding) then decode index to character
        preds = preds.astype(np.float16)
        preds_index = np.argmax(preds, 2)
        preds_index = preds_index.transpose(1, 0)
        preds_index_reshape = preds_index.reshape(-1)
        preds_sizes = np.array([preds_index.shape[1]] * preds_index.shape[0])
        for l in preds_sizes:
            t = preds_index_reshape[index:index + l]
            # NOTE: t might be zero size
            if t.shape[0] == 0:
                continue
            char_list = []
            for i in range(l):
                # removing repeated characters and blank.
                if not (i > 0 and t[i - 1] == t[i]):
                    if self.characters[t[i]] != '#':
                        char_list.append(self.characters[t[i]])
            text = ''.join(char_list)
            texts.append(text)
            index += l
        return texts
--- a/east.py
+++ b/east.py
@@ -0,0 +1,229 @@
 import cv2
 import depthai
 import numpy as np
 _conf_threshold = 0.5
 def get_cv_rotated_rect(bbox, angle):
    x0, y0, x1, y1 = bbox
    width = abs(x0 - x1)
    height = abs(y0 - y1)
    x = x0 + width * 0.5
    y = y0 + height * 0.5
    return ([x.tolist(), y.tolist()], [width.tolist(), height.tolist()], np.rad2deg(angle))
 def rotated_Rectangle(bbox, angle):
    X0, Y0, X1, Y1 = bbox
    width = abs(X0 - X1)
    height = abs(Y0 - Y1)
    x = int(X0 + width * 0.5)
    y = int(Y0 + height * 0.5)
    pt1_1 = (int(x + width / 2), int(y + height / 2))
    pt2_1 = (int(x + width / 2), int(y - height / 2))
    pt3_1 = (int(x - width / 2), int(y - height / 2))
    pt4_1 = (int(x - width / 2), int(y + height / 2))
    t = np.array([[np.cos(angle), -np.sin(angle), x - x * np.cos(angle) + y * np.sin(angle)],
                  [np.sin(angle), np.cos(angle), y - x * np.sin(angle) - y * np.cos(angle)],
                  [0, 0, 1]])
    tmp_pt1_1 = np.array([[pt1_1[0]], [pt1_1[1]], [1]])
    tmp_pt1_2 = np.dot(t, tmp_pt1_1)
    pt1_2 = (int(tmp_pt1_2[0][0]), int(tmp_pt1_2[1][0]))
    tmp_pt2_1 = np.array([[pt2_1[0]], [pt2_1[1]], [1]])
    tmp_pt2_2 = np.dot(t, tmp_pt2_1)
    pt2_2 = (int(tmp_pt2_2[0][0]), int(tmp_pt2_2[1][0]))
    tmp_pt3_1 = np.array([[pt3_1[0]], [pt3_1[1]], [1]])
    tmp_pt3_2 = np.dot(t, tmp_pt3_1)
    pt3_2 = (int(tmp_pt3_2[0][0]), int(tmp_pt3_2[1][0]))
    tmp_pt4_1 = np.array([[pt4_1[0]], [pt4_1[1]], [1]])
    tmp_pt4_2 = np.dot(t, tmp_pt4_1)
    pt4_2 = (int(tmp_pt4_2[0][0]), int(tmp_pt4_2[1][0]))
    points = np.array([pt1_2, pt2_2, pt3_2, pt4_2])
    return points
 def non_max_suppression(boxes, probs=None, angles=None, overlapThresh=0.3):
    # if there are no boxes, return an empty list
    if len(boxes) == 0:
        return [], []
    # if the bounding boxes are integers, convert them to floats -- this
    # is important since we'll be doing a bunch of divisions
    if boxes.dtype.kind == "i":
        boxes = boxes.astype("float")
    # initialize the list of picked indexes
    pick = []
    # grab the coordinates of the bounding boxes
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    # compute the area of the bounding boxes and grab the indexes to sort
    # (in the case that no probabilities are provided, simply sort on the bottom-left y-coordinate)
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    idxs = y2
    # if probabilities are provided, sort on them instead
    if probs is not None:
        idxs = probs
    # sort the indexes
    idxs = np.argsort(idxs)
    # keep looping while some indexes still remain in the indexes list
    while len(idxs) > 0:
        # grab the last index in the indexes list and add the index value to the list of picked indexes
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)
        # find the largest (x, y) coordinates for the start of the bounding box and the smallest (x, y) coordinates for the end of the bounding box
        xx1 = np.maximum(x1[i], x1[idxs[:last]])
        yy1 = np.maximum(y1[i], y1[idxs[:last]])
        xx2 = np.minimum(x2[i], x2[idxs[:last]])
        yy2 = np.minimum(y2[i], y2[idxs[:last]])
        # compute the width and height of the bounding box
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)
        # compute the ratio of overlap
        overlap = (w * h) / area[idxs[:last]]
        # delete all indexes from the index list that have overlap greater than the provided overlap threshold
        idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0])))
    # return only the bounding boxes that were picked
    return boxes[pick].astype("int"), angles[pick]
 def decode_predictions(scores, geometry1, geometry2):
    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []
    angles = []
    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the
        # geometrical data used to derive potential bounding box
        # coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry1[0, 0, y]
        xData1 = geometry1[0, 1, y]
        xData2 = geometry1[0, 2, y]
        xData3 = geometry1[0, 3, y]
        anglesData = geometry2[0, 0, y]
        # loop over the number of columns
        for x in range(0, numCols):
            # if our score does not have sufficient probability,
            # ignore it
            if scoresData[x] < _conf_threshold:
                continue
            # compute the offset factor as our resulting feature
            # maps will be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)
            # extract the rotation angle for the prediction and
            # then compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)
            # use the geometry volume to derive the width and height
            # of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]
            # compute both the starting and ending (x, y)-coordinates
            # for the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)
            # add the bounding box coordinates and probability score
            # to our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])
            angles.append(angle)
    # return a tuple of the bounding boxes and associated confidences
    return (rects, confidences, angles)
 def decode_east(nnet_packet, **kwargs):
    scores = nnet_packet.get_tensor(0)
    geometry1 = nnet_packet.get_tensor(1)
    geometry2 = nnet_packet.get_tensor(2)
    bboxes, confs, angles = decode_predictions(scores, geometry1, geometry2
                                               )
    boxes, angles = non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles))
    boxesangles = (boxes, angles)
    return boxesangles
 def show_east(boxesangles, frame, **kwargs):
    bboxes = boxesangles[0]
    angles = boxesangles[1]
    for ((X0, Y0, X1, Y1), angle) in zip(bboxes, angles):
        width = abs(X0 - X1)
        height = abs(Y0 - Y1)
        cX = int(X0 + width * 0.5)
        cY = int(Y0 + height * 0.5)
        rotRect = ((cX, cY), ((X1 - X0), (Y1 - Y0)), angle * (-1))
        points = rotated_Rectangle(frame, rotRect, color=(255, 0, 0), thickness=1)
        cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, lineType=cv2.LINE_8)
    return frame
 def order_points(pts):
    rect = np.zeros((4, 2), dtype="float32")
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]
    return rect
 def four_point_transform(image, pts):
    rect = order_points(pts)
    (tl, tr, br, bl) = rect
    widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
    widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
    maxWidth = max(int(widthA), int(widthB))
    heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
    heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
    maxHeight = max(int(heightA), int(heightB))
    dst = np.array([
        [0, 0],
        [maxWidth - 1, 0],
        [maxWidth - 1, maxHeight - 1],
        [0, maxHeight - 1]], dtype="float32")
    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
    return warped
--- a/events/events_pb2.py
+++ b/events/events_pb2.py
@@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
 from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13\x65vents/events.proto\x12\x0erobocar.events\x1a\x1fgoogle/protobuf/timestamp.proto\"T\n\x08\x46rameRef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\t\x12.\n\ncreated_at\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"C\n\x0c\x46rameMessage\x12$\n\x02id\x18\x01 \x01(\x0b\x32\x18.robocar.events.FrameRef\x12\r\n\x05\x66rame\x18\x02 \x01(\x0c\"d\n\x0fSteeringMessage\x12\x10\n\x08steering\x18\x01 \x01(\x02\x12\x12\n\nconfidence\x18\x02 \x01(\x02\x12+\n\tframe_ref\x18\x03 \x01(\x0b\x32\x18.robocar.events.FrameRef\"d\n\x0fThrottleMessage\x12\x10\n\x08throttle\x18\x01 \x01(\x02\x12\x12\n\nconfidence\x18\x02 \x01(\x02\x12+\n\tframe_ref\x18\x03 \x01(\x0b\x32\x18.robocar.events.FrameRef\"A\n\x10\x44riveModeMessage\x12-\n\ndrive_mode\x18\x01 \x01(\x0e\x32\x19.robocar.events.DriveMode\"f\n\x0eObjectsMessage\x12\'\n\x07objects\x18\x01 \x03(\x0b\x32\x16.robocar.events.Object\x12+\n\tframe_ref\x18\x02 \x01(\x0b\x32\x18.robocar.events.FrameRef\"\x80\x01\n\x06Object\x12(\n\x04type\x18\x01 \x01(\x0e\x32\x1a.robocar.events.TypeObject\x12\x0c\n\x04left\x18\x02 \x01(\x02\x12\x0b\n\x03top\x18\x03 \x01(\x02\x12\r\n\x05right\x18\x04 \x01(\x02\x12\x0e\n\x06\x62ottom\x18\x05 \x01(\x02\x12\x12\n\nconfidence\x18\x06 \x01(\x02\"&\n\x13SwitchRecordMessage\x12\x0f\n\x07\x65nabled\x18\x01 \x01(\x08\"\x8c\x01\n\x0bRoadMessage\x12&\n\x07\x63ontour\x18\x01 \x03(\x0b\x32\x15.robocar.events.Point\x12(\n\x07\x65llipse\x18\x02 \x01(\x0b\x32\x17.robocar.events.Ellipse\x12+\n\tframe_ref\x18\x03 \x01(\x0b\x32\x18.robocar.events.FrameRef\"\x1d\n\x05Point\x12\t\n\x01x\x18\x01 \x01(\x05\x12\t\n\x01y\x18\x02 \x01(\x05\"r\n\x07\x45llipse\x12%\n\x06\x63\x65nter\x18\x01 \x01(\x0b\x32\x15.robocar.events.Point\x12\r\n\x05width\x18\x02 \x01(\x05\x12\x0e\n\x06height\x18\x03 \x01(\x05\x12\r\n\x05\x61ngle\x18\x04 \x01(\x02\x12\x12\n\nconfidence\x18\x05 \x01(\x02\"\x82\x01\n\rRecordMessage\x12+\n\x05\x66rame\x18\x01 \x01(\x0b\x32\x1c.robocar.events.FrameMessage\x12\x31\n\x08steering\x18\x02 \x01(\x0b\x32\x1f.robocar.events.SteeringMessage\x12\x11\n\trecordSet\x18\x03 \x01(\t*-\n\tDriveMode\x12\x0b\n\x07INVALID\x10\x00\x12\x08\n\x04USER\x10\x01\x12\t\n\x05PILOT\x10\x02*2\n\nTypeObject\x12\x07\n\x03\x41NY\x10\x00\x12\x07\n\x03\x43\x41R\x10\x01\x12\x08\n\x04\x42UMP\x10\x02\x12\x08\n\x04PLOT\x10\x03\x42\nZ\x08./eventsb\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13\x65vents/events.proto\x12\x0erobocar.events\x1a\x1fgoogle/protobuf/timestamp.proto\"T\n\x08\x46rameRef\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\t\x12.\n\ncreated_at\x18\x03 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\"C\n\x0c\x46rameMessage\x12$\n\x02id\x18\x01 \x01(\x0b\x32\x18.robocar.events.FrameRef\x12\r\n\x05\x66rame\x18\x02 \x01(\x0c\"d\n\x0fSteeringMessage\x12\x10\n\x08steering\x18\x01 \x01(\x02\x12\x12\n\nconfidence\x18\x02 \x01(\x02\x12+\n\tframe_ref\x18\x03 \x01(\x0b\x32\x18.robocar.events.FrameRef\"d\n\x0fThrottleMessage\x12\x10\n\x08throttle\x18\x01 \x01(\x02\x12\x12\n\nconfidence\x18\x02 \x01(\x02\x12+\n\tframe_ref\x18\x03 \x01(\x0b\x32\x18.robocar.events.FrameRef\"A\n\x10\x44riveModeMessage\x12-\n\ndrive_mode\x18\x01 \x01(\x0e\x32\x19.robocar.events.DriveMode\"f\n\x0eObjectsMessage\x12\'\n\x07objects\x18\x01 \x03(\x0b\x32\x16.robocar.events.Object\x12+\n\tframe_ref\x18\x02 \x01(\x0b\x32\x18.robocar.events.FrameRef\"\x80\x01\n\x06Object\x12(\n\x04type\x18\x01 \x01(\x0e\x32\x1a.robocar.events.TypeObject\x12\x0c\n\x04left\x18\x02 \x01(\x05\x12\x0b\n\x03top\x18\x03 \x01(\x05\x12\r\n\x05right\x18\x04 \x01(\x05\x12\x0e\n\x06\x62ottom\x18\x05 \x01(\x05\x12\x12\n\nconfidence\x18\x06 \x01(\x02\"&\n\x13SwitchRecordMessage\x12\x0f\n\x07\x65nabled\x18\x01 \x01(\x08\"\x8c\x01\n\x0bRoadMessage\x12&\n\x07\x63ontour\x18\x01 \x03(\x0b\x32\x15.robocar.events.Point\x12(\n\x07\x65llipse\x18\x02 \x01(\x0b\x32\x17.robocar.events.Ellipse\x12+\n\tframe_ref\x18\x03 \x01(\x0b\x32\x18.robocar.events.FrameRef\"\x1d\n\x05Point\x12\t\n\x01x\x18\x01 \x01(\x05\x12\t\n\x01y\x18\x02 \x01(\x05\"r\n\x07\x45llipse\x12%\n\x06\x63\x65nter\x18\x01 \x01(\x0b\x32\x15.robocar.events.Point\x12\r\n\x05width\x18\x02 \x01(\x05\x12\x0e\n\x06height\x18\x03 \x01(\x05\x12\r\n\x05\x61ngle\x18\x04 \x01(\x02\x12\x12\n\nconfidence\x18\x05 \x01(\x02\"\x82\x01\n\rRecordMessage\x12+\n\x05\x66rame\x18\x01 \x01(\x0b\x32\x1c.robocar.events.FrameMessage\x12\x31\n\x08steering\x18\x02 \x01(\x0b\x32\x1f.robocar.events.SteeringMessage\x12\x11\n\trecordSet\x18\x03 \x01(\t*-\n\tDriveMode\x12\x0b\n\x07INVALID\x10\x00\x12\x08\n\x04USER\x10\x01\x12\t\n\x05PILOT\x10\x02*2\n\nTypeObject\x12\x07\n\x03\x41NY\x10\x00\x12\x07\n\x03\x43\x41R\x10\x01\x12\x08\n\x04\x42UMP\x10\x02\x12\x08\n\x04PLOT\x10\x03\x42\nZ\x08./eventsb\x06proto3')
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
 _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'events.events_pb2', globals())
--- a/main.py
+++ b/main.py
@@ -0,0 +1,274 @@
 #!/usr/bin/env python3
 from pathlib import Path
 import cv2
 import numpy as np
 import depthai as dai
 import east
 import blobconverter
 class HostSeqSync:
    def __init__(self):
        self.imfFrames = []
    def add_msg(self, msg):
        self.imfFrames.append(msg)
    def get_msg(self, target_seq):
        for i, imgFrame in enumerate(self.imfFrames):
            if target_seq == imgFrame.getSequenceNum():
                self.imfFrames = self.imfFrames[i:]
                break
        return self.imfFrames[0]
 pipeline = dai.Pipeline()
 version = "2021.2"
 pipeline.setOpenVINOVersion(version=dai.OpenVINO.Version.VERSION_2021_2)
 colorCam = pipeline.create(dai.node.ColorCamera)
 colorCam.setPreviewSize(256, 256)
 colorCam.setVideoSize(1024, 1024) # 4 times larger in both axis
 colorCam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
 colorCam.setInterleaved(False)
 colorCam.setBoardSocket(dai.CameraBoardSocket.RGB)
 colorCam.setFps(10)
 controlIn = pipeline.create(dai.node.XLinkIn)
 controlIn.setStreamName('control')
 controlIn.out.link(colorCam.inputControl)
 cam_xout = pipeline.create(dai.node.XLinkOut)
 cam_xout.setStreamName('video')
 colorCam.video.link(cam_xout.input)
 # ---------------------------------------
 # 1st stage NN - text-detection
 # ---------------------------------------
 nn = pipeline.create(dai.node.NeuralNetwork)
 nn.setBlobPath(blobconverter.from_zoo(name="east_text_detection_256x256",zoo_type="depthai",shaves=6, version=version))
 colorCam.preview.link(nn.input)
 nn_xout = pipeline.create(dai.node.XLinkOut)
 nn_xout.setStreamName('detections')
 nn.out.link(nn_xout.input)
 # ---------------------------------------
 # 2nd stage NN - text-recognition-0012
 # ---------------------------------------
 manip = pipeline.create(dai.node.ImageManip)
 manip.setWaitForConfigInput(True)
 manip_img = pipeline.create(dai.node.XLinkIn)
 manip_img.setStreamName('manip_img')
 manip_img.out.link(manip.inputImage)
 manip_cfg = pipeline.create(dai.node.XLinkIn)
 manip_cfg.setStreamName('manip_cfg')
 manip_cfg.out.link(manip.inputConfig)
 manip_xout = pipeline.create(dai.node.XLinkOut)
 manip_xout.setStreamName('manip_out')
 nn2 = pipeline.create(dai.node.NeuralNetwork)
 nn2.setBlobPath(blobconverter.from_zoo(name="text-recognition-0012", shaves=6, version=version))
 nn2.setNumInferenceThreads(2)
 manip.out.link(nn2.input)
 manip.out.link(manip_xout.input)
 nn2_xout = pipeline.create(dai.node.XLinkOut)
 nn2_xout.setStreamName("recognitions")
 nn2.out.link(nn2_xout.input)
 def to_tensor_result(packet):
    return {
        name: np.array(packet.getLayerFp16(name))
        for name in [tensor.name for tensor in packet.getRaw().tensors]
    }
 def to_planar(frame):
    return frame.transpose(2, 0, 1).flatten()
 with dai.Device(pipeline) as device:
    q_vid = device.getOutputQueue("video", 4, blocking=False)
    # This should be set to block, but would get to some extreme queuing/latency!
    q_det = device.getOutputQueue("detections", 4, blocking=False)
    q_rec = device.getOutputQueue("recognitions", 4, blocking=True)
    q_manip_img = device.getInputQueue("manip_img")
    q_manip_cfg = device.getInputQueue("manip_cfg")
    q_manip_out = device.getOutputQueue("manip_out", 4, blocking=False)
    controlQueue = device.getInputQueue('control')
    frame = None
    cropped_stacked = None
    rotated_rectangles = []
    rec_pushed = 0
    rec_received = 0
    host_sync = HostSeqSync()
    class CTCCodec(object):
        """ Convert between text-label and text-index """
        def __init__(self, characters):
            # characters (str): set of the possible characters.
            dict_character = list(characters)
            self.dict = {}
            for i, char in enumerate(dict_character):
                self.dict[char] = i + 1
            self.characters = dict_character
            #print(self.characters)
            #input()
        def decode(self, preds):
            """ convert text-index into text-label. """
            texts = []
            index = 0
            # Select max probabilty (greedy decoding) then decode index to character
            preds = preds.astype(np.float16)
            preds_index = np.argmax(preds, 2)
            preds_index = preds_index.transpose(1, 0)
            preds_index_reshape = preds_index.reshape(-1)
            preds_sizes = np.array([preds_index.shape[1]] * preds_index.shape[0])
            for l in preds_sizes:
                t = preds_index_reshape[index:index + l]
                # NOTE: t might be zero size
                if t.shape[0] == 0:
                    continue
                char_list = []
                for i in range(l):
                    # removing repeated characters and blank.
                    if not (i > 0 and t[i - 1] == t[i]):
                        if self.characters[t[i]] != '#':
                            char_list.append(self.characters[t[i]])
                text = ''.join(char_list)
                texts.append(text)
                index += l
            return texts
    characters = '0123456789abcdefghijklmnopqrstuvwxyz#'
    codec = CTCCodec(characters)
    ctrl = dai.CameraControl()
    ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.CONTINUOUS_VIDEO)
    ctrl.setAutoFocusTrigger()
    controlQueue.send(ctrl)
    while True:
        vid_in = q_vid.tryGet()
        if vid_in is not None:
            host_sync.add_msg(vid_in)
        # Multiple recognition results may be available, read until queue is empty
        while True:
            in_rec = q_rec.tryGet()
            if in_rec is None:
                break
            rec_data = bboxes = np.array(in_rec.getFirstLayerFp16()).reshape(30,1,37)
            decoded_text = codec.decode(rec_data)[0]
            pos = rotated_rectangles[rec_received]
            print("{:2}: {:20}".format(rec_received, decoded_text),
                "center({:3},{:3}) size({:3},{:3}) angle{:5.1f} deg".format(
                    int(pos[0][0]), int(pos[0][1]), pos[1][0], pos[1][1], pos[2]))
            # Draw the text on the right side of 'cropped_stacked' - placeholder
            if cropped_stacked is not None:
                cv2.putText(cropped_stacked, decoded_text,
                                (120 + 10 , 32 * rec_received + 24),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2)
                cv2.imshow('cropped_stacked', cropped_stacked)
            rec_received += 1
        if cv2.waitKey(1) == ord('q'):
            break
        if rec_received >= rec_pushed:
            in_det = q_det.tryGet()
            if in_det is not None:
                frame = host_sync.get_msg(in_det.getSequenceNum()).getCvFrame().copy()
                scores, geom1, geom2 = to_tensor_result(in_det).values()
                scores = np.reshape(scores, (1, 1, 64, 64))
                geom1 = np.reshape(geom1, (1, 4, 64, 64))
                geom2 = np.reshape(geom2, (1, 1, 64, 64))
                bboxes, confs, angles = east.decode_predictions(scores, geom1, geom2)
                boxes, angles = east.non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles))
                rotated_rectangles = [
                    east.get_cv_rotated_rect(bbox, angle * -1)
                    for (bbox, angle) in zip(boxes, angles)
                ]
                rec_received = 0
                rec_pushed = len(rotated_rectangles)
                if rec_pushed:
                    print("====== Pushing for recognition, count:", rec_pushed)
                cropped_stacked = None
                for idx, rotated_rect in enumerate(rotated_rectangles):
                    # Detections are done on 256x256 frames, we are sending back 1024x1024
                    # That's why we multiply center and size values by 4
                    rotated_rect[0][0] = rotated_rect[0][0] * 4
                    rotated_rect[0][1] = rotated_rect[0][1] * 4
                    rotated_rect[1][0] = rotated_rect[1][0] * 4
                    rotated_rect[1][1] = rotated_rect[1][1] * 4
                    # Draw detection crop area on input frame
                    points = np.int0(cv2.boxPoints(rotated_rect))
                    print(rotated_rect)
                    cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, lineType=cv2.LINE_8)
                    # TODO make it work taking args like in OpenCV:
                    # rr = ((256, 256), (128, 64), 30)
                    rr = dai.RotatedRect()
                    rr.center.x    = rotated_rect[0][0]
                    rr.center.y    = rotated_rect[0][1]
                    rr.size.width  = rotated_rect[1][0]
                    rr.size.height = rotated_rect[1][1]
                    rr.angle       = rotated_rect[2]
                    cfg = dai.ImageManipConfig()
                    cfg.setCropRotatedRect(rr, False)
                    cfg.setResize(120, 32)
                    # Send frame and config to device
                    if idx == 0:
                        w,h,c = frame.shape
                        imgFrame = dai.ImgFrame()
                        imgFrame.setData(to_planar(frame))
                        imgFrame.setType(dai.ImgFrame.Type.BGR888p)
                        imgFrame.setWidth(w)
                        imgFrame.setHeight(h)
                        q_manip_img.send(imgFrame)
                    else:
                        cfg.setReusePreviousImage(True)
                    q_manip_cfg.send(cfg)
                    # Get manipulated image from the device
                    transformed = q_manip_out.get().getCvFrame()
                    rec_placeholder_img = np.zeros((32, 200, 3), np.uint8)
                    transformed = np.hstack((transformed, rec_placeholder_img))
                    if cropped_stacked is None:
                        cropped_stacked = transformed
                    else:
                        cropped_stacked = np.vstack((cropped_stacked, transformed))
        if cropped_stacked is not None:
            cv2.imshow('cropped_stacked', cropped_stacked)
        if frame is not None:
            cv2.imshow('frame', frame)
        key = cv2.waitKey(1)
        if  key == ord('q'):
            break
        elif key == ord('t'):
            print("Autofocus trigger (and disable continuous)")
            ctrl = dai.CameraControl()
            ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.AUTO)
            ctrl.setAutoFocusTrigger()
            controlQueue.send(ctrl)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
 paho-mqtt~=1.6.1
 docopt~=0.6.2
-depthai==2.17.2.0
+depthai==2.14.1.0
-opencv-python==4.6.0.66
+opencv-python~=4.5.5.62
 google~=3.0.0
 google-api-core~=2.4.0
 setuptools==60.5.0
-blobconverter==1.3.0
+protobuf3
 blobconverter>=1.2.9
Author	SHA1	Message	Date
Cyrille Nofficial	0467ab780c	WIP	2022-08-10 12:28:41 +02:00
Cyrille Nofficial	2c9c7d9078	fix: regenerate event struct with recent version of protobuf compiler	2022-08-02 12:27:15 +02:00