From 0467ab780cc21ed4545cd9516a37a6cd7d06aaf0 Mon Sep 17 00:00:00 2001
From: Cyrille Nofficial
Date: Wed, 10 Aug 2022 12:28:41 +0200
Subject: [PATCH] WIP: add EAST text detection and text recognition pipeline

Add a two-stage DepthAI pipeline (EAST text detection followed by
text-recognition-0012) to the camera frame publisher, together with the
standalone east.py/main.py prototype and the blobconverter dependency.

---
 camera/depthai.py | 221 +++++++++++++++++++++++++++++++++++++
 camera/east.py    | 232 +++++++++++++++++++++++++++++++++++++++
 camera/text.py    |  61 +++++++++++
 east.py           | 229 ++++++++++++++++++++++++++
 main.py           | 274 ++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt  |   3 +-
 6 files changed, 1019 insertions(+), 1 deletion(-)
 create mode 100644 camera/east.py
 create mode 100644 camera/text.py
 create mode 100644 east.py
 create mode 100644 main.py

diff --git a/camera/depthai.py b/camera/depthai.py
index 2e60965..243e9e5 100644
--- a/camera/depthai.py
+++ b/camera/depthai.py
@@ -10,6 +10,17 @@ import cv2
 
 logger = logging.getLogger(__name__)
 
+# New dependencies for the text detection/recognition stages
+# (assumes this module lives in the 'camera' package alongside east.py and text.py)
+import blobconverter
+import numpy as np
+
+from camera import east
+from camera.text import CTCCodec, HostSeqSync
+
+
+def to_tensor_result(packet):
+    return {
+        name: np.array(packet.getLayerFp16(name))
+        for name in [tensor.name for tensor in packet.getRaw().tensors]
+    }
+
+
+def to_planar(frame):
+    return frame.transpose(2, 0, 1).flatten()
+
+
 class FramePublisher:
     def __init__(self, mqtt_client: mqtt.Client, frame_topic: str, img_width: int, img_height: int):
         self._mqtt_client = mqtt_client
@@ -22,6 +33,72 @@ class FramePublisher:
         logger.info("configure pipeline")
         pipeline = dai.Pipeline()
 
+        version = "2021.2"
+        pipeline.setOpenVINOVersion(version=dai.OpenVINO.Version.VERSION_2021_2)
+
+        # Camera, control and video streams consumed by run() and the NN stages below
+        colorCam = pipeline.create(dai.node.ColorCamera)
+        colorCam.setPreviewSize(256, 256)
+        colorCam.setVideoSize(1024, 1024)  # 4 times larger on both axes
+        colorCam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
+        colorCam.setInterleaved(False)
+        colorCam.setBoardSocket(dai.CameraBoardSocket.RGB)
+        colorCam.setFps(10)
+
+        controlIn = pipeline.create(dai.node.XLinkIn)
+        controlIn.setStreamName('control')
+        controlIn.out.link(colorCam.inputControl)
+
+        cam_xout = pipeline.create(dai.node.XLinkOut)
+        cam_xout.setStreamName('video')
+        colorCam.video.link(cam_xout.input)
+
+        # ---------------------------------------
+        # 1st stage NN - text-detection
+        # ---------------------------------------
+
+        nn = pipeline.create(dai.node.NeuralNetwork)
+        nn.setBlobPath(
+            blobconverter.from_zoo(name="east_text_detection_256x256", zoo_type="depthai", shaves=6, version=version))
+        colorCam.preview.link(nn.input)
+
+        nn_xout = pipeline.create(dai.node.XLinkOut)
+        nn_xout.setStreamName('detections')
+        nn.out.link(nn_xout.input)
+
+        # ---------------------------------------
+        # 2nd stage NN - text-recognition-0012
+        # ---------------------------------------
+
+        manip = pipeline.create(dai.node.ImageManip)
+        manip.setWaitForConfigInput(True)
+
+        manip_img = pipeline.create(dai.node.XLinkIn)
+        manip_img.setStreamName('manip_img')
+        manip_img.out.link(manip.inputImage)
+
+        manip_cfg = pipeline.create(dai.node.XLinkIn)
+        manip_cfg.setStreamName('manip_cfg')
+        manip_cfg.out.link(manip.inputConfig)
+
+        manip_xout = pipeline.create(dai.node.XLinkOut)
+        manip_xout.setStreamName('manip_out')
+
+        nn2 = pipeline.create(dai.node.NeuralNetwork)
+        nn2.setBlobPath(blobconverter.from_zoo(name="text-recognition-0012", shaves=6, version=version))
+        nn2.setNumInferenceThreads(2)
+        manip.out.link(nn2.input)
+        manip.out.link(manip_xout.input)
+
+        nn2_xout = pipeline.create(dai.node.XLinkOut)
+        nn2_xout.setStreamName("recognitions")
+        nn2.out.link(nn2_xout.input)
+
+
         cam_rgb = pipeline.create(dai.node.ColorCamera)
         xout_rgb = pipeline.create(dai.node.XLinkOut)
 
@@ -40,6 +117,150 @@ class 
FramePublisher: return pipeline def run(self): + + with dai.Device(self._pipeline) as device: + q_vid = device.getOutputQueue("video", 4, blocking=False) + # This should be set to block, but would get to some extreme queuing/latency! + q_det = device.getOutputQueue("detections", 4, blocking=False) + + q_rec = device.getOutputQueue("recognitions", 4, blocking=True) + + q_manip_img = device.getInputQueue("manip_img") + q_manip_cfg = device.getInputQueue("manip_cfg") + q_manip_out = device.getOutputQueue("manip_out", 4, blocking=False) + + controlQueue = device.getInputQueue('control') + + frame = None + cropped_stacked = None + rotated_rectangles = [] + rec_pushed = 0 + rec_received = 0 + host_sync = HostSeqSync() + + characters = '0123456789abcdefghijklmnopqrstuvwxyz#' + codec = CTCCodec(characters) + + ctrl = dai.CameraControl() + ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.CONTINUOUS_VIDEO) + ctrl.setAutoFocusTrigger() + controlQueue.send(ctrl) + + while True: + vid_in = q_vid.tryGet() + if vid_in is not None: + host_sync.add_msg(vid_in) + + # Multiple recognition results may be available, read until queue is empty + while True: + in_rec = q_rec.tryGet() + if in_rec is None: + break + rec_data = bboxes = np.array(in_rec.getFirstLayerFp16()).reshape(30, 1, 37) + decoded_text = codec.decode(rec_data)[0] + pos = rotated_rectangles[rec_received] + print("{:2}: {:20}".format(rec_received, decoded_text), + "center({:3},{:3}) size({:3},{:3}) angle{:5.1f} deg".format( + int(pos[0][0]), int(pos[0][1]), pos[1][0], pos[1][1], pos[2])) + # Draw the text on the right side of 'cropped_stacked' - placeholder + if cropped_stacked is not None: + cv2.putText(cropped_stacked, decoded_text, + (120 + 10, 32 * rec_received + 24), + cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2) + cv2.imshow('cropped_stacked', cropped_stacked) + rec_received += 1 + + if cv2.waitKey(1) == ord('q'): + break + + if rec_received >= rec_pushed: + in_det = q_det.tryGet() + if in_det is not None: + frame = host_sync.get_msg(in_det.getSequenceNum()).getCvFrame().copy() + + scores, geom1, geom2 = to_tensor_result(in_det).values() + scores = np.reshape(scores, (1, 1, 64, 64)) + geom1 = np.reshape(geom1, (1, 4, 64, 64)) + geom2 = np.reshape(geom2, (1, 1, 64, 64)) + + bboxes, confs, angles = east.decode_predictions(scores, geom1, geom2) + boxes, angles = east.non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles)) + rotated_rectangles = [ + east.get_cv_rotated_rect(bbox, angle * -1) + for (bbox, angle) in zip(boxes, angles) + ] + + rec_received = 0 + rec_pushed = len(rotated_rectangles) + if rec_pushed: + print("====== Pushing for recognition, count:", rec_pushed) + cropped_stacked = None + for idx, rotated_rect in enumerate(rotated_rectangles): + # Detections are done on 256x256 frames, we are sending back 1024x1024 + # That's why we multiply center and size values by 4 + rotated_rect[0][0] = rotated_rect[0][0] * 4 + rotated_rect[0][1] = rotated_rect[0][1] * 4 + rotated_rect[1][0] = rotated_rect[1][0] * 4 + rotated_rect[1][1] = rotated_rect[1][1] * 4 + + # Draw detection crop area on input frame + points = np.int0(cv2.boxPoints(rotated_rect)) + print(rotated_rect) + cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, + lineType=cv2.LINE_8) + + # TODO make it work taking args like in OpenCV: + # rr = ((256, 256), (128, 64), 30) + rr = dai.RotatedRect() + rr.center.x = rotated_rect[0][0] + rr.center.y = rotated_rect[0][1] + rr.size.width = rotated_rect[1][0] + rr.size.height = 
rotated_rect[1][1] + rr.angle = rotated_rect[2] + cfg = dai.ImageManipConfig() + cfg.setCropRotatedRect(rr, False) + cfg.setResize(120, 32) + # Send frame and config to device + if idx == 0: + w, h, c = frame.shape + imgFrame = dai.ImgFrame() + imgFrame.setData(to_planar(frame)) + imgFrame.setType(dai.ImgFrame.Type.BGR888p) + imgFrame.setWidth(w) + imgFrame.setHeight(h) + q_manip_img.send(imgFrame) + else: + cfg.setReusePreviousImage(True) + q_manip_cfg.send(cfg) + + # Get manipulated image from the device + transformed = q_manip_out.get().getCvFrame() + + rec_placeholder_img = np.zeros((32, 200, 3), np.uint8) + transformed = np.hstack((transformed, rec_placeholder_img)) + if cropped_stacked is None: + cropped_stacked = transformed + else: + cropped_stacked = np.vstack((cropped_stacked, transformed)) + + if cropped_stacked is not None: + cv2.imshow('cropped_stacked', cropped_stacked) + + if frame is not None: + cv2.imshow('frame', frame) + + key = cv2.waitKey(1) + if key == ord('q'): + break + elif key == ord('t'): + print("Autofocus trigger (and disable continuous)") + ctrl = dai.CameraControl() + ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.AUTO) + ctrl.setAutoFocusTrigger() + controlQueue.send(ctrl) + + + # Connect to device and start pipeline with dai.Device(self._pipeline) as device: logger.info('MxId: %s', device.getDeviceInfo().getMxId()) diff --git a/camera/east.py b/camera/east.py new file mode 100644 index 0000000..1b95c92 --- /dev/null +++ b/camera/east.py @@ -0,0 +1,232 @@ +import cv2 +import depthai +import numpy as np + +_conf_threshold = 0.5 + + +def get_cv_rotated_rect(bbox, angle): + x0, y0, x1, y1 = bbox + width = abs(x0 - x1) + height = abs(y0 - y1) + x = x0 + width * 0.5 + y = y0 + height * 0.5 + return [x.tolist(), y.tolist()], [width.tolist(), height.tolist()], np.rad2deg(angle) + + +def rotated_Rectangle(bbox, angle): + X0, Y0, X1, Y1 = bbox + width = abs(X0 - X1) + height = abs(Y0 - Y1) + x = int(X0 + width * 0.5) + y = int(Y0 + height * 0.5) + + pt1_1 = (int(x + width / 2), int(y + height / 2)) + pt2_1 = (int(x + width / 2), int(y - height / 2)) + pt3_1 = (int(x - width / 2), int(y - height / 2)) + pt4_1 = (int(x - width / 2), int(y + height / 2)) + + t = np.array([[np.cos(angle), -np.sin(angle), x - x * np.cos(angle) + y * np.sin(angle)], + [np.sin(angle), np.cos(angle), y - x * np.sin(angle) - y * np.cos(angle)], + [0, 0, 1]]) + + tmp_pt1_1 = np.array([[pt1_1[0]], [pt1_1[1]], [1]]) + tmp_pt1_2 = np.dot(t, tmp_pt1_1) + pt1_2 = (int(tmp_pt1_2[0][0]), int(tmp_pt1_2[1][0])) + + tmp_pt2_1 = np.array([[pt2_1[0]], [pt2_1[1]], [1]]) + tmp_pt2_2 = np.dot(t, tmp_pt2_1) + pt2_2 = (int(tmp_pt2_2[0][0]), int(tmp_pt2_2[1][0])) + + tmp_pt3_1 = np.array([[pt3_1[0]], [pt3_1[1]], [1]]) + tmp_pt3_2 = np.dot(t, tmp_pt3_1) + pt3_2 = (int(tmp_pt3_2[0][0]), int(tmp_pt3_2[1][0])) + + tmp_pt4_1 = np.array([[pt4_1[0]], [pt4_1[1]], [1]]) + tmp_pt4_2 = np.dot(t, tmp_pt4_1) + pt4_2 = (int(tmp_pt4_2[0][0]), int(tmp_pt4_2[1][0])) + + points = np.array([pt1_2, pt2_2, pt3_2, pt4_2]) + + return points + + +def non_max_suppression(boxes, probs=None, angles=None, overlapThresh=0.3): + # if there are no boxes, return an empty list + if len(boxes) == 0: + return [], [] + + # if the bounding boxes are integers, convert them to floats -- this + # is important since we'll be doing a bunch of divisions + if boxes.dtype.kind == "i": + boxes = boxes.astype("float") + + # initialize the list of picked indexes + pick = [] + + # grab the coordinates of the bounding boxes + x1 = boxes[:, 0] + y1 = 
boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + # compute the area of the bounding boxes and grab the indexes to sort + # (in the case that no probabilities are provided, simply sort on the bottom-left y-coordinate) + area = (x2 - x1 + 1) * (y2 - y1 + 1) + idxs = y2 + + # if probabilities are provided, sort on them instead + if probs is not None: + idxs = probs + + # sort the indexes + idxs = np.argsort(idxs) + + # keep looping while some indexes still remain in the indexes list + while len(idxs) > 0: + # grab the last index in the indexes list and add the index value to the list of picked indexes + last = len(idxs) - 1 + i = idxs[last] + pick.append(i) + + # find the largest (x, y) coordinates for the start of the bounding box and the smallest (x, y) coordinates + # for the end of the bounding box + xx1 = np.maximum(x1[i], x1[idxs[:last]]) + yy1 = np.maximum(y1[i], y1[idxs[:last]]) + xx2 = np.minimum(x2[i], x2[idxs[:last]]) + yy2 = np.minimum(y2[i], y2[idxs[:last]]) + + # compute the width and height of the bounding box + w = np.maximum(0, xx2 - xx1 + 1) + h = np.maximum(0, yy2 - yy1 + 1) + + # compute the ratio of overlap + overlap = (w * h) / area[idxs[:last]] + + # delete all indexes from the index list that have overlap greater than the provided overlap threshold + idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0]))) + + # return only the bounding boxes that were picked + return boxes[pick].astype("int"), angles[pick] + + +def decode_predictions(scores, geometry1, geometry2): + # grab the number of rows and columns from the scores volume, then + # initialize our set of bounding box rectangles and corresponding + # confidence scores + (numRows, numCols) = scores.shape[2:4] + rects = [] + confidences = [] + angles = [] + + # loop over the number of rows + for y in range(0, numRows): + # extract the scores (probabilities), followed by the + # geometrical data used to derive potential bounding box + # coordinates that surround text + scoresData = scores[0, 0, y] + xData0 = geometry1[0, 0, y] + xData1 = geometry1[0, 1, y] + xData2 = geometry1[0, 2, y] + xData3 = geometry1[0, 3, y] + anglesData = geometry2[0, 0, y] + + # loop over the number of columns + for x in range(0, numCols): + # if our score does not have sufficient probability, + # ignore it + if scoresData[x] < _conf_threshold: + continue + + # compute the offset factor as our resulting feature + # maps will be 4x smaller than the input image + (offsetX, offsetY) = (x * 4.0, y * 4.0) + + # extract the rotation angle for the prediction and + # then compute the sin and cosine + angle = anglesData[x] + cos = np.cos(angle) + sin = np.sin(angle) + + # use the geometry volume to derive the width and height + # of the bounding box + h = xData0[x] + xData2[x] + w = xData1[x] + xData3[x] + + # compute both the starting and ending (x, y)-coordinates + # for the text prediction bounding box + endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x])) + endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x])) + startX = int(endX - w) + startY = int(endY - h) + + # add the bounding box coordinates and probability score + # to our respective lists + rects.append((startX, startY, endX, endY)) + confidences.append(scoresData[x]) + angles.append(angle) + + # return a tuple of the bounding boxes and associated confidences + return (rects, confidences, angles) + + +def decode_east(nnet_packet, **kwargs): + scores = nnet_packet.get_tensor(0) + geometry1 = nnet_packet.get_tensor(1) + geometry2 = 
nnet_packet.get_tensor(2)
+    bboxes, confs, angles = decode_predictions(scores, geometry1, geometry2)
+    boxes, angles = non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles))
+    boxesangles = (boxes, angles)
+    return boxesangles
+
+
+def show_east(boxesangles, frame, **kwargs):
+    bboxes = boxesangles[0]
+    angles = boxesangles[1]
+    for ((X0, Y0, X1, Y1), angle) in zip(bboxes, angles):
+        width = abs(X0 - X1)
+        height = abs(Y0 - Y1)
+        cX = int(X0 + width * 0.5)
+        cY = int(Y0 + height * 0.5)
+
+        # rotated_Rectangle expects the axis-aligned bbox and the rotation angle (radians)
+        points = rotated_Rectangle((X0, Y0, X1, Y1), angle * (-1))
+        cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, lineType=cv2.LINE_8)
+
+    return frame
+
+
+def order_points(pts):
+    rect = np.zeros((4, 2), dtype="float32")
+    s = pts.sum(axis=1)
+    rect[0] = pts[np.argmin(s)]
+    rect[2] = pts[np.argmax(s)]
+    diff = np.diff(pts, axis=1)
+    rect[1] = pts[np.argmin(diff)]
+    rect[3] = pts[np.argmax(diff)]
+    return rect
+
+
+def four_point_transform(image, pts):
+    rect = order_points(pts)
+    (tl, tr, br, bl) = rect
+
+    widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
+    widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
+    maxWidth = max(int(widthA), int(widthB))
+
+    heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
+    heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
+    maxHeight = max(int(heightA), int(heightB))
+
+    dst = np.array([
+        [0, 0],
+        [maxWidth - 1, 0],
+        [maxWidth - 1, maxHeight - 1],
+        [0, maxHeight - 1]], dtype="float32")
+
+    M = cv2.getPerspectiveTransform(rect, dst)
+    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
+
+    return warped
diff --git a/camera/text.py b/camera/text.py
new file mode 100644
index 0000000..c959e35
--- /dev/null
+++ b/camera/text.py
@@ -0,0 +1,61 @@
+import numpy as np
+
+
+class HostSeqSync:
+    def __init__(self):
+        self.imfFrames = []
+
+    def add_msg(self, msg):
+        self.imfFrames.append(msg)
+
+    def get_msg(self, target_seq):
+        for i, imgFrame in enumerate(self.imfFrames):
+            if target_seq == imgFrame.getSequenceNum():
+                self.imfFrames = self.imfFrames[i:]
+                break
+        return self.imfFrames[0]
+
+
+class CTCCodec(object):
+    """ Convert between text-label and text-index """
+
+    def __init__(self, characters):
+        # characters (str): set of the possible characters.
+        dict_character = list(characters)
+
+        self.dict = {}
+        for i, char in enumerate(dict_character):
+            self.dict[char] = i + 1
+
+        self.characters = dict_character
+
+    def decode(self, preds):
+        """ Convert text-index into text-label. """
+        texts = []
+        index = 0
+        # Select max probability (greedy decoding) then decode index to character
+        preds = preds.astype(np.float16)
+        preds_index = np.argmax(preds, 2)
+        preds_index = preds_index.transpose(1, 0)
+        preds_index_reshape = preds_index.reshape(-1)
+        preds_sizes = np.array([preds_index.shape[1]] * preds_index.shape[0])
+
+        for l in preds_sizes:
+            t = preds_index_reshape[index:index + l]
+
+            # NOTE: t might be zero size
+            if t.shape[0] == 0:
+                continue
+
+            char_list = []
+            for i in range(l):
+                # removing repeated characters and blank. 
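+                # CTC greedy decoding: collapse runs of identical predictions and drop
+                # the blank symbol ('#', the last entry in `characters`) before joining.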
+ if not (i > 0 and t[i - 1] == t[i]): + if self.characters[t[i]] != '#': + char_list.append(self.characters[t[i]]) + text = ''.join(char_list) + texts.append(text) + + index += l + + return texts diff --git a/east.py b/east.py new file mode 100644 index 0000000..7eb13af --- /dev/null +++ b/east.py @@ -0,0 +1,229 @@ +import cv2 +import depthai +import numpy as np + +_conf_threshold = 0.5 + +def get_cv_rotated_rect(bbox, angle): + x0, y0, x1, y1 = bbox + width = abs(x0 - x1) + height = abs(y0 - y1) + x = x0 + width * 0.5 + y = y0 + height * 0.5 + return ([x.tolist(), y.tolist()], [width.tolist(), height.tolist()], np.rad2deg(angle)) + +def rotated_Rectangle(bbox, angle): + X0, Y0, X1, Y1 = bbox + width = abs(X0 - X1) + height = abs(Y0 - Y1) + x = int(X0 + width * 0.5) + y = int(Y0 + height * 0.5) + + pt1_1 = (int(x + width / 2), int(y + height / 2)) + pt2_1 = (int(x + width / 2), int(y - height / 2)) + pt3_1 = (int(x - width / 2), int(y - height / 2)) + pt4_1 = (int(x - width / 2), int(y + height / 2)) + + t = np.array([[np.cos(angle), -np.sin(angle), x - x * np.cos(angle) + y * np.sin(angle)], + [np.sin(angle), np.cos(angle), y - x * np.sin(angle) - y * np.cos(angle)], + [0, 0, 1]]) + + tmp_pt1_1 = np.array([[pt1_1[0]], [pt1_1[1]], [1]]) + tmp_pt1_2 = np.dot(t, tmp_pt1_1) + pt1_2 = (int(tmp_pt1_2[0][0]), int(tmp_pt1_2[1][0])) + + tmp_pt2_1 = np.array([[pt2_1[0]], [pt2_1[1]], [1]]) + tmp_pt2_2 = np.dot(t, tmp_pt2_1) + pt2_2 = (int(tmp_pt2_2[0][0]), int(tmp_pt2_2[1][0])) + + tmp_pt3_1 = np.array([[pt3_1[0]], [pt3_1[1]], [1]]) + tmp_pt3_2 = np.dot(t, tmp_pt3_1) + pt3_2 = (int(tmp_pt3_2[0][0]), int(tmp_pt3_2[1][0])) + + tmp_pt4_1 = np.array([[pt4_1[0]], [pt4_1[1]], [1]]) + tmp_pt4_2 = np.dot(t, tmp_pt4_1) + pt4_2 = (int(tmp_pt4_2[0][0]), int(tmp_pt4_2[1][0])) + + points = np.array([pt1_2, pt2_2, pt3_2, pt4_2]) + + return points + + +def non_max_suppression(boxes, probs=None, angles=None, overlapThresh=0.3): + # if there are no boxes, return an empty list + if len(boxes) == 0: + return [], [] + + # if the bounding boxes are integers, convert them to floats -- this + # is important since we'll be doing a bunch of divisions + if boxes.dtype.kind == "i": + boxes = boxes.astype("float") + + # initialize the list of picked indexes + pick = [] + + # grab the coordinates of the bounding boxes + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + # compute the area of the bounding boxes and grab the indexes to sort + # (in the case that no probabilities are provided, simply sort on the bottom-left y-coordinate) + area = (x2 - x1 + 1) * (y2 - y1 + 1) + idxs = y2 + + # if probabilities are provided, sort on them instead + if probs is not None: + idxs = probs + + # sort the indexes + idxs = np.argsort(idxs) + + # keep looping while some indexes still remain in the indexes list + while len(idxs) > 0: + # grab the last index in the indexes list and add the index value to the list of picked indexes + last = len(idxs) - 1 + i = idxs[last] + pick.append(i) + + # find the largest (x, y) coordinates for the start of the bounding box and the smallest (x, y) coordinates for the end of the bounding box + xx1 = np.maximum(x1[i], x1[idxs[:last]]) + yy1 = np.maximum(y1[i], y1[idxs[:last]]) + xx2 = np.minimum(x2[i], x2[idxs[:last]]) + yy2 = np.minimum(y2[i], y2[idxs[:last]]) + + # compute the width and height of the bounding box + w = np.maximum(0, xx2 - xx1 + 1) + h = np.maximum(0, yy2 - yy1 + 1) + + # compute the ratio of overlap + overlap = (w * h) / area[idxs[:last]] + + # delete all 
indexes from the index list that have overlap greater than the provided overlap threshold + idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0]))) + + # return only the bounding boxes that were picked + return boxes[pick].astype("int"), angles[pick] + + +def decode_predictions(scores, geometry1, geometry2): + # grab the number of rows and columns from the scores volume, then + # initialize our set of bounding box rectangles and corresponding + # confidence scores + (numRows, numCols) = scores.shape[2:4] + rects = [] + confidences = [] + angles = [] + + # loop over the number of rows + for y in range(0, numRows): + # extract the scores (probabilities), followed by the + # geometrical data used to derive potential bounding box + # coordinates that surround text + scoresData = scores[0, 0, y] + xData0 = geometry1[0, 0, y] + xData1 = geometry1[0, 1, y] + xData2 = geometry1[0, 2, y] + xData3 = geometry1[0, 3, y] + anglesData = geometry2[0, 0, y] + + # loop over the number of columns + for x in range(0, numCols): + # if our score does not have sufficient probability, + # ignore it + if scoresData[x] < _conf_threshold: + continue + + # compute the offset factor as our resulting feature + # maps will be 4x smaller than the input image + (offsetX, offsetY) = (x * 4.0, y * 4.0) + + # extract the rotation angle for the prediction and + # then compute the sin and cosine + angle = anglesData[x] + cos = np.cos(angle) + sin = np.sin(angle) + + # use the geometry volume to derive the width and height + # of the bounding box + h = xData0[x] + xData2[x] + w = xData1[x] + xData3[x] + + # compute both the starting and ending (x, y)-coordinates + # for the text prediction bounding box + endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x])) + endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x])) + startX = int(endX - w) + startY = int(endY - h) + + # add the bounding box coordinates and probability score + # to our respective lists + rects.append((startX, startY, endX, endY)) + confidences.append(scoresData[x]) + angles.append(angle) + + # return a tuple of the bounding boxes and associated confidences + return (rects, confidences, angles) + + +def decode_east(nnet_packet, **kwargs): + scores = nnet_packet.get_tensor(0) + geometry1 = nnet_packet.get_tensor(1) + geometry2 = nnet_packet.get_tensor(2) + bboxes, confs, angles = decode_predictions(scores, geometry1, geometry2 + ) + boxes, angles = non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles)) + boxesangles = (boxes, angles) + return boxesangles + + +def show_east(boxesangles, frame, **kwargs): + bboxes = boxesangles[0] + angles = boxesangles[1] + for ((X0, Y0, X1, Y1), angle) in zip(bboxes, angles): + width = abs(X0 - X1) + height = abs(Y0 - Y1) + cX = int(X0 + width * 0.5) + cY = int(Y0 + height * 0.5) + + rotRect = ((cX, cY), ((X1 - X0), (Y1 - Y0)), angle * (-1)) + points = rotated_Rectangle(frame, rotRect, color=(255, 0, 0), thickness=1) + cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, lineType=cv2.LINE_8) + + return frame + + +def order_points(pts): + rect = np.zeros((4, 2), dtype="float32") + s = pts.sum(axis=1) + rect[0] = pts[np.argmin(s)] + rect[2] = pts[np.argmax(s)] + diff = np.diff(pts, axis=1) + rect[1] = pts[np.argmin(diff)] + rect[3] = pts[np.argmax(diff)] + return rect + + +def four_point_transform(image, pts): + rect = order_points(pts) + (tl, tr, br, bl) = rect + + widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2)) + widthB = 
np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2)) + maxWidth = max(int(widthA), int(widthB)) + + heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2)) + heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2)) + maxHeight = max(int(heightA), int(heightB)) + + dst = np.array([ + [0, 0], + [maxWidth - 1, 0], + [maxWidth - 1, maxHeight - 1], + [0, maxHeight - 1]], dtype="float32") + + M = cv2.getPerspectiveTransform(rect, dst) + warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight)) + + return warped \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..61fa4e2 --- /dev/null +++ b/main.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 + +from pathlib import Path + +import cv2 +import numpy as np +import depthai as dai +import east +import blobconverter + +class HostSeqSync: + def __init__(self): + self.imfFrames = [] + def add_msg(self, msg): + self.imfFrames.append(msg) + def get_msg(self, target_seq): + for i, imgFrame in enumerate(self.imfFrames): + if target_seq == imgFrame.getSequenceNum(): + self.imfFrames = self.imfFrames[i:] + break + return self.imfFrames[0] + +pipeline = dai.Pipeline() +version = "2021.2" +pipeline.setOpenVINOVersion(version=dai.OpenVINO.Version.VERSION_2021_2) + +colorCam = pipeline.create(dai.node.ColorCamera) +colorCam.setPreviewSize(256, 256) +colorCam.setVideoSize(1024, 1024) # 4 times larger in both axis +colorCam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P) +colorCam.setInterleaved(False) +colorCam.setBoardSocket(dai.CameraBoardSocket.RGB) +colorCam.setFps(10) + +controlIn = pipeline.create(dai.node.XLinkIn) +controlIn.setStreamName('control') +controlIn.out.link(colorCam.inputControl) + +cam_xout = pipeline.create(dai.node.XLinkOut) +cam_xout.setStreamName('video') +colorCam.video.link(cam_xout.input) + +# --------------------------------------- +# 1st stage NN - text-detection +# --------------------------------------- + +nn = pipeline.create(dai.node.NeuralNetwork) +nn.setBlobPath(blobconverter.from_zoo(name="east_text_detection_256x256",zoo_type="depthai",shaves=6, version=version)) +colorCam.preview.link(nn.input) + +nn_xout = pipeline.create(dai.node.XLinkOut) +nn_xout.setStreamName('detections') +nn.out.link(nn_xout.input) + +# --------------------------------------- +# 2nd stage NN - text-recognition-0012 +# --------------------------------------- + +manip = pipeline.create(dai.node.ImageManip) +manip.setWaitForConfigInput(True) + +manip_img = pipeline.create(dai.node.XLinkIn) +manip_img.setStreamName('manip_img') +manip_img.out.link(manip.inputImage) + +manip_cfg = pipeline.create(dai.node.XLinkIn) +manip_cfg.setStreamName('manip_cfg') +manip_cfg.out.link(manip.inputConfig) + +manip_xout = pipeline.create(dai.node.XLinkOut) +manip_xout.setStreamName('manip_out') + +nn2 = pipeline.create(dai.node.NeuralNetwork) +nn2.setBlobPath(blobconverter.from_zoo(name="text-recognition-0012", shaves=6, version=version)) +nn2.setNumInferenceThreads(2) +manip.out.link(nn2.input) +manip.out.link(manip_xout.input) + +nn2_xout = pipeline.create(dai.node.XLinkOut) +nn2_xout.setStreamName("recognitions") +nn2.out.link(nn2_xout.input) + +def to_tensor_result(packet): + return { + name: np.array(packet.getLayerFp16(name)) + for name in [tensor.name for tensor in packet.getRaw().tensors] + } + +def to_planar(frame): + return frame.transpose(2, 0, 1).flatten() + +with dai.Device(pipeline) as device: + q_vid = device.getOutputQueue("video", 4, blocking=False) + # This should be 
set to block, but would get to some extreme queuing/latency! + q_det = device.getOutputQueue("detections", 4, blocking=False) + + q_rec = device.getOutputQueue("recognitions", 4, blocking=True) + + q_manip_img = device.getInputQueue("manip_img") + q_manip_cfg = device.getInputQueue("manip_cfg") + q_manip_out = device.getOutputQueue("manip_out", 4, blocking=False) + + controlQueue = device.getInputQueue('control') + + frame = None + cropped_stacked = None + rotated_rectangles = [] + rec_pushed = 0 + rec_received = 0 + host_sync = HostSeqSync() + + class CTCCodec(object): + """ Convert between text-label and text-index """ + def __init__(self, characters): + # characters (str): set of the possible characters. + dict_character = list(characters) + + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + 1 + + self.characters = dict_character + #print(self.characters) + #input() + def decode(self, preds): + """ convert text-index into text-label. """ + texts = [] + index = 0 + # Select max probabilty (greedy decoding) then decode index to character + preds = preds.astype(np.float16) + preds_index = np.argmax(preds, 2) + preds_index = preds_index.transpose(1, 0) + preds_index_reshape = preds_index.reshape(-1) + preds_sizes = np.array([preds_index.shape[1]] * preds_index.shape[0]) + + for l in preds_sizes: + t = preds_index_reshape[index:index + l] + + # NOTE: t might be zero size + if t.shape[0] == 0: + continue + + char_list = [] + for i in range(l): + # removing repeated characters and blank. + if not (i > 0 and t[i - 1] == t[i]): + if self.characters[t[i]] != '#': + char_list.append(self.characters[t[i]]) + text = ''.join(char_list) + texts.append(text) + + index += l + + return texts + + characters = '0123456789abcdefghijklmnopqrstuvwxyz#' + codec = CTCCodec(characters) + + ctrl = dai.CameraControl() + ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.CONTINUOUS_VIDEO) + ctrl.setAutoFocusTrigger() + controlQueue.send(ctrl) + + while True: + vid_in = q_vid.tryGet() + if vid_in is not None: + host_sync.add_msg(vid_in) + + # Multiple recognition results may be available, read until queue is empty + while True: + in_rec = q_rec.tryGet() + if in_rec is None: + break + rec_data = bboxes = np.array(in_rec.getFirstLayerFp16()).reshape(30,1,37) + decoded_text = codec.decode(rec_data)[0] + pos = rotated_rectangles[rec_received] + print("{:2}: {:20}".format(rec_received, decoded_text), + "center({:3},{:3}) size({:3},{:3}) angle{:5.1f} deg".format( + int(pos[0][0]), int(pos[0][1]), pos[1][0], pos[1][1], pos[2])) + # Draw the text on the right side of 'cropped_stacked' - placeholder + if cropped_stacked is not None: + cv2.putText(cropped_stacked, decoded_text, + (120 + 10 , 32 * rec_received + 24), + cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2) + cv2.imshow('cropped_stacked', cropped_stacked) + rec_received += 1 + + if cv2.waitKey(1) == ord('q'): + break + + if rec_received >= rec_pushed: + in_det = q_det.tryGet() + if in_det is not None: + frame = host_sync.get_msg(in_det.getSequenceNum()).getCvFrame().copy() + + scores, geom1, geom2 = to_tensor_result(in_det).values() + scores = np.reshape(scores, (1, 1, 64, 64)) + geom1 = np.reshape(geom1, (1, 4, 64, 64)) + geom2 = np.reshape(geom2, (1, 1, 64, 64)) + + bboxes, confs, angles = east.decode_predictions(scores, geom1, geom2) + boxes, angles = east.non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles)) + rotated_rectangles = [ + east.get_cv_rotated_rect(bbox, angle * -1) + for (bbox, angle) in 
zip(boxes, angles) + ] + + rec_received = 0 + rec_pushed = len(rotated_rectangles) + if rec_pushed: + print("====== Pushing for recognition, count:", rec_pushed) + cropped_stacked = None + for idx, rotated_rect in enumerate(rotated_rectangles): + # Detections are done on 256x256 frames, we are sending back 1024x1024 + # That's why we multiply center and size values by 4 + rotated_rect[0][0] = rotated_rect[0][0] * 4 + rotated_rect[0][1] = rotated_rect[0][1] * 4 + rotated_rect[1][0] = rotated_rect[1][0] * 4 + rotated_rect[1][1] = rotated_rect[1][1] * 4 + + # Draw detection crop area on input frame + points = np.int0(cv2.boxPoints(rotated_rect)) + print(rotated_rect) + cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, lineType=cv2.LINE_8) + + # TODO make it work taking args like in OpenCV: + # rr = ((256, 256), (128, 64), 30) + rr = dai.RotatedRect() + rr.center.x = rotated_rect[0][0] + rr.center.y = rotated_rect[0][1] + rr.size.width = rotated_rect[1][0] + rr.size.height = rotated_rect[1][1] + rr.angle = rotated_rect[2] + cfg = dai.ImageManipConfig() + cfg.setCropRotatedRect(rr, False) + cfg.setResize(120, 32) + # Send frame and config to device + if idx == 0: + w,h,c = frame.shape + imgFrame = dai.ImgFrame() + imgFrame.setData(to_planar(frame)) + imgFrame.setType(dai.ImgFrame.Type.BGR888p) + imgFrame.setWidth(w) + imgFrame.setHeight(h) + q_manip_img.send(imgFrame) + else: + cfg.setReusePreviousImage(True) + q_manip_cfg.send(cfg) + + # Get manipulated image from the device + transformed = q_manip_out.get().getCvFrame() + + rec_placeholder_img = np.zeros((32, 200, 3), np.uint8) + transformed = np.hstack((transformed, rec_placeholder_img)) + if cropped_stacked is None: + cropped_stacked = transformed + else: + cropped_stacked = np.vstack((cropped_stacked, transformed)) + + if cropped_stacked is not None: + cv2.imshow('cropped_stacked', cropped_stacked) + + if frame is not None: + cv2.imshow('frame', frame) + + key = cv2.waitKey(1) + if key == ord('q'): + break + elif key == ord('t'): + print("Autofocus trigger (and disable continuous)") + ctrl = dai.CameraControl() + ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.AUTO) + ctrl.setAutoFocusTrigger() + controlQueue.send(ctrl) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index faf953c..521893e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ opencv-python~=4.5.5.62 google~=3.0.0 google-api-core~=2.4.0 setuptools==60.5.0 -protobuf3 \ No newline at end of file +protobuf3 +blobconverter>=1.2.9 \ No newline at end of file