diff --git a/camera/depthai.py b/camera/depthai.py index 2e60965..243e9e5 100644 --- a/camera/depthai.py +++ b/camera/depthai.py @@ -10,6 +10,17 @@ import cv2 logger = logging.getLogger(__name__) +def to_tensor_result(packet): + return { + name: np.array(packet.getLayerFp16(name)) + for name in [tensor.name for tensor in packet.getRaw().tensors] + } + + +def to_planar(frame): + return frame.transpose(2, 0, 1).flatten() + + class FramePublisher: def __init__(self, mqtt_client: mqtt.Client, frame_topic: str, img_width: int, img_height: int): self._mqtt_client = mqtt_client @@ -22,6 +33,72 @@ class FramePublisher: logger.info("configure pipeline") pipeline = dai.Pipeline() + version = "2021.2" + pipeline.setOpenVINOVersion(version=dai.OpenVINO.Version.VERSION_2021_2) + + # colorCam = pipeline.create(dai.node.ColorCamera) + # colorCam.setPreviewSize(256, 256) + # colorCam.setVideoSize(1024, 1024) # 4 times larger in both axis + # colorCam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P) + # colorCam.setInterleaved(False) + # colorCam.setBoardSocket(dai.CameraBoardSocket.RGB) + # colorCam.setFps(10) + # + # controlIn = pipeline.create(dai.node.XLinkIn) + # controlIn.setStreamName('control') + # controlIn.out.link(colorCam.inputControl) + # + # cam_xout = pipeline.create(dai.node.XLinkOut) + # cam_xout.setStreamName('video') + # colorCam.video.link(cam_xout.input) + + # --------------------------------------- + # 1st stage NN - text-detection + # --------------------------------------- + + nn = pipeline.create(dai.node.NeuralNetwork) + nn.setBlobPath( + blobconverter.from_zoo(name="east_text_detection_256x256", zoo_type="depthai", shaves=6, version=version)) + colorCam.preview.link(nn.input) + + nn_xout = pipeline.create(dai.node.XLinkOut) + nn_xout.setStreamName('detections') + nn.out.link(nn_xout.input) + + # --------------------------------------- + # 2nd stage NN - text-recognition-0012 + # --------------------------------------- + + manip = pipeline.create(dai.node.ImageManip) + manip.setWaitForConfigInput(True) + + manip_img = pipeline.create(dai.node.XLinkIn) + manip_img.setStreamName('manip_img') + manip_img.out.link(manip.inputImage) + + manip_cfg = pipeline.create(dai.node.XLinkIn) + manip_cfg.setStreamName('manip_cfg') + manip_cfg.out.link(manip.inputConfig) + + manip_xout = pipeline.create(dai.node.XLinkOut) + manip_xout.setStreamName('manip_out') + + nn2 = pipeline.create(dai.node.NeuralNetwork) + nn2.setBlobPath(blobconverter.from_zoo(name="text-recognition-0012", shaves=6, version=version)) + nn2.setNumInferenceThreads(2) + manip.out.link(nn2.input) + manip.out.link(manip_xout.input) + + nn2_xout = pipeline.create(dai.node.XLinkOut) + nn2_xout.setStreamName("recognitions") + nn2.out.link(nn2_xout.input) + + + + + + + cam_rgb = pipeline.create(dai.node.ColorCamera) xout_rgb = pipeline.create(dai.node.XLinkOut) @@ -40,6 +117,150 @@ class FramePublisher: return pipeline def run(self): + + with dai.Device(self._pipeline) as device: + q_vid = device.getOutputQueue("video", 4, blocking=False) + # This should be set to block, but would get to some extreme queuing/latency! + q_det = device.getOutputQueue("detections", 4, blocking=False) + + q_rec = device.getOutputQueue("recognitions", 4, blocking=True) + + q_manip_img = device.getInputQueue("manip_img") + q_manip_cfg = device.getInputQueue("manip_cfg") + q_manip_out = device.getOutputQueue("manip_out", 4, blocking=False) + + controlQueue = device.getInputQueue('control') + + frame = None + cropped_stacked = None + rotated_rectangles = [] + rec_pushed = 0 + rec_received = 0 + host_sync = HostSeqSync() + + characters = '0123456789abcdefghijklmnopqrstuvwxyz#' + codec = CTCCodec(characters) + + ctrl = dai.CameraControl() + ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.CONTINUOUS_VIDEO) + ctrl.setAutoFocusTrigger() + controlQueue.send(ctrl) + + while True: + vid_in = q_vid.tryGet() + if vid_in is not None: + host_sync.add_msg(vid_in) + + # Multiple recognition results may be available, read until queue is empty + while True: + in_rec = q_rec.tryGet() + if in_rec is None: + break + rec_data = bboxes = np.array(in_rec.getFirstLayerFp16()).reshape(30, 1, 37) + decoded_text = codec.decode(rec_data)[0] + pos = rotated_rectangles[rec_received] + print("{:2}: {:20}".format(rec_received, decoded_text), + "center({:3},{:3}) size({:3},{:3}) angle{:5.1f} deg".format( + int(pos[0][0]), int(pos[0][1]), pos[1][0], pos[1][1], pos[2])) + # Draw the text on the right side of 'cropped_stacked' - placeholder + if cropped_stacked is not None: + cv2.putText(cropped_stacked, decoded_text, + (120 + 10, 32 * rec_received + 24), + cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2) + cv2.imshow('cropped_stacked', cropped_stacked) + rec_received += 1 + + if cv2.waitKey(1) == ord('q'): + break + + if rec_received >= rec_pushed: + in_det = q_det.tryGet() + if in_det is not None: + frame = host_sync.get_msg(in_det.getSequenceNum()).getCvFrame().copy() + + scores, geom1, geom2 = to_tensor_result(in_det).values() + scores = np.reshape(scores, (1, 1, 64, 64)) + geom1 = np.reshape(geom1, (1, 4, 64, 64)) + geom2 = np.reshape(geom2, (1, 1, 64, 64)) + + bboxes, confs, angles = east.decode_predictions(scores, geom1, geom2) + boxes, angles = east.non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles)) + rotated_rectangles = [ + east.get_cv_rotated_rect(bbox, angle * -1) + for (bbox, angle) in zip(boxes, angles) + ] + + rec_received = 0 + rec_pushed = len(rotated_rectangles) + if rec_pushed: + print("====== Pushing for recognition, count:", rec_pushed) + cropped_stacked = None + for idx, rotated_rect in enumerate(rotated_rectangles): + # Detections are done on 256x256 frames, we are sending back 1024x1024 + # That's why we multiply center and size values by 4 + rotated_rect[0][0] = rotated_rect[0][0] * 4 + rotated_rect[0][1] = rotated_rect[0][1] * 4 + rotated_rect[1][0] = rotated_rect[1][0] * 4 + rotated_rect[1][1] = rotated_rect[1][1] * 4 + + # Draw detection crop area on input frame + points = np.int0(cv2.boxPoints(rotated_rect)) + print(rotated_rect) + cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, + lineType=cv2.LINE_8) + + # TODO make it work taking args like in OpenCV: + # rr = ((256, 256), (128, 64), 30) + rr = dai.RotatedRect() + rr.center.x = rotated_rect[0][0] + rr.center.y = rotated_rect[0][1] + rr.size.width = rotated_rect[1][0] + rr.size.height = rotated_rect[1][1] + rr.angle = rotated_rect[2] + cfg = dai.ImageManipConfig() + cfg.setCropRotatedRect(rr, False) + cfg.setResize(120, 32) + # Send frame and config to device + if idx == 0: + w, h, c = frame.shape + imgFrame = dai.ImgFrame() + imgFrame.setData(to_planar(frame)) + imgFrame.setType(dai.ImgFrame.Type.BGR888p) + imgFrame.setWidth(w) + imgFrame.setHeight(h) + q_manip_img.send(imgFrame) + else: + cfg.setReusePreviousImage(True) + q_manip_cfg.send(cfg) + + # Get manipulated image from the device + transformed = q_manip_out.get().getCvFrame() + + rec_placeholder_img = np.zeros((32, 200, 3), np.uint8) + transformed = np.hstack((transformed, rec_placeholder_img)) + if cropped_stacked is None: + cropped_stacked = transformed + else: + cropped_stacked = np.vstack((cropped_stacked, transformed)) + + if cropped_stacked is not None: + cv2.imshow('cropped_stacked', cropped_stacked) + + if frame is not None: + cv2.imshow('frame', frame) + + key = cv2.waitKey(1) + if key == ord('q'): + break + elif key == ord('t'): + print("Autofocus trigger (and disable continuous)") + ctrl = dai.CameraControl() + ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.AUTO) + ctrl.setAutoFocusTrigger() + controlQueue.send(ctrl) + + + # Connect to device and start pipeline with dai.Device(self._pipeline) as device: logger.info('MxId: %s', device.getDeviceInfo().getMxId()) diff --git a/camera/east.py b/camera/east.py new file mode 100644 index 0000000..1b95c92 --- /dev/null +++ b/camera/east.py @@ -0,0 +1,232 @@ +import cv2 +import depthai +import numpy as np + +_conf_threshold = 0.5 + + +def get_cv_rotated_rect(bbox, angle): + x0, y0, x1, y1 = bbox + width = abs(x0 - x1) + height = abs(y0 - y1) + x = x0 + width * 0.5 + y = y0 + height * 0.5 + return [x.tolist(), y.tolist()], [width.tolist(), height.tolist()], np.rad2deg(angle) + + +def rotated_Rectangle(bbox, angle): + X0, Y0, X1, Y1 = bbox + width = abs(X0 - X1) + height = abs(Y0 - Y1) + x = int(X0 + width * 0.5) + y = int(Y0 + height * 0.5) + + pt1_1 = (int(x + width / 2), int(y + height / 2)) + pt2_1 = (int(x + width / 2), int(y - height / 2)) + pt3_1 = (int(x - width / 2), int(y - height / 2)) + pt4_1 = (int(x - width / 2), int(y + height / 2)) + + t = np.array([[np.cos(angle), -np.sin(angle), x - x * np.cos(angle) + y * np.sin(angle)], + [np.sin(angle), np.cos(angle), y - x * np.sin(angle) - y * np.cos(angle)], + [0, 0, 1]]) + + tmp_pt1_1 = np.array([[pt1_1[0]], [pt1_1[1]], [1]]) + tmp_pt1_2 = np.dot(t, tmp_pt1_1) + pt1_2 = (int(tmp_pt1_2[0][0]), int(tmp_pt1_2[1][0])) + + tmp_pt2_1 = np.array([[pt2_1[0]], [pt2_1[1]], [1]]) + tmp_pt2_2 = np.dot(t, tmp_pt2_1) + pt2_2 = (int(tmp_pt2_2[0][0]), int(tmp_pt2_2[1][0])) + + tmp_pt3_1 = np.array([[pt3_1[0]], [pt3_1[1]], [1]]) + tmp_pt3_2 = np.dot(t, tmp_pt3_1) + pt3_2 = (int(tmp_pt3_2[0][0]), int(tmp_pt3_2[1][0])) + + tmp_pt4_1 = np.array([[pt4_1[0]], [pt4_1[1]], [1]]) + tmp_pt4_2 = np.dot(t, tmp_pt4_1) + pt4_2 = (int(tmp_pt4_2[0][0]), int(tmp_pt4_2[1][0])) + + points = np.array([pt1_2, pt2_2, pt3_2, pt4_2]) + + return points + + +def non_max_suppression(boxes, probs=None, angles=None, overlapThresh=0.3): + # if there are no boxes, return an empty list + if len(boxes) == 0: + return [], [] + + # if the bounding boxes are integers, convert them to floats -- this + # is important since we'll be doing a bunch of divisions + if boxes.dtype.kind == "i": + boxes = boxes.astype("float") + + # initialize the list of picked indexes + pick = [] + + # grab the coordinates of the bounding boxes + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + # compute the area of the bounding boxes and grab the indexes to sort + # (in the case that no probabilities are provided, simply sort on the bottom-left y-coordinate) + area = (x2 - x1 + 1) * (y2 - y1 + 1) + idxs = y2 + + # if probabilities are provided, sort on them instead + if probs is not None: + idxs = probs + + # sort the indexes + idxs = np.argsort(idxs) + + # keep looping while some indexes still remain in the indexes list + while len(idxs) > 0: + # grab the last index in the indexes list and add the index value to the list of picked indexes + last = len(idxs) - 1 + i = idxs[last] + pick.append(i) + + # find the largest (x, y) coordinates for the start of the bounding box and the smallest (x, y) coordinates + # for the end of the bounding box + xx1 = np.maximum(x1[i], x1[idxs[:last]]) + yy1 = np.maximum(y1[i], y1[idxs[:last]]) + xx2 = np.minimum(x2[i], x2[idxs[:last]]) + yy2 = np.minimum(y2[i], y2[idxs[:last]]) + + # compute the width and height of the bounding box + w = np.maximum(0, xx2 - xx1 + 1) + h = np.maximum(0, yy2 - yy1 + 1) + + # compute the ratio of overlap + overlap = (w * h) / area[idxs[:last]] + + # delete all indexes from the index list that have overlap greater than the provided overlap threshold + idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0]))) + + # return only the bounding boxes that were picked + return boxes[pick].astype("int"), angles[pick] + + +def decode_predictions(scores, geometry1, geometry2): + # grab the number of rows and columns from the scores volume, then + # initialize our set of bounding box rectangles and corresponding + # confidence scores + (numRows, numCols) = scores.shape[2:4] + rects = [] + confidences = [] + angles = [] + + # loop over the number of rows + for y in range(0, numRows): + # extract the scores (probabilities), followed by the + # geometrical data used to derive potential bounding box + # coordinates that surround text + scoresData = scores[0, 0, y] + xData0 = geometry1[0, 0, y] + xData1 = geometry1[0, 1, y] + xData2 = geometry1[0, 2, y] + xData3 = geometry1[0, 3, y] + anglesData = geometry2[0, 0, y] + + # loop over the number of columns + for x in range(0, numCols): + # if our score does not have sufficient probability, + # ignore it + if scoresData[x] < _conf_threshold: + continue + + # compute the offset factor as our resulting feature + # maps will be 4x smaller than the input image + (offsetX, offsetY) = (x * 4.0, y * 4.0) + + # extract the rotation angle for the prediction and + # then compute the sin and cosine + angle = anglesData[x] + cos = np.cos(angle) + sin = np.sin(angle) + + # use the geometry volume to derive the width and height + # of the bounding box + h = xData0[x] + xData2[x] + w = xData1[x] + xData3[x] + + # compute both the starting and ending (x, y)-coordinates + # for the text prediction bounding box + endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x])) + endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x])) + startX = int(endX - w) + startY = int(endY - h) + + # add the bounding box coordinates and probability score + # to our respective lists + rects.append((startX, startY, endX, endY)) + confidences.append(scoresData[x]) + angles.append(angle) + + # return a tuple of the bounding boxes and associated confidences + return (rects, confidences, angles) + + +def decode_east(nnet_packet, **kwargs): + scores = nnet_packet.get_tensor(0) + geometry1 = nnet_packet.get_tensor(1) + geometry2 = nnet_packet.get_tensor(2) + bboxes, confs, angles = decode_predictions(scores, geometry1, geometry2 + ) + boxes, angles = non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles)) + boxesangles = (boxes, angles) + return boxesangles + + +def show_east(boxesangles, frame, **kwargs): + bboxes = boxesangles[0] + angles = boxesangles[1] + for ((X0, Y0, X1, Y1), angle) in zip(bboxes, angles): + width = abs(X0 - X1) + height = abs(Y0 - Y1) + cX = int(X0 + width * 0.5) + cY = int(Y0 + height * 0.5) + + rotRect = ((cX, cY), ((X1 - X0), (Y1 - Y0)), angle * (-1)) + points = rotated_Rectangle(frame, rotRect, color=(255, 0, 0), thickness=1) + cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, lineType=cv2.LINE_8) + + return frame + + +def order_points(pts): + rect = np.zeros((4, 2), dtype="float32") + s = pts.sum(axis=1) + rect[0] = pts[np.argmin(s)] + rect[2] = pts[np.argmax(s)] + diff = np.diff(pts, axis=1) + rect[1] = pts[np.argmin(diff)] + rect[3] = pts[np.argmax(diff)] + return rect + + +def four_point_transform(image, pts): + rect = order_points(pts) + (tl, tr, br, bl) = rect + + widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2)) + widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2)) + maxWidth = max(int(widthA), int(widthB)) + + heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2)) + heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2)) + maxHeight = max(int(heightA), int(heightB)) + + dst = np.array([ + [0, 0], + [maxWidth - 1, 0], + [maxWidth - 1, maxHeight - 1], + [0, maxHeight - 1]], dtype="float32") + + M = cv2.getPerspectiveTransform(rect, dst) + warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight)) + + return warped diff --git a/camera/text.py b/camera/text.py new file mode 100644 index 0000000..c959e35 --- /dev/null +++ b/camera/text.py @@ -0,0 +1,61 @@ + +class HostSeqSync: + def __init__(self): + self.imfFrames = [] + + def add_msg(self, msg): + self.imfFrames.append(msg) + + def get_msg(self, target_seq): + for i, imgFrame in enumerate(self.imfFrames): + if target_seq == imgFrame.getSequenceNum(): + self.imfFrames = self.imfFrames[i:] + break + return self.imfFrames[0] + + +class CTCCodec(object): + """ Convert between text-label and text-index """ + + def __init__(self, characters): + # characters (str): set of the possible characters. + dict_character = list(characters) + + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + 1 + + self.characters = dict_character + # print(self.characters) + # input() + + def decode(self, preds): + """ convert text-index into text-label. """ + texts = [] + index = 0 + # Select max probabilty (greedy decoding) then decode index to character + preds = preds.astype(np.float16) + preds_index = np.argmax(preds, 2) + preds_index = preds_index.transpose(1, 0) + preds_index_reshape = preds_index.reshape(-1) + preds_sizes = np.array([preds_index.shape[1]] * preds_index.shape[0]) + + for l in preds_sizes: + t = preds_index_reshape[index:index + l] + + # NOTE: t might be zero size + if t.shape[0] == 0: + continue + + char_list = [] + for i in range(l): + # removing repeated characters and blank. + if not (i > 0 and t[i - 1] == t[i]): + if self.characters[t[i]] != '#': + char_list.append(self.characters[t[i]]) + text = ''.join(char_list) + texts.append(text) + + index += l + + return texts diff --git a/east.py b/east.py new file mode 100644 index 0000000..7eb13af --- /dev/null +++ b/east.py @@ -0,0 +1,229 @@ +import cv2 +import depthai +import numpy as np + +_conf_threshold = 0.5 + +def get_cv_rotated_rect(bbox, angle): + x0, y0, x1, y1 = bbox + width = abs(x0 - x1) + height = abs(y0 - y1) + x = x0 + width * 0.5 + y = y0 + height * 0.5 + return ([x.tolist(), y.tolist()], [width.tolist(), height.tolist()], np.rad2deg(angle)) + +def rotated_Rectangle(bbox, angle): + X0, Y0, X1, Y1 = bbox + width = abs(X0 - X1) + height = abs(Y0 - Y1) + x = int(X0 + width * 0.5) + y = int(Y0 + height * 0.5) + + pt1_1 = (int(x + width / 2), int(y + height / 2)) + pt2_1 = (int(x + width / 2), int(y - height / 2)) + pt3_1 = (int(x - width / 2), int(y - height / 2)) + pt4_1 = (int(x - width / 2), int(y + height / 2)) + + t = np.array([[np.cos(angle), -np.sin(angle), x - x * np.cos(angle) + y * np.sin(angle)], + [np.sin(angle), np.cos(angle), y - x * np.sin(angle) - y * np.cos(angle)], + [0, 0, 1]]) + + tmp_pt1_1 = np.array([[pt1_1[0]], [pt1_1[1]], [1]]) + tmp_pt1_2 = np.dot(t, tmp_pt1_1) + pt1_2 = (int(tmp_pt1_2[0][0]), int(tmp_pt1_2[1][0])) + + tmp_pt2_1 = np.array([[pt2_1[0]], [pt2_1[1]], [1]]) + tmp_pt2_2 = np.dot(t, tmp_pt2_1) + pt2_2 = (int(tmp_pt2_2[0][0]), int(tmp_pt2_2[1][0])) + + tmp_pt3_1 = np.array([[pt3_1[0]], [pt3_1[1]], [1]]) + tmp_pt3_2 = np.dot(t, tmp_pt3_1) + pt3_2 = (int(tmp_pt3_2[0][0]), int(tmp_pt3_2[1][0])) + + tmp_pt4_1 = np.array([[pt4_1[0]], [pt4_1[1]], [1]]) + tmp_pt4_2 = np.dot(t, tmp_pt4_1) + pt4_2 = (int(tmp_pt4_2[0][0]), int(tmp_pt4_2[1][0])) + + points = np.array([pt1_2, pt2_2, pt3_2, pt4_2]) + + return points + + +def non_max_suppression(boxes, probs=None, angles=None, overlapThresh=0.3): + # if there are no boxes, return an empty list + if len(boxes) == 0: + return [], [] + + # if the bounding boxes are integers, convert them to floats -- this + # is important since we'll be doing a bunch of divisions + if boxes.dtype.kind == "i": + boxes = boxes.astype("float") + + # initialize the list of picked indexes + pick = [] + + # grab the coordinates of the bounding boxes + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + # compute the area of the bounding boxes and grab the indexes to sort + # (in the case that no probabilities are provided, simply sort on the bottom-left y-coordinate) + area = (x2 - x1 + 1) * (y2 - y1 + 1) + idxs = y2 + + # if probabilities are provided, sort on them instead + if probs is not None: + idxs = probs + + # sort the indexes + idxs = np.argsort(idxs) + + # keep looping while some indexes still remain in the indexes list + while len(idxs) > 0: + # grab the last index in the indexes list and add the index value to the list of picked indexes + last = len(idxs) - 1 + i = idxs[last] + pick.append(i) + + # find the largest (x, y) coordinates for the start of the bounding box and the smallest (x, y) coordinates for the end of the bounding box + xx1 = np.maximum(x1[i], x1[idxs[:last]]) + yy1 = np.maximum(y1[i], y1[idxs[:last]]) + xx2 = np.minimum(x2[i], x2[idxs[:last]]) + yy2 = np.minimum(y2[i], y2[idxs[:last]]) + + # compute the width and height of the bounding box + w = np.maximum(0, xx2 - xx1 + 1) + h = np.maximum(0, yy2 - yy1 + 1) + + # compute the ratio of overlap + overlap = (w * h) / area[idxs[:last]] + + # delete all indexes from the index list that have overlap greater than the provided overlap threshold + idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0]))) + + # return only the bounding boxes that were picked + return boxes[pick].astype("int"), angles[pick] + + +def decode_predictions(scores, geometry1, geometry2): + # grab the number of rows and columns from the scores volume, then + # initialize our set of bounding box rectangles and corresponding + # confidence scores + (numRows, numCols) = scores.shape[2:4] + rects = [] + confidences = [] + angles = [] + + # loop over the number of rows + for y in range(0, numRows): + # extract the scores (probabilities), followed by the + # geometrical data used to derive potential bounding box + # coordinates that surround text + scoresData = scores[0, 0, y] + xData0 = geometry1[0, 0, y] + xData1 = geometry1[0, 1, y] + xData2 = geometry1[0, 2, y] + xData3 = geometry1[0, 3, y] + anglesData = geometry2[0, 0, y] + + # loop over the number of columns + for x in range(0, numCols): + # if our score does not have sufficient probability, + # ignore it + if scoresData[x] < _conf_threshold: + continue + + # compute the offset factor as our resulting feature + # maps will be 4x smaller than the input image + (offsetX, offsetY) = (x * 4.0, y * 4.0) + + # extract the rotation angle for the prediction and + # then compute the sin and cosine + angle = anglesData[x] + cos = np.cos(angle) + sin = np.sin(angle) + + # use the geometry volume to derive the width and height + # of the bounding box + h = xData0[x] + xData2[x] + w = xData1[x] + xData3[x] + + # compute both the starting and ending (x, y)-coordinates + # for the text prediction bounding box + endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x])) + endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x])) + startX = int(endX - w) + startY = int(endY - h) + + # add the bounding box coordinates and probability score + # to our respective lists + rects.append((startX, startY, endX, endY)) + confidences.append(scoresData[x]) + angles.append(angle) + + # return a tuple of the bounding boxes and associated confidences + return (rects, confidences, angles) + + +def decode_east(nnet_packet, **kwargs): + scores = nnet_packet.get_tensor(0) + geometry1 = nnet_packet.get_tensor(1) + geometry2 = nnet_packet.get_tensor(2) + bboxes, confs, angles = decode_predictions(scores, geometry1, geometry2 + ) + boxes, angles = non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles)) + boxesangles = (boxes, angles) + return boxesangles + + +def show_east(boxesangles, frame, **kwargs): + bboxes = boxesangles[0] + angles = boxesangles[1] + for ((X0, Y0, X1, Y1), angle) in zip(bboxes, angles): + width = abs(X0 - X1) + height = abs(Y0 - Y1) + cX = int(X0 + width * 0.5) + cY = int(Y0 + height * 0.5) + + rotRect = ((cX, cY), ((X1 - X0), (Y1 - Y0)), angle * (-1)) + points = rotated_Rectangle(frame, rotRect, color=(255, 0, 0), thickness=1) + cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, lineType=cv2.LINE_8) + + return frame + + +def order_points(pts): + rect = np.zeros((4, 2), dtype="float32") + s = pts.sum(axis=1) + rect[0] = pts[np.argmin(s)] + rect[2] = pts[np.argmax(s)] + diff = np.diff(pts, axis=1) + rect[1] = pts[np.argmin(diff)] + rect[3] = pts[np.argmax(diff)] + return rect + + +def four_point_transform(image, pts): + rect = order_points(pts) + (tl, tr, br, bl) = rect + + widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2)) + widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2)) + maxWidth = max(int(widthA), int(widthB)) + + heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2)) + heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2)) + maxHeight = max(int(heightA), int(heightB)) + + dst = np.array([ + [0, 0], + [maxWidth - 1, 0], + [maxWidth - 1, maxHeight - 1], + [0, maxHeight - 1]], dtype="float32") + + M = cv2.getPerspectiveTransform(rect, dst) + warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight)) + + return warped \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..61fa4e2 --- /dev/null +++ b/main.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 + +from pathlib import Path + +import cv2 +import numpy as np +import depthai as dai +import east +import blobconverter + +class HostSeqSync: + def __init__(self): + self.imfFrames = [] + def add_msg(self, msg): + self.imfFrames.append(msg) + def get_msg(self, target_seq): + for i, imgFrame in enumerate(self.imfFrames): + if target_seq == imgFrame.getSequenceNum(): + self.imfFrames = self.imfFrames[i:] + break + return self.imfFrames[0] + +pipeline = dai.Pipeline() +version = "2021.2" +pipeline.setOpenVINOVersion(version=dai.OpenVINO.Version.VERSION_2021_2) + +colorCam = pipeline.create(dai.node.ColorCamera) +colorCam.setPreviewSize(256, 256) +colorCam.setVideoSize(1024, 1024) # 4 times larger in both axis +colorCam.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P) +colorCam.setInterleaved(False) +colorCam.setBoardSocket(dai.CameraBoardSocket.RGB) +colorCam.setFps(10) + +controlIn = pipeline.create(dai.node.XLinkIn) +controlIn.setStreamName('control') +controlIn.out.link(colorCam.inputControl) + +cam_xout = pipeline.create(dai.node.XLinkOut) +cam_xout.setStreamName('video') +colorCam.video.link(cam_xout.input) + +# --------------------------------------- +# 1st stage NN - text-detection +# --------------------------------------- + +nn = pipeline.create(dai.node.NeuralNetwork) +nn.setBlobPath(blobconverter.from_zoo(name="east_text_detection_256x256",zoo_type="depthai",shaves=6, version=version)) +colorCam.preview.link(nn.input) + +nn_xout = pipeline.create(dai.node.XLinkOut) +nn_xout.setStreamName('detections') +nn.out.link(nn_xout.input) + +# --------------------------------------- +# 2nd stage NN - text-recognition-0012 +# --------------------------------------- + +manip = pipeline.create(dai.node.ImageManip) +manip.setWaitForConfigInput(True) + +manip_img = pipeline.create(dai.node.XLinkIn) +manip_img.setStreamName('manip_img') +manip_img.out.link(manip.inputImage) + +manip_cfg = pipeline.create(dai.node.XLinkIn) +manip_cfg.setStreamName('manip_cfg') +manip_cfg.out.link(manip.inputConfig) + +manip_xout = pipeline.create(dai.node.XLinkOut) +manip_xout.setStreamName('manip_out') + +nn2 = pipeline.create(dai.node.NeuralNetwork) +nn2.setBlobPath(blobconverter.from_zoo(name="text-recognition-0012", shaves=6, version=version)) +nn2.setNumInferenceThreads(2) +manip.out.link(nn2.input) +manip.out.link(manip_xout.input) + +nn2_xout = pipeline.create(dai.node.XLinkOut) +nn2_xout.setStreamName("recognitions") +nn2.out.link(nn2_xout.input) + +def to_tensor_result(packet): + return { + name: np.array(packet.getLayerFp16(name)) + for name in [tensor.name for tensor in packet.getRaw().tensors] + } + +def to_planar(frame): + return frame.transpose(2, 0, 1).flatten() + +with dai.Device(pipeline) as device: + q_vid = device.getOutputQueue("video", 4, blocking=False) + # This should be set to block, but would get to some extreme queuing/latency! + q_det = device.getOutputQueue("detections", 4, blocking=False) + + q_rec = device.getOutputQueue("recognitions", 4, blocking=True) + + q_manip_img = device.getInputQueue("manip_img") + q_manip_cfg = device.getInputQueue("manip_cfg") + q_manip_out = device.getOutputQueue("manip_out", 4, blocking=False) + + controlQueue = device.getInputQueue('control') + + frame = None + cropped_stacked = None + rotated_rectangles = [] + rec_pushed = 0 + rec_received = 0 + host_sync = HostSeqSync() + + class CTCCodec(object): + """ Convert between text-label and text-index """ + def __init__(self, characters): + # characters (str): set of the possible characters. + dict_character = list(characters) + + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + 1 + + self.characters = dict_character + #print(self.characters) + #input() + def decode(self, preds): + """ convert text-index into text-label. """ + texts = [] + index = 0 + # Select max probabilty (greedy decoding) then decode index to character + preds = preds.astype(np.float16) + preds_index = np.argmax(preds, 2) + preds_index = preds_index.transpose(1, 0) + preds_index_reshape = preds_index.reshape(-1) + preds_sizes = np.array([preds_index.shape[1]] * preds_index.shape[0]) + + for l in preds_sizes: + t = preds_index_reshape[index:index + l] + + # NOTE: t might be zero size + if t.shape[0] == 0: + continue + + char_list = [] + for i in range(l): + # removing repeated characters and blank. + if not (i > 0 and t[i - 1] == t[i]): + if self.characters[t[i]] != '#': + char_list.append(self.characters[t[i]]) + text = ''.join(char_list) + texts.append(text) + + index += l + + return texts + + characters = '0123456789abcdefghijklmnopqrstuvwxyz#' + codec = CTCCodec(characters) + + ctrl = dai.CameraControl() + ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.CONTINUOUS_VIDEO) + ctrl.setAutoFocusTrigger() + controlQueue.send(ctrl) + + while True: + vid_in = q_vid.tryGet() + if vid_in is not None: + host_sync.add_msg(vid_in) + + # Multiple recognition results may be available, read until queue is empty + while True: + in_rec = q_rec.tryGet() + if in_rec is None: + break + rec_data = bboxes = np.array(in_rec.getFirstLayerFp16()).reshape(30,1,37) + decoded_text = codec.decode(rec_data)[0] + pos = rotated_rectangles[rec_received] + print("{:2}: {:20}".format(rec_received, decoded_text), + "center({:3},{:3}) size({:3},{:3}) angle{:5.1f} deg".format( + int(pos[0][0]), int(pos[0][1]), pos[1][0], pos[1][1], pos[2])) + # Draw the text on the right side of 'cropped_stacked' - placeholder + if cropped_stacked is not None: + cv2.putText(cropped_stacked, decoded_text, + (120 + 10 , 32 * rec_received + 24), + cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2) + cv2.imshow('cropped_stacked', cropped_stacked) + rec_received += 1 + + if cv2.waitKey(1) == ord('q'): + break + + if rec_received >= rec_pushed: + in_det = q_det.tryGet() + if in_det is not None: + frame = host_sync.get_msg(in_det.getSequenceNum()).getCvFrame().copy() + + scores, geom1, geom2 = to_tensor_result(in_det).values() + scores = np.reshape(scores, (1, 1, 64, 64)) + geom1 = np.reshape(geom1, (1, 4, 64, 64)) + geom2 = np.reshape(geom2, (1, 1, 64, 64)) + + bboxes, confs, angles = east.decode_predictions(scores, geom1, geom2) + boxes, angles = east.non_max_suppression(np.array(bboxes), probs=confs, angles=np.array(angles)) + rotated_rectangles = [ + east.get_cv_rotated_rect(bbox, angle * -1) + for (bbox, angle) in zip(boxes, angles) + ] + + rec_received = 0 + rec_pushed = len(rotated_rectangles) + if rec_pushed: + print("====== Pushing for recognition, count:", rec_pushed) + cropped_stacked = None + for idx, rotated_rect in enumerate(rotated_rectangles): + # Detections are done on 256x256 frames, we are sending back 1024x1024 + # That's why we multiply center and size values by 4 + rotated_rect[0][0] = rotated_rect[0][0] * 4 + rotated_rect[0][1] = rotated_rect[0][1] * 4 + rotated_rect[1][0] = rotated_rect[1][0] * 4 + rotated_rect[1][1] = rotated_rect[1][1] * 4 + + # Draw detection crop area on input frame + points = np.int0(cv2.boxPoints(rotated_rect)) + print(rotated_rect) + cv2.polylines(frame, [points], isClosed=True, color=(255, 0, 0), thickness=1, lineType=cv2.LINE_8) + + # TODO make it work taking args like in OpenCV: + # rr = ((256, 256), (128, 64), 30) + rr = dai.RotatedRect() + rr.center.x = rotated_rect[0][0] + rr.center.y = rotated_rect[0][1] + rr.size.width = rotated_rect[1][0] + rr.size.height = rotated_rect[1][1] + rr.angle = rotated_rect[2] + cfg = dai.ImageManipConfig() + cfg.setCropRotatedRect(rr, False) + cfg.setResize(120, 32) + # Send frame and config to device + if idx == 0: + w,h,c = frame.shape + imgFrame = dai.ImgFrame() + imgFrame.setData(to_planar(frame)) + imgFrame.setType(dai.ImgFrame.Type.BGR888p) + imgFrame.setWidth(w) + imgFrame.setHeight(h) + q_manip_img.send(imgFrame) + else: + cfg.setReusePreviousImage(True) + q_manip_cfg.send(cfg) + + # Get manipulated image from the device + transformed = q_manip_out.get().getCvFrame() + + rec_placeholder_img = np.zeros((32, 200, 3), np.uint8) + transformed = np.hstack((transformed, rec_placeholder_img)) + if cropped_stacked is None: + cropped_stacked = transformed + else: + cropped_stacked = np.vstack((cropped_stacked, transformed)) + + if cropped_stacked is not None: + cv2.imshow('cropped_stacked', cropped_stacked) + + if frame is not None: + cv2.imshow('frame', frame) + + key = cv2.waitKey(1) + if key == ord('q'): + break + elif key == ord('t'): + print("Autofocus trigger (and disable continuous)") + ctrl = dai.CameraControl() + ctrl.setAutoFocusMode(dai.CameraControl.AutoFocusMode.AUTO) + ctrl.setAutoFocusTrigger() + controlQueue.send(ctrl) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index faf953c..521893e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ opencv-python~=4.5.5.62 google~=3.0.0 google-api-core~=2.4.0 setuptools==60.5.0 -protobuf3 \ No newline at end of file +protobuf3 +blobconverter>=1.2.9 \ No newline at end of file