# Import required modules
import argparse
import math

import cv2 as cv
import numpy as np

############ Add argument parser for command line arguments ############
parser = argparse.ArgumentParser(
    description='Use this script to run the TensorFlow implementation (https://github.com/argman/EAST) of '
                'EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)')
parser.add_argument('--input',
                    help='Path to input image or video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--model', required=True,
                    help='Path to a binary .pb file containing the trained network weights.')
parser.add_argument('--width', type=int, default=320,
                    help='Preprocess input image by resizing it to a specific width. It should be a multiple of 32.')
parser.add_argument('--height', type=int, default=320,
                    help='Preprocess input image by resizing it to a specific height. It should be a multiple of 32.')
parser.add_argument('--thr', type=float, default=0.5,
                    help='Confidence threshold.')
parser.add_argument('--nms', type=float, default=0.4,
                    help='Non-maximum suppression threshold.')
args = parser.parse_args()


############ Utility functions ############
def decode(scores, geometry, scoreThresh):
    detections = []
    confidences = []

    ############ CHECK DIMENSIONS AND SHAPES OF geometry AND scores ############
    assert len(scores.shape) == 4, "Incorrect dimensions of scores"
    assert len(geometry.shape) == 4, "Incorrect dimensions of geometry"
    assert scores.shape[0] == 1, "Invalid dimensions of scores"
    assert geometry.shape[0] == 1, "Invalid dimensions of geometry"
    assert scores.shape[1] == 1, "Invalid dimensions of scores"
    assert geometry.shape[1] == 5, "Invalid dimensions of geometry"
    assert scores.shape[2] == geometry.shape[2], "Invalid dimensions of scores and geometry"
    assert scores.shape[3] == geometry.shape[3], "Invalid dimensions of scores and geometry"

    height = scores.shape[2]
    width = scores.shape[3]
    for y in range(0, height):
        # Extract data from scores
        scoresData = scores[0][0][y]
        x0_data = geometry[0][0][y]
        x1_data = geometry[0][1][y]
        x2_data = geometry[0][2][y]
        x3_data = geometry[0][3][y]
        anglesData = geometry[0][4][y]
        for x in range(0, width):
            score = scoresData[x]

            # If score is lower than threshold score, move to next x
            if score < scoreThresh:
                continue

            # Calculate offset
            offsetX = x * 4.0
            offsetY = y * 4.0
            angle = anglesData[x]

            # Calculate cos and sin of angle
            cosA = math.cos(angle)
            sinA = math.sin(angle)
            h = x0_data[x] + x2_data[x]
            w = x1_data[x] + x3_data[x]

            # Calculate offset
            offset = ([offsetX + cosA * x1_data[x] + sinA * x2_data[x],
                       offsetY - sinA * x1_data[x] + cosA * x2_data[x]])

            # Find points for rectangle
            p1 = (-sinA * h + offset[0], -cosA * h + offset[1])
            p3 = (-cosA * w + offset[0], sinA * w + offset[1])
            center = (0.5 * (p1[0] + p3[0]), 0.5 * (p1[1] + p3[1]))
            detections.append((center, (w, h), -1 * angle * 180.0 / math.pi))
            confidences.append(float(score))

    # Return detections and confidences
    return [detections, confidences]
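
# Note on the decoding above (see the EAST paper linked in the argument parser
# description): the score and geometry maps come out at 1/4 of the input
# resolution, which is why feature-map coordinates are scaled by 4. For each
# location, four geometry channels hold distances from that point to the edges
# of the rotated text box (the first and third channels sum to the box height,
# the second and fourth to its width) and the fifth channel holds the rotation
# angle, from which a rotated rectangle (center, size, angle) is reconstructed.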


def main():
    # Read and store arguments
    confThreshold = args.thr
    nmsThreshold = args.nms
    inpWidth = args.width
    inpHeight = args.height
    model = args.model

    # Load network
    net = cv.dnn.readNet(model)

    # Create a new named window
    kWinName = "EAST: An Efficient and Accurate Scene Text Detector"
    cv.namedWindow(kWinName, cv.WINDOW_NORMAL)
    outNames = []
    outNames.append("feature_fusion/Conv_7/Sigmoid")
    outNames.append("feature_fusion/concat_3")

    # Open a video file or an image file or a camera stream
    cap = cv.VideoCapture(args.input if args.input else 0)

    while cv.waitKey(1) < 0:
        # Read frame
        hasFrame, frame = cap.read()
        if not hasFrame:
            cv.waitKey()
            break

        # Get frame height and width
        height_ = frame.shape[0]
        width_ = frame.shape[1]
        rW = width_ / float(inpWidth)
        rH = height_ / float(inpHeight)

        # Create a 4D blob from frame
        blob = cv.dnn.blobFromImage(frame, 1.0, (inpWidth, inpHeight), (123.68, 116.78, 103.94), True, False)

        # Run the model
        net.setInput(blob)
        outs = net.forward(outNames)
        t, _ = net.getPerfProfile()
        label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())

        # Get scores and geometry
        scores = outs[0]
        geometry = outs[1]
        [boxes, confidences] = decode(scores, geometry, confThreshold)

        # Apply NMS. Depending on the OpenCV version, the returned indices are
        # either a flat array or an Nx1 array, so flatten before indexing.
        indices = cv.dnn.NMSBoxesRotated(boxes, confidences, confThreshold, nmsThreshold)
        for i in np.array(indices).flatten():
            # Get 4 corners of the rotated rect
            vertices = cv.boxPoints(boxes[int(i)])

            # Scale the bounding box coordinates based on the respective ratios
            for j in range(4):
                vertices[j][0] *= rW
                vertices[j][1] *= rH

            # Draw the box; cv.line expects integer point coordinates
            for j in range(4):
                p1 = (int(vertices[j][0]), int(vertices[j][1]))
                p2 = (int(vertices[(j + 1) % 4][0]), int(vertices[(j + 1) % 4][1]))
                cv.line(frame, p1, p2, (0, 255, 0), 1)

        # Put efficiency information
        cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

        # Display the frame
        cv.imshow(kWinName, frame)


if __name__ == "__main__":
    main()
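
# Example invocation (the script and model file names below are placeholders;
# substitute your own copy of the script and a frozen EAST .pb graph):
#
#   python text_detection.py --model frozen_east_text_detection.pb \
#       --input sample.jpg --width 320 --height 320 --thr 0.5 --nms 0.4
#
# Omitting --input makes the script capture frames from the default camera instead.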