-
-
Save sayakpaul/24314074d16018c1ce1b7699cc8395ab to your computer and use it in GitHub Desktop.
# Code adapted from: | |
# https://www.pyimagesearch.com/2018/08/20/opencv-text-detection-east-text-detector/ | |
# USAGE | |
# python text_detection_video.py --east east_model_float16.tflite | |
# import the necessary packages | |
from imutils.video import VideoStream | |
from imutils.video import FPS | |
from imutils.object_detection import non_max_suppression | |
import tensorflow as tf | |
import numpy as np | |
import argparse | |
import imutils | |
import time | |
import cv2 | |
def decode_predictions(scores, geometry, min_confidence=None):
    """Convert raw EAST outputs into bounding boxes and confidences.

    Parameters
    ----------
    scores : ndarray, shape (1, 1, numRows, numCols)
        Text/no-text probability map (NCHW, i.e. after transposition).
    geometry : ndarray, shape (1, 5, numRows, numCols)
        Channels 0-3 hold per-cell distances to the box top/right/
        bottom/left edges; channel 4 holds the rotation angle.
    min_confidence : float, optional
        Minimum score for a cell to yield a box.  When None, falls back
        to the module-level CLI value ``args["min_confidence"]`` so
        existing callers keep their original behavior.

    Returns
    -------
    (rects, confidences)
        ``rects`` is a list of (startX, startY, endX, endY) int tuples
        in the model-input coordinate space; ``confidences`` is the
        matching list of float scores.
    """
    if min_confidence is None:
        # backward-compatible fallback to the script's CLI argument
        min_confidence = args["min_confidence"]
    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []
    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the
        # geometrical data used to derive potential bounding box
        # coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]
        # loop over the number of columns
        for x in range(0, numCols):
            # if our score does not have sufficient probability,
            # ignore it
            if scoresData[x] < min_confidence:
                continue
            # compute the offset factor as our resulting feature
            # maps will be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)
            # extract the rotation angle for the prediction and
            # then compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)
            # use the geometry volume to derive the width and height
            # of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]
            # compute both the starting and ending (x, y)-coordinates
            # for the text prediction bounding box: rotate the
            # right/bottom edge distances to find the end corner, then
            # back off by (w, h) for the start corner
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)
            # add the bounding box coordinates and probability score
            # to our respective lists (cast to a plain float so
            # downstream consumers get scalars, not 0-d numpy values)
            rects.append((startX, startY, endX, endY))
            confidences.append(float(scoresData[x]))
    # return a tuple of the bounding boxes and associated confidences
    return (rects, confidences)
def preprocess_image(image, mean):
    """Mean-subtract an image and add a leading batch dimension.

    `image` is cast to float32, `mean` is broadcast-subtracted
    (channel-wise for an HxWx3 frame), and the result is returned
    with shape (1, H, W, C) ready for the TFLite interpreter.
    """
    normalized = image.astype("float32") - mean
    return normalized[np.newaxis, ...]
# Cache one interpreter per model path so repeated calls (once per video
# frame) do not reload the model and reallocate tensors every time — the
# original version rebuilt the interpreter on each call, adding latency.
_INTERPRETERS = {}

def run_inference(image, model_path):
    """Run the TFLite EAST detector on a preprocessed image batch.

    Parameters
    ----------
    image : ndarray, shape (1, H, W, 3), float32
        Output of ``preprocess_image``.
    model_path : str
        Path to the TFLite model file; used as the cache key.

    Returns
    -------
    (scores, geometry)
        Both transposed from NHWC to NCHW for ``decode_predictions``.
        Assumes output 0 is scores and output 1 is geometry, matching
        the original code — TODO confirm against the model's signature.
    """
    # build (or reuse) the interpreter for this model
    interpreter = _INTERPRETERS.get(model_path)
    if interpreter is None:
        interpreter = tf.lite.Interpreter(model_path=model_path)
        interpreter.allocate_tensors()
        _INTERPRETERS[model_path] = interpreter
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    # perform inference and parse the outputs
    interpreter.set_tensor(input_details[0]['index'], image)
    interpreter.invoke()
    # get_tensor() returns copies, so we never hold references into the
    # interpreter's internal buffers across invocations
    scores = interpreter.get_tensor(output_details[0]['index'])
    geometry = interpreter.get_tensor(output_details[1]['index'])
    scores = np.transpose(scores, (0, 3, 1, 2))
    geometry = np.transpose(geometry, (0, 3, 1, 2))
    return (scores, geometry)
# construct the argument parser and parse the arguments into a dict
ap = argparse.ArgumentParser()
ap.add_argument("-east", "--east", type=str, required=True,
    help="path to input TFLite EAST text detector")
ap.add_argument("-v", "--video", type=str,
    help="path to optional input video file")
ap.add_argument("-c", "--min-confidence", type=float, default=0.5,
    help="minimum probability required to inspect a region")
ap.add_argument("-w", "--width", type=int, default=320,
    help="resized image width (should be multiple of 32)")
ap.add_argument("-e", "--height", type=int, default=320,
    help="resized image height (should be multiple of 32)")
# vars() exposes the parsed namespace as a dict; note argparse stores
# "--min-confidence" under the key "min_confidence"
args = vars(ap.parse_args())
# channel-wise means for mean subtraction (stored in reversed channel
# order, matching the BGR frames OpenCV produces)
mean = np.array([103.939, 116.779, 123.68], dtype="float32")
# original frame dimensions, model-input dimensions, and the
# ratios between them (filled in once the first frame arrives)
(W, H) = (None, None)
(newW, newH) = (args["width"], args["height"])
(rW, rH) = (None, None)
# open the requested video file, or fall back to the webcam when no
# --video path was supplied
if args.get("video"):
    vs = cv2.VideoCapture(args["video"])
else:
    print("[INFO] starting video stream...")
    vs = VideoStream(src=0).start()
    # give the camera sensor a moment to warm up
    time.sleep(1.0)
# start the FPS throughput estimator
fps = FPS().start()
# loop over frames from the video stream
while True:
    # grab the current frame; VideoCapture.read() returns a
    # (grabbed, frame) tuple while VideoStream.read() returns the frame
    # directly, hence the [1] indexing only in the file case
    frame = vs.read()
    frame = frame[1] if args.get("video", False) else frame
    # a None frame means we have reached the end of the stream
    if frame is None:
        break
    # resize for display (maintaining aspect ratio) and keep an
    # untouched copy to draw the detections on
    frame = imutils.resize(frame, width=1000)
    orig = frame.copy()
    # on the first frame only, record its dimensions and compute the
    # display-to-model-input scaling ratios
    if W is None or H is None:
        (H, W) = frame.shape[:2]
        rW = W / float(newW)
        rH = H / float(newH)
    # resize the frame to the fixed model input size (this time
    # ignoring aspect ratio), then mean-subtract and batch it
    frame = cv2.resize(frame, (newW, newH))
    frame = preprocess_image(frame, mean)
    # perform inference and parse the outputs
    (scores, geometry) = run_inference(frame, args["east"])
    # decode the predictions, then apply non-maxima suppression to
    # suppress weak, overlapping bounding boxes
    (rects, confidences) = decode_predictions(scores, geometry)
    boxes = non_max_suppression(np.array(rects), probs=confidences)
    # loop over the surviving bounding boxes
    for (startX, startY, endX, endY) in boxes:
        # scale the bounding box coordinates from model-input space
        # back to display space using the ratios computed above
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)
        # draw the bounding box on the frame
        cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)
    # update the FPS counter
    fps.update()
    # show the output frame and poll for a keypress
    cv2.imshow("Text Detection", orig)
    key = cv2.waitKey(1) & 0xFF
    # if the `q` key was pressed, break from the loop
    if key == ord("q"):
        break
# stop the timer and display FPS information
fps.stop()
# (fixed typo in the printed message: "elasped" -> "elapsed")
print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))
# release the source: a webcam VideoStream is stopped with .stop(),
# while a file-backed VideoCapture is released with .release()
if not args.get("video", False):
    vs.stop()
else:
    vs.release()
# close all windows
cv2.destroyAllWindows()
@khanhlvg is there a better way to load the TFLite model with the Python interpreter than what I have done here?
If you see after parsing each frame in the video, I am invoking the interpreter freshly which is probably introducing some added latency.
@sayakpaul You can reuse the interpreter instance and only run L86-L94 on each frame. It will save you some time, but I suspect that it will not be a huge difference.
@sayakpaul maybe try allocating the tensors on every inference?
@khanhlvg I modified the inference utility like this -
def run_inference(image):
# perform inference and parse the outputs
interpreter.set_tensor(input_details[0]['index'], image)
interpreter.invoke()
interpreter.allocate_tensors()
scores = interpreter.tensor(
interpreter.get_output_details()[0]['index'])()
geometry = interpreter.tensor(
interpreter.get_output_details()[1]['index'])()
scores = np.transpose(scores, (0, 3, 1, 2))
geometry = np.transpose(geometry, (0, 3, 1, 2))
return (scores, geometry)
Loading the model like so at the beginning (also allocating tensors here for the very first inference):
interpreter = tf.lite.Interpreter(model_path=args["east"])
input_details = interpreter.get_input_details()
interpreter.allocate_tensors()
It still results in the same error.
Here is a snippet on how to reuse the interpreter. The point is to not keep a reference to the tensor as described here
@khanhlvg were you referring to this one?
interpreter.allocate_tensors()
input = interpreter.tensor(interpreter.get_input_details()[0]["index"])
output = interpreter.tensor(interpreter.get_output_details()[0]["index"])
for i in range(10):
input().fill(3.)
interpreter.invoke()
print("inference %s" % output())
@khanhlvg here's how I was able to do it: https://gist.github.com/sayakpaul/3db5adf9be025eda70d8053183103ecd.
You were right, though — the FPS did not improve much. Still, I think it's good to know about this possibility.
Results on my humble MacBook Air (13-inch, 2017) (Processor: 1.8 GHz Intel Core i5) (Memory: 8 GB 1600 MHz DDR3):
TensorFlow version: 2.3.0
Model: float16
Demo available here.