Last active
March 23, 2023 23:15
-
-
Save sayakpaul/24314074d16018c1ce1b7699cc8395ab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Code adapted from:
# https://www.pyimagesearch.com/2018/08/20/opencv-text-detection-east-text-detector/
# USAGE
# python text_detection_video.py --east east_model_float16.tflite

# import the necessary packages
from imutils.video import VideoStream
from imutils.video import FPS
from imutils.object_detection import non_max_suppression
import tensorflow as tf
import numpy as np
import argparse
import imutils
import time
import cv2
def decode_predictions(scores, geometry, min_confidence=None):
    """Decode raw EAST model outputs into bounding boxes and confidences.

    Parameters
    ----------
    scores : np.ndarray
        Text/no-text probability map of shape (1, 1, numRows, numCols).
    geometry : np.ndarray
        Box geometry map of shape (1, 5, numRows, numCols): four box
        distances (top, right, bottom, left) followed by a rotation angle
        per feature-map cell.
    min_confidence : float, optional
        Minimum score required to keep a detection. Defaults to the
        command-line value in the module-level ``args`` dict, preserving
        the original behavior.

    Returns
    -------
    tuple
        ``(rects, confidences)`` where ``rects`` is a list of
        ``(startX, startY, endX, endY)`` integer boxes in feature-map
        pixel coordinates and ``confidences`` the matching scores.
    """
    # fall back to the CLI threshold for backward compatibility
    if min_confidence is None:
        min_confidence = args["min_confidence"]
    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []
    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the geometrical
        # data used to derive potential bounding box coordinates
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]
        # loop over the number of columns
        for x in range(0, numCols):
            # ignore cells without sufficient probability
            if scoresData[x] < min_confidence:
                continue
            # the feature maps are 4x smaller than the network input,
            # so scale the cell index up to input coordinates
            (offsetX, offsetY) = (x * 4.0, y * 4.0)
            # rotation angle of the predicted box, and its sin/cos
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)
            # box height/width from the four distance channels
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]
            # compute the ending corner from the rotated offsets, then
            # derive the starting corner from the box size
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)
            # record the bounding box and its probability score
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])
    # return a tuple of the bounding boxes and associated confidences
    return (rects, confidences)
def preprocess_image(image, mean):
    """Prepare a frame for EAST inference.

    Casts the image to float32, subtracts the per-channel mean, and adds
    a leading batch dimension.

    Parameters
    ----------
    image : np.ndarray
        HxWxC image (any integer or float dtype).
    mean : np.ndarray
        Per-channel mean to subtract (broadcast over H and W).

    Returns
    -------
    np.ndarray
        Float32 array of shape (1, H, W, C). The input array is not
        modified (``astype`` makes a copy before the in-place subtract).
    """
    image = image.astype("float32")
    image -= mean
    image = np.expand_dims(image, 0)
    return image
def run_inference(image, model_path):
    """Run the TFLite EAST detector on a preprocessed image.

    Parameters
    ----------
    image : np.ndarray
        Float32 batch of shape (1, H, W, C) as produced by
        ``preprocess_image``.
    model_path : str
        Path to the TFLite EAST model file.

    Returns
    -------
    tuple
        ``(scores, geometry)`` transposed to NCHW layout, matching what
        ``decode_predictions`` expects.

    Notes
    -----
    A new interpreter is constructed on every call, which is expensive
    when invoked per frame — consider constructing it once and reusing
    it across calls.
    """
    # initialize the TFLite interpreter and allocate its tensors
    interpreter = tf.lite.Interpreter(model_path=model_path)
    input_details = interpreter.get_input_details()
    interpreter.allocate_tensors()
    # perform inference
    interpreter.set_tensor(input_details[0]['index'], image)
    interpreter.invoke()
    # fetch both output tensors (look up the details list only once)
    output_details = interpreter.get_output_details()
    scores = interpreter.tensor(output_details[0]['index'])()
    geometry = interpreter.tensor(output_details[1]['index'])()
    # convert NHWC -> NCHW so decode_predictions can index channels
    scores = np.transpose(scores, (0, 3, 1, 2))
    geometry = np.transpose(geometry, (0, 3, 1, 2))
    return (scores, geometry)
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-east", "--east", type=str, required=True,
    help="path to input TFLite EAST text detector")
ap.add_argument("-v", "--video", type=str,
    help="path to optional input video file")
ap.add_argument("-c", "--min-confidence", type=float, default=0.5,
    help="minimum probability required to inspect a region")
ap.add_argument("-w", "--width", type=int, default=320,
    help="resized image width (should be multiple of 32)")
ap.add_argument("-e", "--height", type=int, default=320,
    help="resized image height (should be multiple of 32)")
args = vars(ap.parse_args())

# channel-wise means for mean subtraction (reversed to match OpenCV's
# BGR channel order)
mean = np.array([123.68, 116.779, 103.939][::-1], dtype="float32")

# original frame dimensions, network input dimensions, and the ratio
# between them (computed once the first frame arrives)
(W, H) = (None, None)
(newW, newH) = (args["width"], args["height"])
(rW, rH) = (None, None)

# if a video path was not supplied, grab the reference to the webcam
if not args.get("video", False):
    print("[INFO] starting video stream...")
    vs = VideoStream(src=0).start()
    time.sleep(1.0)
# otherwise, grab a reference to the video file
else:
    vs = cv2.VideoCapture(args["video"])

# start the FPS throughput estimator
fps = FPS().start()

# loop over frames from the video stream
while True:
    # grab the current frame; cv2.VideoCapture.read() returns a
    # (grabbed, frame) tuple, VideoStream.read() returns the frame alone
    frame = vs.read()
    frame = frame[1] if args.get("video", False) else frame
    # a None frame means the end of the stream was reached
    if frame is None:
        break
    # resize the display frame, maintaining the aspect ratio
    frame = imutils.resize(frame, width=1000)
    orig = frame.copy()
    # compute the old-to-new dimension ratios once
    if W is None or H is None:
        (H, W) = frame.shape[:2]
        rW = W / float(newW)
        rH = H / float(newH)
    # resize to the network input size (ignoring aspect ratio) and
    # preprocess for inference
    frame = cv2.resize(frame, (newW, newH))
    frame = preprocess_image(frame, mean)
    # perform inference and parse the outputs
    # NOTE(review): run_inference builds a fresh interpreter per frame,
    # which dominates latency — reuse one interpreter if FPS matters
    (scores, geometry) = run_inference(frame, args["east"])
    # decode the predictions, then apply non-maxima suppression to
    # suppress weak, overlapping bounding boxes
    (rects, confidences) = decode_predictions(scores, geometry)
    boxes = non_max_suppression(np.array(rects), probs=confidences)
    # draw each surviving box, scaled back to display coordinates
    for (startX, startY, endX, endY) in boxes:
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)
        cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)
    # update the FPS counter
    fps.update()
    # show the output frame and check for a quit keypress
    cv2.imshow("Text Detection", orig)
    key = cv2.waitKey(1) & 0xFF
    if key == ord("q"):
        break

# stop the timer and display FPS information
fps.stop()
print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

# release the webcam or the file pointer, then close all windows
if not args.get("video", False):
    vs.stop()
else:
    vs.release()
cv2.destroyAllWindows()
@sayakpaul maybe try calling allocate_tensors() before every inference?
@khanhlvg I modified the inference utility like this -
def run_inference(image):
    """Run inference using the module-level interpreter and input_details.

    NOTE(review): quoted from the gist discussion. Relies on a globally
    constructed ``interpreter`` and ``input_details`` (see the loading
    snippet below in the thread).
    """
    # perform inference and parse the outputs
    interpreter.set_tensor(input_details[0]['index'], image)
    interpreter.invoke()
    # NOTE(review): allocate_tensors() is called AFTER invoke() here,
    # which looks out of order — allocation normally precedes set_tensor;
    # the thread reports this still raised the same error
    interpreter.allocate_tensors()
    scores = interpreter.tensor(
        interpreter.get_output_details()[0]['index'])()
    geometry = interpreter.tensor(
        interpreter.get_output_details()[1]['index'])()
    # convert NHWC -> NCHW for downstream decoding
    scores = np.transpose(scores, (0, 3, 1, 2))
    geometry = np.transpose(geometry, (0, 3, 1, 2))
    return (scores, geometry)
Loading the model like so at the beginning (also allocating tensors here for the very first inference):
# NOTE(review): quoted from the discussion — the model is loaded once at
# startup and tensors are allocated here for the very first inference
interpreter = tf.lite.Interpreter(model_path=args["east"])
input_details = interpreter.get_input_details()
interpreter.allocate_tensors()
It still results in the same error.
Here is a snippet showing how to reuse the interpreter. The point is not to keep a long-lived reference to the tensor, as described here.
@khanhlvg were you referring to this one?
# NOTE(review): quoted snippet — reuse one interpreter across invocations;
# re-read the tensors through interpreter.tensor(...) each iteration rather
# than holding onto the returned array (its memory can be invalidated)
interpreter.allocate_tensors()
# `input` shadows the builtin; kept as quoted in the original comment
input = interpreter.tensor(interpreter.get_input_details()[0]["index"])
output = interpreter.tensor(interpreter.get_output_details()[0]["index"])
for i in range(10):
    input().fill(3.)
    interpreter.invoke()
    print("inference %s" % output())
@khanhlvg here's how I was able to do it: https://gist.github.com/sayakpaul/3db5adf9be025eda70d8053183103ecd.
You were right, though — the FPS did not improve much. Still, I think it's good to know about this possibility.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@khanhlvg here's what I did. It resulted in: