sayakpaul/text_detection_video.py

## text_detection_video.py
# Code adapted from:
# https://www.pyimagesearch.com/2018/08/20/opencv-text-detection-east-text-detector/
# USAGE
# python text_detection_video.py --east east_model_float16.tflite

# import the necessary packages
from imutils.video import VideoStream
from imutils.video import FPS
from imutils.object_detection import non_max_suppression
import tensorflow as tf
import numpy as np
import argparse
import imutils
import time
import cv2

def decode_predictions(scores, geometry):
	# grab the number of rows and columns from the scores volume, then
	# initialize our set of bounding box rectangles and corresponding
	# confidence scores
	(numRows, numCols) = scores.shape[2:4]
	rects = []
	confidences = []

	# loop over the number of rows
	for y in range(0, numRows):
		# extract the scores (probabilities), followed by the
		# geometrical data used to derive potential bounding box
		# coordinates that surround text
		scoresData = scores[0, 0, y]
		xData0 = geometry[0, 0, y]
		xData1 = geometry[0, 1, y]
		xData2 = geometry[0, 2, y]
		xData3 = geometry[0, 3, y]
		anglesData = geometry[0, 4, y]

		# loop over the number of columns
		for x in range(0, numCols):
			# if our score does not have sufficient probability,
			# ignore it
			if scoresData[x] < args["min_confidence"]:
				continue

			# compute the offset factor as our resulting feature
			# maps will be 4x smaller than the input image
			(offsetX, offsetY) = (x * 4.0, y * 4.0)

			# extract the rotation angle for the prediction and
			# then compute the sin and cosine
			angle = anglesData[x]
			cos = np.cos(angle)
			sin = np.sin(angle)

			# use the geometry volume to derive the width and height
			# of the bounding box
			h = xData0[x] + xData2[x]
			w = xData1[x] + xData3[x]

			# compute both the starting and ending (x, y)-coordinates
			# for the text prediction bounding box
			endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
			endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
			startX = int(endX - w)
			startY = int(endY - h)

			# add the bounding box coordinates and probability score
			# to our respective lists
			rects.append((startX, startY, endX, endY))
			confidences.append(scoresData[x])

	# return a tuple of the bounding boxes and associated confidences
	return (rects, confidences)

def preprocess_image(image, mean):
	image = image.astype("float32")
	image -= mean
	image = np.expand_dims(image, 0)
	return image

def run_inference(image, model_path):
	# initialize the TFLite interpreter
	interpreter = tf.lite.Interpreter(model_path=model_path)
	input_details = interpreter.get_input_details()
	interpreter.allocate_tensors()

	# perform inference and parse the outputs
	interpreter.set_tensor(input_details[0]['index'], image)
	interpreter.invoke()
	scores = interpreter.tensor(
    	interpreter.get_output_details()[0]['index'])()
	geometry = interpreter.tensor(
    	interpreter.get_output_details()[1]['index'])()
	scores = np.transpose(scores, (0, 3, 1, 2))
	geometry = np.transpose(geometry, (0, 3, 1, 2))

	return (scores, geometry)

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-east", "--east", type=str, required=True,
	help="path to input TFLite EAST text detector")
ap.add_argument("-v", "--video", type=str,
	help="path to optinal input video file")
ap.add_argument("-c", "--min-confidence", type=float, default=0.5,
	help="minimum probability required to inspect a region")
ap.add_argument("-w", "--width", type=int, default=320,
	help="resized image width (should be multiple of 32)")
ap.add_argument("-e", "--height", type=int, default=320,
	help="resized image height (should be multiple of 32)")
args = vars(ap.parse_args())

# define the channel-wise mean array to perform mean subtraction
mean = np.array([123.68, 116.779, 103.939][::-1], dtype="float32")

# initialize the original frame dimensions, new frame dimensions,
# and ratio between the dimensions
(W, H) = (None, None)
(newW, newH) = (args["width"], args["height"])
(rW, rH) = (None, None)

# if a video path was not supplied, grab the reference to the web cam
if not args.get("video", False):
	print("[INFO] starting video stream...")
	vs = VideoStream(src=0).start()
	time.sleep(1.0)

# otherwise, grab a reference to the video file
else:
	vs = cv2.VideoCapture(args["video"])

# start the FPS throughput estimator
fps = FPS().start()

# loop over frames from the video stream
while True:
	# grab the current frame, then handle if we are using a
	# VideoStream or VideoCapture object
	frame = vs.read()
	frame = frame[1] if args.get("video", False) else frame

	# check to see if we have reached the end of the stream
	if frame is None:
		break

	# resize the frame, maintaining the aspect ratio
	frame = imutils.resize(frame, width=1000)
	orig = frame.copy()

	# if our frame dimensions are None, we still need to compute the
	# ratio of old frame dimensions to new frame dimensions
	if W is None or H is None:
		(H, W) = frame.shape[:2]
		rW = W / float(newW)
		rH = H / float(newH)

	# resize the frame, this time ignoring aspect ratio and preprocess
	# it
	frame = cv2.resize(frame, (newW, newH))
	frame = preprocess_image(frame, mean)

	# perform inference and parse the outputs
	(scores, geometry) = run_inference(frame, args["east"])

	# decode the predictions, then  apply non-maxima suppression to
	# suppress weak, overlapping bounding boxes
	(rects, confidences) = decode_predictions(scores, geometry)
	boxes = non_max_suppression(np.array(rects), probs=confidences)

	# loop over the bounding boxes
	for (startX, startY, endX, endY) in boxes:
		# scale the bounding box coordinates based on the respective
		# ratios
		startX = int(startX * rW)
		startY = int(startY * rH)
		endX = int(endX * rW)
		endY = int(endY * rH)

		# draw the bounding box on the frame
		cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)

	# update the FPS counter
	fps.update()

	# show the output frame
	cv2.imshow("Text Detection", orig)
	key = cv2.waitKey(1) & 0xFF

	# if the `q` key was pressed, break from the loop
	if key == ord("q"):
		break

# stop the timer and display FPS information
fps.stop()
print("[INFO] elasped time: {:.2f}".format(fps.elapsed()))
print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

# if we are using a webcam, release the pointer
if not args.get("video", False):
	vs.stop()

# otherwise, release the file pointer
else:
	vs.release()

# close all windows
cv2.destroyAllWindows()
	# Code adapted from:
	# https://www.pyimagesearch.com/2018/08/20/opencv-text-detection-east-text-detector/
	# USAGE
	# python text_detection_video.py --east east_model_float16.tflite

	# import the necessary packages
	from imutils.video import VideoStream
	from imutils.video import FPS
	from imutils.object_detection import non_max_suppression
	import tensorflow as tf
	import numpy as np
	import argparse
	import imutils
	import time
	import cv2

	def decode_predictions(scores, geometry):
	# grab the number of rows and columns from the scores volume, then
	# initialize our set of bounding box rectangles and corresponding
	# confidence scores
	(numRows, numCols) = scores.shape[2:4]
	rects = []
	confidences = []

	# loop over the number of rows
	for y in range(0, numRows):
	# extract the scores (probabilities), followed by the
	# geometrical data used to derive potential bounding box
	# coordinates that surround text
	scoresData = scores[0, 0, y]
	xData0 = geometry[0, 0, y]
	xData1 = geometry[0, 1, y]
	xData2 = geometry[0, 2, y]
	xData3 = geometry[0, 3, y]
	anglesData = geometry[0, 4, y]

	# loop over the number of columns
	for x in range(0, numCols):
	# if our score does not have sufficient probability,
	# ignore it
	if scoresData[x] < args["min_confidence"]:
	continue

	# compute the offset factor as our resulting feature
	# maps will be 4x smaller than the input image
	(offsetX, offsetY) = (x * 4.0, y * 4.0)

	# extract the rotation angle for the prediction and
	# then compute the sin and cosine
	angle = anglesData[x]
	cos = np.cos(angle)
	sin = np.sin(angle)

	# use the geometry volume to derive the width and height
	# of the bounding box
	h = xData0[x] + xData2[x]
	w = xData1[x] + xData3[x]

	# compute both the starting and ending (x, y)-coordinates
	# for the text prediction bounding box
	endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
	endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
	startX = int(endX - w)
	startY = int(endY - h)

	# add the bounding box coordinates and probability score
	# to our respective lists
	rects.append((startX, startY, endX, endY))
	confidences.append(scoresData[x])

	# return a tuple of the bounding boxes and associated confidences
	return (rects, confidences)

	def preprocess_image(image, mean):
	image = image.astype("float32")
	image -= mean
	image = np.expand_dims(image, 0)
	return image

	def run_inference(image, model_path):
	# initialize the TFLite interpreter
	interpreter = tf.lite.Interpreter(model_path=model_path)
	input_details = interpreter.get_input_details()
	interpreter.allocate_tensors()

	# perform inference and parse the outputs
	interpreter.set_tensor(input_details[0]['index'], image)
	interpreter.invoke()
	scores = interpreter.tensor(
	interpreter.get_output_details()[0]['index'])()
	geometry = interpreter.tensor(
	interpreter.get_output_details()[1]['index'])()
	scores = np.transpose(scores, (0, 3, 1, 2))
	geometry = np.transpose(geometry, (0, 3, 1, 2))

	return (scores, geometry)

	# construct the argument parser and parse the arguments
	ap = argparse.ArgumentParser()
	ap.add_argument("-east", "--east", type=str, required=True,
	help="path to input TFLite EAST text detector")
	ap.add_argument("-v", "--video", type=str,
	help="path to optinal input video file")
	ap.add_argument("-c", "--min-confidence", type=float, default=0.5,
	help="minimum probability required to inspect a region")
	ap.add_argument("-w", "--width", type=int, default=320,
	help="resized image width (should be multiple of 32)")
	ap.add_argument("-e", "--height", type=int, default=320,
	help="resized image height (should be multiple of 32)")
	args = vars(ap.parse_args())

	# define the channel-wise mean array to perform mean subtraction
	mean = np.array([123.68, 116.779, 103.939][::-1], dtype="float32")

	# initialize the original frame dimensions, new frame dimensions,
	# and ratio between the dimensions
	(W, H) = (None, None)
	(newW, newH) = (args["width"], args["height"])
	(rW, rH) = (None, None)

	# if a video path was not supplied, grab the reference to the web cam
	if not args.get("video", False):
	print("[INFO] starting video stream...")
	vs = VideoStream(src=0).start()
	time.sleep(1.0)

	# otherwise, grab a reference to the video file
	else:
	vs = cv2.VideoCapture(args["video"])

	# start the FPS throughput estimator
	fps = FPS().start()

	# loop over frames from the video stream
	while True:
	# grab the current frame, then handle if we are using a
	# VideoStream or VideoCapture object
	frame = vs.read()
	frame = frame[1] if args.get("video", False) else frame

	# check to see if we have reached the end of the stream
	if frame is None:
	break

	# resize the frame, maintaining the aspect ratio
	frame = imutils.resize(frame, width=1000)
	orig = frame.copy()

	# if our frame dimensions are None, we still need to compute the
	# ratio of old frame dimensions to new frame dimensions
	if W is None or H is None:
	(H, W) = frame.shape[:2]
	rW = W / float(newW)
	rH = H / float(newH)

	# resize the frame, this time ignoring aspect ratio and preprocess
	# it
	frame = cv2.resize(frame, (newW, newH))
	frame = preprocess_image(frame, mean)

	# perform inference and parse the outputs
	(scores, geometry) = run_inference(frame, args["east"])

	# decode the predictions, then apply non-maxima suppression to
	# suppress weak, overlapping bounding boxes
	(rects, confidences) = decode_predictions(scores, geometry)
	boxes = non_max_suppression(np.array(rects), probs=confidences)

	# loop over the bounding boxes
	for (startX, startY, endX, endY) in boxes:
	# scale the bounding box coordinates based on the respective
	# ratios
	startX = int(startX * rW)
	startY = int(startY * rH)
	endX = int(endX * rW)
	endY = int(endY * rH)

	# draw the bounding box on the frame
	cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)

	# update the FPS counter
	fps.update()

	# show the output frame
	cv2.imshow("Text Detection", orig)
	key = cv2.waitKey(1) & 0xFF

	# if the `q` key was pressed, break from the loop
	if key == ord("q"):
	break

	# stop the timer and display FPS information
	fps.stop()
	print("[INFO] elasped time: {:.2f}".format(fps.elapsed()))
	print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

	# if we are using a webcam, release the pointer
	if not args.get("video", False):
	vs.stop()

	# otherwise, release the file pointer
	else:
	vs.release()

	# close all windows
	cv2.destroyAllWindows()