# nrjones8/parse_mn_covid_prison_image.py

## parse_mn_covid_prison_image.py
"""
Used image from 4/17/2020 at https://mn.gov/doc/assets/2020.04.17%20public%20COVID%20testing%20chart_tcm1089-425186.JPG
Context: https://twitter.com/seathebass92/status/1251184468066533376?s=20

Output looks like:

>> python parse_mn_covid_prison_image.py
Full string is:
Total 59 16 40 3 37 10 2 o
Parsed 16 confirmed positive cases from mn_prison_covid_table_from_website.jpeg
"""

import cv2
import pytesseract


def parse_num_cases_from_image(image_path):
    """Print and return the "Confirmed Positive" total read from the chart image.

    Crops the strip at the bottom of the image, where the "Total" row is
    expected to sit, runs Tesseract OCR over that strip, and picks the third
    whitespace-separated token of the recognised text.

    Args:
        image_path: Path of the image file to load with OpenCV.

    Returns:
        The third OCR token as a string, exactly as Tesseract produced it.
        It is deliberately not converted to ``int``, because OCR output is
        not guaranteed to be a clean digit string.

    Raises:
        OSError: If OpenCV could not load ``image_path``.
        ValueError: If the OCR text has fewer than three tokens.
    """
    full_img = cv2.imread(image_path)
    if full_img is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising; fail here with a clear message rather than
        # with an AttributeError further down.
        raise OSError('Could not read image from {}'.format(image_path))

    # Thanks to https://stackoverflow.com/a/15589825
    # The last row, which contains totals, is ~30px from the bottom.
    # A negative start index keeps the crop valid (the whole image) even when
    # the image is shorter than 30px, where ``height - 30`` would wrap around.
    bottom_30px = full_img[-30:, :]

    # https://nanonets.com/blog/ocr-with-tesseract/#ocrwithpytesseractandopencv
    # trial and error to get this combination working.
    # --psm 6 - "Assume a single uniform block of text."
    config = '--psm 6'
    img_as_string = pytesseract.image_to_string(bottom_30px, config=config)
    print('Full string is:')
    print(img_as_string)

    # looks like "Total 59 16 ..."
    # and "Confirmed Positive" is the third token (index 2)
    tokens = img_as_string.split()
    if len(tokens) < 3:
        raise ValueError(
            'Expected at least 3 tokens in OCR output, got {!r}'.format(img_as_string)
        )
    num_confirmed_positive = tokens[2]

    print('Parsed {} confirmed positive cases from {}'.format(num_confirmed_positive, image_path))
    return num_confirmed_positive

# Script entry point: OCR the chart image saved alongside this script.
if __name__ == '__main__':
    parse_num_cases_from_image(
        image_path='mn_prison_covid_table_from_website.jpeg',
    )
	"""
	Used image from 4/17/2020 at https://mn.gov/doc/assets/2020.04.17%20public%20COVID%20testing%20chart_tcm1089-425186.JPG
	Context: https://twitter.com/seathebass92/status/1251184468066533376?s=20

	Output looks like:

	>> python parse_mn_covid_prison_image.py
	Full string is:
	Total 59 16 40 3 37 10 2 o
	Parsed 16 confirmed positive cases from mn_prison_covid_table_from_website.jpeg
	"""

	import cv2
	import pytesseract


	def parse_num_cases_from_image(image_path):
	    """Print and return the "Confirmed Positive" total read from the chart image.

	    Crops the strip at the bottom of the image, where the "Total" row is
	    expected to sit, runs Tesseract OCR over that strip, and picks the third
	    whitespace-separated token of the recognised text.

	    Args:
	        image_path: Path of the image file to load with OpenCV.

	    Returns:
	        The third OCR token as a string, exactly as Tesseract produced it.
	        It is deliberately not converted to ``int``, because OCR output is
	        not guaranteed to be a clean digit string.

	    Raises:
	        OSError: If OpenCV could not load ``image_path``.
	        ValueError: If the OCR text has fewer than three tokens.
	    """
	    full_img = cv2.imread(image_path)
	    if full_img is None:
	        # cv2.imread reports a missing/unreadable file by returning None
	        # instead of raising; fail here with a clear message rather than
	        # with an AttributeError further down.
	        raise OSError('Could not read image from {}'.format(image_path))

	    # Thanks to https://stackoverflow.com/a/15589825
	    # The last row, which contains totals, is ~30px from the bottom.
	    # A negative start index keeps the crop valid (the whole image) even when
	    # the image is shorter than 30px, where ``height - 30`` would wrap around.
	    bottom_30px = full_img[-30:, :]

	    # https://nanonets.com/blog/ocr-with-tesseract/#ocrwithpytesseractandopencv
	    # trial and error to get this combination working.
	    # --psm 6 - "Assume a single uniform block of text."
	    config = '--psm 6'
	    img_as_string = pytesseract.image_to_string(bottom_30px, config=config)
	    print('Full string is:')
	    print(img_as_string)

	    # looks like "Total 59 16 ..."
	    # and "Confirmed Positive" is the third token (index 2)
	    tokens = img_as_string.split()
	    if len(tokens) < 3:
	        raise ValueError(
	            'Expected at least 3 tokens in OCR output, got {!r}'.format(img_as_string)
	        )
	    num_confirmed_positive = tokens[2]

	    print('Parsed {} confirmed positive cases from {}'.format(num_confirmed_positive, image_path))
	    return num_confirmed_positive

	# Script entry point: OCR the chart image saved alongside this script.
	if __name__ == '__main__':
	    parse_num_cases_from_image('mn_prison_covid_table_from_website.jpeg')