OCRing yara rules
Gist remotephone/caab1cdc6719487247ec7271cce3d98f (last active March 22, 2021)
# Did this with macOS; install tesseract with `brew install tesseract`, and I used a virtual environment.
# It's hit or miss for a lot of these and I'm not very good with this, but this will OCR some yara rules sometimes.
# This works really poorly on sigma rules; it doesn't preserve whitespace well.
# Apparently version 5 of tesseract on macOS will do it, but brew currently installs 4.11, which doesn't?
# People just need to put it in a repo.
# Example to scan: https://archerint.com/what-are-yara-rules/
# Most code from here: https://stackoverflow.com/questions/9480013/image-processing-to-improve-tesseract-ocr-accuracy
# These are various image files I tested against; test against anything you find. Simple, clear text will work better.
# curl -o rule.png https://cdn-cybersecurity.att.com/blog-content/kins_yara.png
# curl -o test.png https://orangecyberdefense.com/global/wp-content/uploads/sites/12/2021/03/campo-loader-4.png
# curl -o rule.png https://i1.wp.com/archerint.com/wp-content/uploads/2020/02/YARA-RULE-BISTROMATH-DHS.png
# curl -o rule2.png https://i2.wp.com/archerint.com/wp-content/uploads/2020/02/YARA-rule-phishing-campaign-FireEye.png
import cv2
import pytesseract
import numpy as np

# I try the simplest version first, then uncomment lines to see what works; mostly trial and error.
img = cv2.imread('rule.png')
# img = cv2.resize(img, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
# img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# kernel = np.ones((1, 1), np.uint8)
# img = cv2.dilate(img, kernel, iterations=1)
# img = cv2.erode(img, kernel, iterations=1)
# img = cv2.threshold(cv2.medianBlur(img, 3), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
output = pytesseract.image_to_string(img)
print(output)
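The commented-out cv2 steps above boil down to grayscale conversion plus binarization. As a NumPy-only sketch of that idea (my own illustration, not the gist's pipeline; a fixed threshold stands in for cv2's automatic Otsu threshold):

```python
import numpy as np

def preprocess(img, threshold=128):
    """Grayscale + binarize an HxWx3 uint8 image, roughly mirroring
    cv2.cvtColor(..., COLOR_BGR2GRAY) followed by cv2.threshold(...)."""
    # Luminosity grayscale; channel order is BGR, as cv2.imread returns it
    gray = 0.114 * img[..., 0] + 0.587 * img[..., 1] + 0.299 * img[..., 2]
    # Fixed-threshold binarization; cv2's THRESH_OTSU would pick the cutoff automatically
    return np.where(gray >= threshold, 255, 0).astype(np.uint8)

# Tiny demo: a 2x2 "image" with dark pixels on the left, bright on the right
demo = np.array([[[0, 0, 0], [255, 255, 255]],
                 [[10, 10, 10], [250, 250, 250]]], dtype=np.uint8)
print(preprocess(demo))  # dark pixels -> 0, bright pixels -> 255
```

This is only to show what the knobs do; for real images, stick with the cv2 calls above.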
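For the whitespace problem mentioned above, tesseract's page-segmentation mode and its preserve_interword_spaces option can sometimes help; pytesseract passes them through the `config` argument of `image_to_string`. A sketch (the flags are real tesseract options, but whether they fix any given image is trial and error):

```python
# --psm 6 tells tesseract to assume a single uniform block of text;
# preserve_interword_spaces=1 keeps runs of spaces instead of collapsing them.
config = '--psm 6 -c preserve_interword_spaces=1'
# With a loaded image: output = pytesseract.image_to_string(img, config=config)
print(config)
```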