danslinky/4rxehd0d9sx9cpjz.jpg

## 4rxehd0d9sx9cpjz.jpg

      
    Raw
  

              4rxehd0d9sx9cpjz.jpg
            
          
## README.md

      
    Raw
  

              README.md
            
          
    python extract text from image

Etsy has a problem with scam accounts that pretend to be Etsy Support in order to phish users. They're easy to spot: they have "Etsy" in their profile picture.
Users should never interact with these accounts. Legitimate messages from Etsy appear under the "From Etsy" tab.
I think Etsy could be more proactive about detecting them, so to make a point I've written some squiffy Python that looks for words such as "etsy" or "support" in the images.
It tries textract, easyocr and pytesseract in turn, and flags the image as suspicious at the first successful match.
I only have a small dataset, so it would be better if Etsy ran it themselves.

  
## Aquarius0174.jpg

      
    Raw
  

              Aquarius0174.jpg
            
          
## detect.py
import pytesseract
# import time
import requests
import os
from bs4 import BeautifulSoup
from PIL import Image, ImageEnhance, ImageFilter
#import qrtools
from pyzbar.pyzbar import decode
# import cv2
import textract
import io
import easyocr

# Keywords that mark decoded text as suspicious. They are lower-case because
# the text is lower-cased before it is searched for them.
phrases = [
    'etsy',
    'support',
    'billing',
    'account',
]

# Etsy profile names. Not referenced by the live code visible in this file;
# only the commented-out profile downloader takes a profile name.
profiles = [
    'Aquarius0174',  # still an active account 18/3/24
    '4rxehd0d9sx9cpjz',
    'ygxanpmoi9bll6dm',
    'dpiqol46u7qv8s6h',
]

# demo of checking qrcodes sent to users
qrcodes = ['pics/qrcode.jpg', 'pics/qrcode2.jpg']
def read_qrcode_data(image):
    """Decode the barcodes / QR codes found in an image file.

    Args:
        image: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        The list returned by ``pyzbar.pyzbar.decode``: one entry per symbol
        found, empty when nothing could be decoded.

    Raises:
        OSError: Propagated from ``PIL.Image.open`` when the file is missing
            or is not a readable image.
    """
    # Open the image as a context manager so the underlying file handle is
    # closed as soon as decoding finishes instead of lingering until garbage
    # collection.
    with Image.open(image) as img:
        return decode(img)
for qr_path in qrcodes:
    try:
        symbols = read_qrcode_data(qr_path)
    except OSError as e:
        # A missing or unreadable demo picture should not stop the rest of
        # the script from running.
        print(f'ERROR: could not read {qr_path}: {e}')
        continue

    # A picture may hold zero, one or several codes. Check every one of them
    # rather than indexing [0], which raises IndexError on an empty result.
    for symbol in symbols:
        # errors='replace' keeps a payload that is not valid UTF-8 from
        # aborting the check.
        text = symbol.data.decode('utf-8', errors='replace').lower()
        if any(phrase in text for phrase in phrases):
            print(f'WARN: {qr_path} contains phrase: {text}')

# extract text from image
_EASYOCR_READER = None


def _get_easyocr_reader():
    """Build the easyocr reader on first use and reuse it on later calls."""
    global _EASYOCR_READER
    if _EASYOCR_READER is None:
        # I don't have a proper GPU...
        _EASYOCR_READER = easyocr.Reader(['en'], gpu=False, verbose=False)
    return _EASYOCR_READER


def read_image_text(image):
    """Extract text from an image file, trying three OCR backends in turn.

    The backends are tried in the order textract, easyocr, pytesseract. A
    backend that raises is reported and skipped, so the remaining backends
    still get their chance.

    Args:
        image: Path of the image file to read.

    Returns:
        str: The text of the first backend that produced a usable result:

        * textract, only if its text passes ``check_text``;
        * easyocr, if it recognised any text at all;
        * pytesseract, the first page-segmentation mode whose text passes
          ``check_text``, otherwise the first non-blank text any mode gave.

        ``""`` when nothing was recognised. The result is always a string,
        so it can be handed straight to ``check_text``.
    """
    # textract (hit and miss, but fast)
    try:
        text = textract.process(image).decode('utf-8')
    except Exception as e:
        # Best effort: the backends raise assorted error types, and a failure
        # here must not prevent the slower backends from running.
        print(f'INFO: textract failed on {image}: {e}')
    else:
        if check_text(text):
            return text

    # easy ocr (gets them all tbh, but slow)
    try:
        result = _get_easyocr_reader().readtext(image)
    except Exception as e:
        print(f'INFO: easyocr failed on {image}: {e}')
    else:
        # Each result item carries the recognised string at index 1.
        text = ' '.join(item[1] for item in result)
        if text:
            return text

    # pytesseract (reasonably fast, but misses some easy ones)
    fallback = ""
    for psm in range(14):
        try:
            print(f'INFO: Running psm {psm} {image}')
            text = pytesseract.image_to_string(image, config=f'--psm {psm}')
        except Exception:
            # Not every page-segmentation mode works on every image; move on
            # to the next mode instead of giving up.
            continue
        if check_text(text):
            return text
        if not fallback and text.strip():
            fallback = text

    return fallback


# slurp up images in pics directory
def get_local_images(dir):
    """Return the names of the image files directly inside a directory.

    Args:
        dir: Directory to list; sub-directories are not searched. The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        list[str]: Bare file names (no directory prefix), sorted so the
        processing order is the same on every run. A name qualifies when it
        is a regular file whose extension, compared case-insensitively, is
        one of jpg, jpeg, png, gif, bmp or webp.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``dir`` cannot be read.
    """
    # Include the dot so that a name such as "myjpg" is not mistaken for an
    # image; str.endswith accepts a tuple of alternatives.
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp')
    return sorted(
        name
        for name in os.listdir(dir)
        if name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(dir, name))
    )

def generate_test_images(image):
    """Write four JPEG variants of an image into ``tests/`` for OCR.

    The variants are ``tests/original.jpg``, ``tests/grayscale.jpg``,
    ``tests/enhanced.jpg`` (contrast doubled) and ``tests/filtered.jpg``
    (median filter). Existing files with those names are overwritten.

    Args:
        image: Path of the source image, in any format Pillow can open.

    Returns:
        None. Every readable input, PNG included, is processed, so the files
        in ``tests/`` always belong to the image passed in.

    Raises:
        OSError: Propagated from Pillow when the file is missing or cannot
            be read as an image.
    """
    # The output directory may not exist on a fresh checkout.
    os.makedirs('tests', exist_ok=True)

    with Image.open(image) as src:
        # JPEG cannot store alpha or palette data, so normalise every input
        # (WEBP, PNG, GIF, ...) to RGB first. convert() returns a fully
        # loaded copy, so the source file can be closed right away.
        img = src.convert('RGB')

    # Save the original image
    img.save('tests/original.jpg')

    # Convert the image to grayscale and save it
    img.convert('L').save('tests/grayscale.jpg')

    # Enhance the contrast of the image and save it
    ImageEnhance.Contrast(img).enhance(2).save('tests/enhanced.jpg')

    # Apply a median filter to the image and save it
    img.filter(ImageFilter.MedianFilter()).save('tests/filtered.jpg')


def check_text(text, keywords=None):
    """Report whether a piece of text mentions any suspicious keyword.

    Args:
        text: Text to inspect. Falsy values (``""``, ``None``, ``False``)
            are treated as "no text" and never match.
        keywords: Optional iterable of substrings to look for. Defaults to
            the module-level ``phrases`` list.

    Returns:
        bool: True if any keyword occurs in ``text``, compared
        case-insensitively; False otherwise.
    """
    # Guard first: an OCR step that found nothing may hand over a non-string
    # falsy value, which has no .replace()/.lower().
    if not text:
        return False
    if keywords is None:
        keywords = phrases
    # Join lines so a multi-word keyword still matches across a line break.
    normalised = text.replace('\n', ' ').lower()
    return any(keyword.lower() in normalised for keyword in keywords)

# image = 'pics/pic1.webp'
images = get_local_images("pics")
# The variants written by generate_test_images(); the list is the same for
# every picture, so build it once instead of on each iteration.
image_types = ['original', 'grayscale', 'enhanced', 'filtered']
variant_paths = [f'tests/{img_type}.jpg' for img_type in image_types]

# NOTE: the loop variable keeps the name `i`. It becomes a module-level
# global while the loop runs, and other code in this file may read it.
for i in images:
    # A False result means no variants were written for this picture. Skip
    # it, otherwise the files left in tests/ by the previous picture would be
    # scanned and blamed on this one.
    if generate_test_images(os.path.join('pics', i)) is False:
        print(f'INFO: Skipping {i}: no test images were generated.')
        continue
    for t in variant_paths:
        text = read_image_text(t)
        # The OCR step may yield a falsy value when nothing was recognised.
        if text and check_text(text):
            print(f'WARN: {i} is suspicious.')
            break


# # get profile images from etsy page
# def get_profile_image(profile):
#     # set user agent to that of curl, which WAF appears to allow
#     headers = {
#         'User-Agent': 'curl/7.64.1'
#     }
#     resp = requests.get(f'https://www.etsy.com/people/{profile}', headers=headers)
#     if resp.status_code != 200:
#         print(f'ERROR: Got a {resp.status_code} response')
#         return
#     html = resp.text

#     soup = BeautifulSoup(html, features="html.parser")

#     # get the src from <img> called data-user-avatar-img
#     img = soup.find('img', {'data-user-avatar-img': True})
#     if img:
#         img = img['src']
#         print("INFO: Downloading image from", img)
#         image = requests.get(img)
#         with open(f'{profile}.jpg', 'wb') as f:
#             f.write(image.content)
#         return f'{profile}.jpg'
#     else:
#         print('ERROR: Profile pic not found')
#         return

## instructions.jpg

      
    Raw
  

              instructions.jpg
            
          
## pic1.webp

      
    Raw
  

              pic1.webp
            
          
            View raw
        
    
## profile.jpg

      
    Raw
  

              profile.jpg
            
          
## qrcode.jpg

      
    Raw
  

              qrcode.jpg
            
          
## qrcode2.jpg

      
    Raw
  

              qrcode2.jpg
            
          
## requirements.txt
argcomplete==1.10.3
attrs==23.2.0
beautifulsoup4==4.8.2
bytecode==0.15.1
cattrs==23.2.3
certifi==2024.2.2
chardet==3.0.4
charset-normalizer==3.3.2
compressed_rtf==1.0.6
ddsketch==2.0.4
ddtrace==2.7.1
Deprecated==1.2.14
docx2txt==0.8
easyocr==1.7.1
ebcdic==1.1.1
envier==0.5.1
exceptiongroup==1.2.0
extract-msg==0.28.7
filelock==3.13.1
fsspec==2024.3.1
idna==3.6
imageio==2.34.0
IMAPClient==2.1.0
importlib-metadata==6.11.0
Jinja2==3.1.3
lazy_loader==0.3
lxml==5.1.0
MarkupSafe==2.1.5
mpmath==1.3.0
networkx==3.2.1
ninja==1.11.1.1
numpy==1.26.4
olefile==0.47
opencv-python==4.9.0.80
opencv-python-headless==4.9.0.80
opentelemetry-api==1.23.0
packaging==24.0
pdfminer.six==20191110
pillow==10.2.0
pyclipper==1.3.0.post5
pycryptodome==3.20.0
PyQRCode==1.2.1
pytesseract==0.3.10
python-bidi==0.4.2
python-pptx==0.6.23
PyYAML==6.0.1
pyzbar==0.1.9
requests==2.31.0
scikit-image==0.22.0
scipy==1.12.0
shapely==2.0.3
six==1.16.0
sortedcontainers==2.4.0
soupsieve==2.5
SpeechRecognition==3.8.1
sqlparse==0.4.4
sympy==1.12
textract==1.6.5
tifffile==2024.2.12
torch==2.2.1
torchvision==0.17.1
typing_extensions==4.10.0
tzlocal==5.2
urllib3==2.2.1
wrapt==1.16.0
xlrd==1.2.0
XlsxWriter==3.2.0
xmltodict==0.13.0
zipp==3.17.0
	import pytesseract
	# import time
	import requests
	import os
	from bs4 import BeautifulSoup
	from PIL import Image, ImageEnhance, ImageFilter
	#import qrtools
	from pyzbar.pyzbar import decode
	# import cv2
	import textract
	import io
	import easyocr

	# Keywords that mark decoded text as suspicious. They are lower-case because
	# the text is lower-cased before it is searched for them.
	phrases = [
	    'etsy',
	    'support',
	    'billing',
	    'account',
	]

	# Etsy profile names. Not referenced by the live code visible in this file;
	# only the commented-out profile downloader takes a profile name.
	profiles = [
	    'Aquarius0174',  # still an active account 18/3/24
	    '4rxehd0d9sx9cpjz',
	    'ygxanpmoi9bll6dm',
	    'dpiqol46u7qv8s6h',
	]

	# demo of checking qrcodes sent to users
	qrcodes = ['pics/qrcode.jpg', 'pics/qrcode2.jpg']
	def read_qrcode_data(image):
	    """Decode the barcodes / QR codes found in an image file.

	    Args:
	        image: Path (or file object) accepted by ``PIL.Image.open``.

	    Returns:
	        The list returned by ``pyzbar.pyzbar.decode``: one entry per
	        symbol found, empty when nothing could be decoded.

	    Raises:
	        OSError: Propagated from ``PIL.Image.open`` when the file is
	            missing or is not a readable image.
	    """
	    # Open the image as a context manager so the underlying file handle
	    # is closed as soon as decoding finishes. The body is also indented
	    # again so the function is valid Python.
	    with Image.open(image) as img:
	        return decode(img)
	for qr_path in qrcodes:
	    try:
	        symbols = read_qrcode_data(qr_path)
	    except OSError as e:
	        # A missing or unreadable demo picture should not stop the rest
	        # of the script from running.
	        print(f'ERROR: could not read {qr_path}: {e}')
	        continue

	    # A picture may hold zero, one or several codes. Check every one of
	    # them rather than indexing [0], which raises IndexError on an empty
	    # result.
	    for symbol in symbols:
	        # errors='replace' keeps a payload that is not valid UTF-8 from
	        # aborting the check.
	        text = symbol.data.decode('utf-8', errors='replace').lower()
	        if any(phrase in text for phrase in phrases):
	            print(f'WARN: {qr_path} contains phrase: {text}')

	# extract text from image
	_EASYOCR_READER = None


	def _get_easyocr_reader():
	    """Build the easyocr reader on first use and reuse it on later calls."""
	    global _EASYOCR_READER
	    if _EASYOCR_READER is None:
	        # I don't have a proper GPU...
	        _EASYOCR_READER = easyocr.Reader(['en'], gpu=False, verbose=False)
	    return _EASYOCR_READER


	def read_image_text(image):
	    """Extract text from an image file, trying three OCR backends in turn.

	    The backends are tried in the order textract, easyocr, pytesseract. A
	    backend that raises is reported and skipped, so the remaining
	    backends still get their chance.

	    Args:
	        image: Path of the image file to read.

	    Returns:
	        str: The text of the first backend that produced a usable result:

	        * textract, only if its text passes ``check_text``;
	        * easyocr, if it recognised any text at all;
	        * pytesseract, the first page-segmentation mode whose text passes
	          ``check_text``, otherwise the first non-blank text any mode gave.

	        ``""`` when nothing was recognised. The result is always a
	        string, so it can be handed straight to ``check_text``.
	    """
	    # textract (hit and miss, but fast)
	    try:
	        text = textract.process(image).decode('utf-8')
	    except Exception as e:
	        # Best effort: the backends raise assorted error types, and a
	        # failure here must not prevent the slower backends from running.
	        print(f'INFO: textract failed on {image}: {e}')
	    else:
	        if check_text(text):
	            return text

	    # easy ocr (gets them all tbh, but slow)
	    try:
	        result = _get_easyocr_reader().readtext(image)
	    except Exception as e:
	        print(f'INFO: easyocr failed on {image}: {e}')
	    else:
	        # Each result item carries the recognised string at index 1.
	        text = ' '.join(item[1] for item in result)
	        if text:
	            return text

	    # pytesseract (reasonably fast, but misses some easy ones)
	    fallback = ""
	    for psm in range(14):
	        try:
	            print(f'INFO: Running psm {psm} {image}')
	            text = pytesseract.image_to_string(image, config=f'--psm {psm}')
	        except Exception:
	            # Not every page-segmentation mode works on every image; move
	            # on to the next mode instead of giving up.
	            continue
	        if check_text(text):
	            return text
	        if not fallback and text.strip():
	            fallback = text

	    return fallback


	# slurp up images in pics directory
	def get_local_images(dir):
	    """Return the names of the image files directly inside a directory.

	    Args:
	        dir: Directory to list; sub-directories are not searched. The name
	            shadows the builtin but is kept so keyword callers keep working.

	    Returns:
	        list[str]: Bare file names (no directory prefix), sorted so the
	        processing order is the same on every run. A name qualifies when
	        it is a regular file whose extension, compared case-insensitively,
	        is one of jpg, jpeg, png, gif, bmp or webp.

	    Raises:
	        OSError: Propagated from ``os.listdir`` when ``dir`` cannot be read.
	    """
	    # Include the dot so that a name such as "myjpg" is not mistaken for
	    # an image; str.endswith accepts a tuple of alternatives.
	    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp')
	    return sorted(
	        name
	        for name in os.listdir(dir)
	        if name.lower().endswith(image_extensions)
	        and os.path.isfile(os.path.join(dir, name))
	    )

	def generate_test_images(image):
	    """Write four JPEG variants of an image into ``tests/`` for OCR.

	    The variants are ``tests/original.jpg``, ``tests/grayscale.jpg``,
	    ``tests/enhanced.jpg`` (contrast doubled) and ``tests/filtered.jpg``
	    (median filter). Existing files with those names are overwritten.

	    Args:
	        image: Path of the source image, in any format Pillow can open.

	    Returns:
	        None. Every readable input, PNG included, is processed, so the
	        files in ``tests/`` always belong to the image passed in.

	    Raises:
	        OSError: Propagated from Pillow when the file is missing or
	            cannot be read as an image.
	    """
	    # The output directory may not exist on a fresh checkout.
	    os.makedirs('tests', exist_ok=True)

	    with Image.open(image) as src:
	        # JPEG cannot store alpha or palette data, so normalise every
	        # input (WEBP, PNG, GIF, ...) to RGB first. convert() returns a
	        # fully loaded copy, so the source file can be closed right away.
	        img = src.convert('RGB')

	    # Save the original image
	    img.save('tests/original.jpg')

	    # Convert the image to grayscale and save it
	    img.convert('L').save('tests/grayscale.jpg')

	    # Enhance the contrast of the image and save it
	    ImageEnhance.Contrast(img).enhance(2).save('tests/enhanced.jpg')

	    # Apply a median filter to the image and save it
	    img.filter(ImageFilter.MedianFilter()).save('tests/filtered.jpg')


	def check_text(text, keywords=None):
	    """Report whether a piece of text mentions any suspicious keyword.

	    Args:
	        text: Text to inspect. Falsy values (``""``, ``None``, ``False``)
	            are treated as "no text" and never match.
	        keywords: Optional iterable of substrings to look for. Defaults to
	            the module-level ``phrases`` list.

	    Returns:
	        bool: True if any keyword occurs in ``text``, compared
	        case-insensitively; False otherwise.
	    """
	    # Guard first: an OCR step that found nothing may hand over a
	    # non-string falsy value, which has no .replace()/.lower().
	    if not text:
	        return False
	    if keywords is None:
	        keywords = phrases
	    # Join lines so a multi-word keyword still matches across a line break.
	    normalised = text.replace('\n', ' ').lower()
	    return any(keyword.lower() in normalised for keyword in keywords)

	# image = 'pics/pic1.webp'
	images = get_local_images("pics")
	# The variants written by generate_test_images(); the list is the same for
	# every picture, so build it once instead of on each iteration.
	image_types = ['original', 'grayscale', 'enhanced', 'filtered']
	variant_paths = [f'tests/{img_type}.jpg' for img_type in image_types]

	# NOTE: the loop variable keeps the name `i`. It becomes a module-level
	# global while the loop runs, and other code in this file may read it.
	for i in images:
	    # A False result means no variants were written for this picture.
	    # Skip it, otherwise the files left in tests/ by the previous picture
	    # would be scanned and blamed on this one.
	    if generate_test_images(os.path.join('pics', i)) is False:
	        print(f'INFO: Skipping {i}: no test images were generated.')
	        continue
	    for t in variant_paths:
	        text = read_image_text(t)
	        # The OCR step may yield a falsy value when nothing was recognised.
	        if text and check_text(text):
	            print(f'WARN: {i} is suspicious.')
	            break




	# # get profile images from etsy page
	# def get_profile_image(profile):
	# # set user agent to that of curl, which WAF appears to allow
	# headers = {
	# 'User-Agent': 'curl/7.64.1'
	# }
	# resp = requests.get(f'https://www.etsy.com/people/{profile}', headers=headers)
	# if resp.status_code != 200:
	# print(f'ERROR: Got a {resp.status_code} response')
	# return
	# html = resp.text

	# soup = BeautifulSoup(html, features="html.parser")

	# # get the src from <img> called data-user-avatar-img
	# img = soup.find('img', {'data-user-avatar-img': True})
	# if img:
	# img = img['src']
	# print("INFO: Downloading image from", img)
	# image = requests.get(img)
	# with open(f'{profile}.jpg', 'wb') as f:
	# f.write(image.content)
	# return f'{profile}.jpg'
	# else:
	# print('ERROR: Profile pic not found')
	# return
	argcomplete==1.10.3
	attrs==23.2.0
	beautifulsoup4==4.8.2
	bytecode==0.15.1
	cattrs==23.2.3
	certifi==2024.2.2
	chardet==3.0.4
	charset-normalizer==3.3.2
	compressed_rtf==1.0.6
	ddsketch==2.0.4
	ddtrace==2.7.1
	Deprecated==1.2.14
	docx2txt==0.8
	easyocr==1.7.1
	ebcdic==1.1.1
	envier==0.5.1
	exceptiongroup==1.2.0
	extract-msg==0.28.7
	filelock==3.13.1
	fsspec==2024.3.1
	idna==3.6
	imageio==2.34.0
	IMAPClient==2.1.0
	importlib-metadata==6.11.0
	Jinja2==3.1.3
	lazy_loader==0.3
	lxml==5.1.0
	MarkupSafe==2.1.5
	mpmath==1.3.0
	networkx==3.2.1
	ninja==1.11.1.1
	numpy==1.26.4
	olefile==0.47
	opencv-python==4.9.0.80
	opencv-python-headless==4.9.0.80
	opentelemetry-api==1.23.0
	packaging==24.0
	pdfminer.six==20191110
	pillow==10.2.0
	pyclipper==1.3.0.post5
	pycryptodome==3.20.0
	PyQRCode==1.2.1
	pytesseract==0.3.10
	python-bidi==0.4.2
	python-pptx==0.6.23
	PyYAML==6.0.1
	pyzbar==0.1.9
	requests==2.31.0
	scikit-image==0.22.0
	scipy==1.12.0
	shapely==2.0.3
	six==1.16.0
	sortedcontainers==2.4.0
	soupsieve==2.5
	SpeechRecognition==3.8.1
	sqlparse==0.4.4
	sympy==1.12
	textract==1.6.5
	tifffile==2024.2.12
	torch==2.2.1
	torchvision==0.17.1
	typing_extensions==4.10.0
	tzlocal==5.2
	urllib3==2.2.1
	wrapt==1.16.0
	xlrd==1.2.0
	XlsxWriter==3.2.0
	xmltodict==0.13.0
	zipp==3.17.0