Skip to content

Instantly share code, notes, and snippets.

@danslinky
Last active March 19, 2024 12:54
Show Gist options
  • Save danslinky/c3d3f2fa467cf8dfe25c77c33a82fc08 to your computer and use it in GitHub Desktop.
Save danslinky/c3d3f2fa467cf8dfe25c77c33a82fc08 to your computer and use it in GitHub Desktop.
python image OCR to detect Etsy scam accounts

python extract text from image

Etsy has a problem with scam accounts that pretend to be Etsy Support and try to phish users. They're easy to spot: they have "Etsy" in their profile picture.

Users should never interact with these accounts. Legitimate messages from Etsy appear under the "From Etsy" tab.

I think Etsy could be a bit more proactive about detecting them, so to make a point I've written some squiffy Python that looks for phrases such as "etsy" or "support" in the images.

It tries textract, easyocr and pytesseract in turn and stops at the first successful result; an image is flagged when the extracted text contains one of the phrases.

I only have a small dataset, so it would be better if Etsy ran it themselves.

import pytesseract
# import time
import requests
import os
from bs4 import BeautifulSoup
from PIL import Image, ImageEnhance, ImageFilter
#import qrtools
from pyzbar.pyzbar import decode
# import cv2
import textract
import io
import easyocr
# Keywords searched for (in lower case) in decoded QR payloads and OCR'd text.
phrases = [
    'etsy',
    'support',
    'billing',
    'account',
]

# Etsy profile names; not read by the active code visible in this file.
profiles = [
    'Aquarius0174',  # still an active account 18/3/24
    '4rxehd0d9sx9cpjz',
    'ygxanpmoi9bll6dm',
    'dpiqol46u7qv8s6h',
]

# demo of checking qrcodes sent to users
qrcodes = [
    'pics/qrcode.jpg',
    'pics/qrcode2.jpg',
]
def read_qrcode_data(image):
    """Decode the barcode/QR symbols found in an image file.

    Args:
        image: Path (or file object) that ``PIL.Image.open`` can open.

    Returns:
        Whatever ``pyzbar.pyzbar.decode`` returns for the image: a list of
        decoded symbols, which is empty when nothing is detected.

    Raises:
        OSError: If the file is missing or is not a readable image.
    """
    # Use a context manager so the underlying file handle is closed as soon
    # as decoding finishes instead of lingering until garbage collection.
    with Image.open(image) as img:
        return decode(img)
# Scan the demo QR codes and warn when a decoded payload contains a phrase.
for qr_path in qrcodes:
    try:
        symbols = read_qrcode_data(qr_path)
    except OSError as e:
        # A missing or unreadable demo image should not abort the whole run.
        print(f'ERROR: Could not read {qr_path}: {e}')
        continue
    # An image can hold zero or several symbols, so check every one rather
    # than indexing the first (which raised IndexError when none was found).
    for symbol in symbols:
        # errors='replace' keeps a non-UTF-8 payload from raising.
        text = symbol.data.decode('utf-8', errors='replace')
        if any(phrase in text.lower() for phrase in phrases):
            print(f'WARN: {qr_path} contains phrase: {text.lower()}')
# extract text from image
_easyocr_reader = None  # easyocr.Reader created on first use, then reused


def _get_easyocr_reader():
    """Return the shared ``easyocr.Reader``, creating it on the first call."""
    global _easyocr_reader
    if _easyocr_reader is None:
        # I don't have a proper GPU...
        _easyocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
    return _easyocr_reader


def read_image_text(image):
    """Extract text from an image, trying three OCR engines in turn.

    Order: textract, then easyocr, then pytesseract with each ``--psm`` value
    from 0 to 13. A failing engine is reported and skipped so the remaining
    engines still get a chance.

    Args:
        image: Path of the image file to read.

    Returns:
        str: The textract output if it passes ``check_text``; otherwise any
        non-empty easyocr output; otherwise the first pytesseract output that
        passes ``check_text``. If none of those apply, the first non-empty
        text any engine produced, or ``""`` when nothing was read.
    """
    fallback = ""  # first non-empty text seen that did not match a phrase

    # textract (hit and miss, but fast)
    try:
        text = textract.process(image).decode('utf-8')
    except Exception as e:  # third-party OCR backend: treat as best effort
        print(f'INFO: textract failed on {image}: {e}')
    else:
        if check_text(text):
            return text
        fallback = fallback or text

    # easy ocr (gets them all tbh, but slow)
    try:
        result = _get_easyocr_reader().readtext(image)
    except Exception as e:  # best effort, fall through to pytesseract
        print(f'INFO: easyocr failed on {image}: {e}')
    else:
        # item[1] is the text element of each readtext() result entry
        text = ' '.join(item[1] for item in result)
        if text:
            return text

    # pytesseract (reasonably fast, but misses some easy ones)
    for psm in range(14):
        print(f'INFO: Running psm {psm} {image}')
        try:
            text = pytesseract.image_to_string(image, config=f'--psm {psm}')
        except Exception as e:
            # This page segmentation mode failed; try the next one instead
            # of giving up on the image.
            print(f'INFO: pytesseract psm {psm} failed on {image}: {e}')
            continue
        if check_text(text):
            return text
        fallback = fallback or text

    # Always a string, so callers can hand the result to check_text().
    return fallback
# slurp up images in pics directory
def get_local_images(dir):
    """Return the names of the image files directly inside ``dir``.

    A file counts as an image when its name ends, case-insensitively, with
    one of the known image extensions (including the dot). Sub-directories
    are ignored even if their name looks like an image.

    Args:
        dir: Directory to list. (The name shadows the builtin but is kept
            because it is part of the public signature.)

    Returns:
        list[str]: Bare file names (no directory prefix), sorted so the
        processing order is deterministic.

    Raises:
        OSError: If ``dir`` cannot be listed.
    """
    # str.endswith accepts a tuple; the leading dot stops names such as
    # "notjpg" from matching.
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp')
    return sorted(
        name
        for name in os.listdir(dir)
        if name.lower().endswith(image_extensions)
        and os.path.isfile(os.path.join(dir, name))
    )
def generate_test_images(image):
    """Write four JPEG variants of ``image`` into ``tests/`` for OCR.

    The files written are ``tests/original.jpg``, ``tests/grayscale.jpg``,
    ``tests/enhanced.jpg`` (contrast factor 2) and ``tests/filtered.jpg``
    (median filter). Existing files with those names are overwritten.

    The source is converted to RGB first, because JPEG cannot store alpha or
    palette modes and the median filter rejects palette images. This lets
    PNG, GIF and WEBP inputs go through the same path; any transparency is
    discarded by the conversion.

    Args:
        image: Path of the source image.

    Returns:
        True once all four files have been written.

    Raises:
        OSError: If the source cannot be opened or an output cannot be saved.
    """
    # The output directory may not exist on a fresh checkout.
    os.makedirs('tests', exist_ok=True)

    # Close the source file promptly; convert() returns an independent,
    # fully loaded copy that stays usable after the handle is closed.
    with Image.open(image) as source:
        if source.mode != 'RGB':
            print(f'INFO: Converting {image} from mode {source.mode} to RGB')
        img = source.convert('RGB')

    # Save the original image
    img.save('tests/original.jpg')
    # Convert the image to grayscale and save it
    img.convert('L').save('tests/grayscale.jpg')
    # Enhance the contrast of the image and save it
    ImageEnhance.Contrast(img).enhance(2).save('tests/enhanced.jpg')
    # Apply a median filter to the image and save it
    img.filter(ImageFilter.MedianFilter()).save('tests/filtered.jpg')
    return True
def check_text(text):
    """Report whether ``text`` contains any of the watched ``phrases``.

    Newlines are replaced by spaces and the text is lower-cased before a
    plain substring search for each entry of the module-level ``phrases``.

    Args:
        text: Text to inspect. Anything that is not a non-empty string
            (``None``, ``False``, ``""``) is treated as "no match" instead
            of raising.

    Returns:
        bool: True if at least one phrase occurs in the text.
    """
    if not isinstance(text, str) or not text:
        return False
    normalised = text.replace('\n', ' ').lower()
    return any(phrase in normalised for phrase in phrases)
# image = 'pics/pic1.webp'
# Scan every image in pics/: build the test variants, OCR each one, and warn
# on the first variant whose text contains a watched phrase.
image_types = ['original', 'grayscale', 'enhanced', 'filtered']
# The variant paths do not depend on the image, so build them once.
test_images = [f'tests/{img_type}.jpg' for img_type in image_types]
images = get_local_images("pics")
for i in images:
    # A False result means no fresh variants were written for this file.
    # Skip it, otherwise the tests/*.jpg left over from the previous image
    # would be OCR'd and attributed to this one.
    if generate_test_images(os.path.join('pics', i)) is False:
        continue
    for t in test_images:
        text = read_image_text(t)
        # Guard against an empty/falsy OCR result before matching phrases.
        if text and check_text(text):
            print(f'WARN: {i} is suspicious.')
            break
# # get profile images from etsy page
# def get_profile_image(profile):
# # set user agent to that of curl, which WAF appears to allow
# headers = {
# 'User-Agent': 'curl/7.64.1'
# }
# resp = requests.get(f'https://www.etsy.com/people/{profile}', headers=headers)
# if resp.status_code != 200:
# print(f'ERROR: Got a {resp.status_code} response')
# return
# html = resp.text
# soup = BeautifulSoup(html, features="html.parser")
# # get the src from <img> called data-user-avatar-img
# img = soup.find('img', {'data-user-avatar-img': True})
# if img:
# img = img['src']
# print("INFO: Downloading image from", img)
# image = requests.get(img)
# with open(f'{profile}.jpg', 'wb') as f:
# f.write(image.content)
# return f'{profile}.jpg'
# else:
# print('ERROR: Profile pic not found')
# return
argcomplete==1.10.3
attrs==23.2.0
beautifulsoup4==4.8.2
bytecode==0.15.1
cattrs==23.2.3
certifi==2024.2.2
chardet==3.0.4
charset-normalizer==3.3.2
compressed_rtf==1.0.6
ddsketch==2.0.4
ddtrace==2.7.1
Deprecated==1.2.14
docx2txt==0.8
easyocr==1.7.1
ebcdic==1.1.1
envier==0.5.1
exceptiongroup==1.2.0
extract-msg==0.28.7
filelock==3.13.1
fsspec==2024.3.1
idna==3.6
imageio==2.34.0
IMAPClient==2.1.0
importlib-metadata==6.11.0
Jinja2==3.1.3
lazy_loader==0.3
lxml==5.1.0
MarkupSafe==2.1.5
mpmath==1.3.0
networkx==3.2.1
ninja==1.11.1.1
numpy==1.26.4
olefile==0.47
opencv-python==4.9.0.80
opencv-python-headless==4.9.0.80
opentelemetry-api==1.23.0
packaging==24.0
pdfminer.six==20191110
pillow==10.2.0
pyclipper==1.3.0.post5
pycryptodome==3.20.0
PyQRCode==1.2.1
pytesseract==0.3.10
python-bidi==0.4.2
python-pptx==0.6.23
PyYAML==6.0.1
pyzbar==0.1.9
requests==2.31.0
scikit-image==0.22.0
scipy==1.12.0
shapely==2.0.3
six==1.16.0
sortedcontainers==2.4.0
soupsieve==2.5
SpeechRecognition==3.8.1
sqlparse==0.4.4
sympy==1.12
textract==1.6.5
tifffile==2024.2.12
torch==2.2.1
torchvision==0.17.1
typing_extensions==4.10.0
tzlocal==5.2
urllib3==2.2.1
wrapt==1.16.0
xlrd==1.2.0
XlsxWriter==3.2.0
xmltodict==0.13.0
zipp==3.17.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment