# Taxpayer checker via tesseract-ocr
# Note: this file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
import os | |
import shutil | |
import uuid | |
import pytesseract # pip install pytesseract and https://tesseract-ocr.github.io/tessdoc/Home.html | |
import requests # pip install requests | |
from PIL import Image # pip install pillow | |
from bs4 import BeautifulSoup # pip install beautifulsoup4 | |
# One requests session shared by every call in this module, so the captcha
# download and the query POST go through the same session object.
session = requests.Session()

# Request headers sent with the query POST; they mimic a desktop Edge browser
# submitting the search form on sorgu.efatura.gov.tr.
headers = {
    "Accept-Language": "tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7",
    "Content-Type": "application/x-www-form-urlencoded",
    "Pragma": "no-cache",
    "Origin": "https://sorgu.efatura.gov.tr",
    "Referer": "https://sorgu.efatura.gov.tr/kullanicilar/xliste.php",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/103.0.5060.66 Safari/537.36 Edg/103.0.1264.44"
    ),
}
def taxpayer_check(tax_no: str) -> bool:
    """Check whether a tax number is reported as a registered taxpayer.

    Downloads the captcha image from sorgu.efatura.gov.tr, reads it with
    Tesseract OCR, then submits the search form with the tax number and the
    recognised captcha text and inspects the returned HTML.

    Args:
        tax_no: Tax number to look up. Must be a string made only of digits.

    Returns:
        True if the result page contains the bold message
        "Mükellef kayıtlıdır." (Turkish, roughly "Taxpayer is registered").
        False otherwise, including when ``tax_no`` is not a digit string or
        when the message is absent from the response.
    """
    # Reject anything that is not a purely numeric string before any I/O.
    if not isinstance(tax_no, str) or not tax_no.isdigit():
        return False

    # Random file name so separate calls do not overwrite each other's image.
    chaptcha_image_file = f'{uuid.uuid4()}.jpg'
    try:
        # Stream the captcha image to disk; the context manager closes the
        # response once the body has been copied.
        with session.get(url='https://sorgu.efatura.gov.tr/kullanicilar/img.php', stream=True) as response:
            with open(chaptcha_image_file, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)

        # Recognise the captcha text and drop the newlines Tesseract appends.
        # https://stackoverflow.com/questions/67857988/removing-newline-n-from-tesseract-return-values
        with Image.open(chaptcha_image_file) as captcha_image:
            captcha = pytesseract.image_to_string(captcha_image, config='--oem 3 --psm 6').replace('\n', '')
    finally:
        # Always delete the temporary image, including on errors; previously
        # it was left behind whenever the function returned True or raised.
        if os.path.exists(chaptcha_image_file):
            os.remove(chaptcha_image_file)

    # NOTE(review): TLS certificate verification is disabled for this POST
    # only (the GET above verifies). Confirm whether this is still required.
    response = session.post(
        verify=False,
        url='https://sorgu.efatura.gov.tr/kullanicilar/xliste.php',
        headers=headers,
        data={'search_string': tax_no, 'captcha_code': captcha},
    )
    get_html_content = BeautifulSoup(response.text, features='html.parser')
    mukellef = get_html_content.find('div', {'style': 'font-weight:bold;'})

    # Explicit bool so callers never receive None on the negative path.
    return mukellef is not None and mukellef.text == "Mükellef kayıtlıdır."
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.