Skip to content

Instantly share code, notes, and snippets.

@christianroman
Last active June 12, 2021 01:09
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 6 You must be signed in to fork a gist
  • Save christianroman/8485394 to your computer and use it in GitHub Desktop.
Save christianroman/8485394 to your computer and use it in GitHub Desktop.
Rompiendo Captcha de CURP usando Python, OpenCV, Tesseract OCR y Tornado
import tornado.ioloop
import tornado.web
import urllib2 as urllib
from PIL import Image
from cStringIO import StringIO
import numpy as np
import tesserwrap
import cv2
class MainHandler(tornado.web.RequestHandler):
    """Download a CURP captcha, clean it with OpenCV and reply with the OCR text.

    The pipeline is: fetch image -> paint over detected line segments ->
    blur / threshold / morphological opening -> Tesseract OCR.
    """

    # Endpoint that serves a fresh captcha image on every request.
    CAPTCHA_URL = "http://consultas.curp.gob.mx/CurpSP/imagenCatcha"
    # Seconds to wait for the captcha server. The download is synchronous and
    # blocks the IOLoop, so it must not be allowed to hang indefinitely.
    FETCH_TIMEOUT = 10
    # Characters Tesseract is allowed to emit.
    OCR_WHITELIST = "0123456789abcdefghijklmnopqrstuvwxyz"

    def get(self):
        """Handle GET /: write the recognised captcha text to the response."""
        captcha = self._fetch_captcha()
        bgr = self._to_bgr(captcha)
        self._erase_lines(bgr)
        cleaned = self._binarize(bgr)
        self.write(self._read_text(cleaned).strip())

    def _fetch_captcha(self):
        """Download the captcha and return it as a PIL image."""
        response = urllib.urlopen(self.CAPTCHA_URL, timeout=self.FETCH_TIMEOUT)
        try:
            payload = response.read()
        finally:
            # Release the socket even if the read fails.
            response.close()
        return Image.open(StringIO(payload))

    @staticmethod
    def _to_bgr(pil_image):
        """Convert a PIL image to a writable BGR array for OpenCV."""
        # Normalise to three channels first so palette / grayscale / RGBA
        # inputs all work, then reverse the channel axis (RGB -> BGR).
        # The copy makes the array contiguous and writable for cv2.line.
        rgb = np.asarray(pil_image.convert("RGB"))
        return rgb[:, :, ::-1].copy()

    @staticmethod
    def _erase_lines(bgr):
        """Paint every detected line segment white, modifying `bgr` in place."""
        gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
        # Canny edge map feeds the probabilistic Hough transform.
        edges = cv2.Canny(gray, 60, 200, apertureSize=3)
        lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 1, None, 0, 0)
        if lines is None:
            # No segments found: nothing to erase.
            return
        # OpenCV 2.4 returns shape (1, N, 4), OpenCV 3+ returns (N, 1, 4);
        # flattening to (N, 4) visits every segment on both.
        for x1, y1, x2, y2 in lines.reshape(-1, 4):
            # Plain ints: some OpenCV builds reject numpy integer scalars.
            cv2.line(bgr, (int(x1), int(y1)), (int(x2), int(y2)),
                     (255, 255, 255), 2)

    @staticmethod
    def _binarize(bgr):
        """Return a PIL image: grayscale, blurred, thresholded and opened."""
        processed = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
        blur = cv2.GaussianBlur(processed, (3, 3), 0)
        # cv2.threshold returns (retval, image); only the image is needed.
        threshold = cv2.threshold(blur, 128, 255, cv2.THRESH_BINARY)[1]
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (6, 6))
        morph = cv2.morphologyEx(threshold, cv2.MORPH_OPEN, kernel)
        return Image.fromarray(morph)

    def _read_text(self, pil_image):
        """Run Tesseract on `pil_image` and return the raw recognised text."""
        tesseract = tesserwrap.tesseract()
        tesseract.set_variable("tessedit_char_whitelist", self.OCR_WHITELIST)
        # Page segmentation mode 8 treats the image as a single word.
        tesseract.set_page_seg_mode(8)
        return tesseract.ocr_image(pil_image)
# Tornado application: the site root is served by MainHandler.
application = tornado.web.Application(handlers=[(r"/", MainHandler)])
if __name__ == "__main__":
    # Script entry point: bind to port 8888 and run the event loop until
    # the process is stopped.
    application.listen(port=8888)
    io_loop = tornado.ioloop.IOLoop.instance()
    io_loop.start()
@bf
Copy link

bf commented May 23, 2014

Thanks, this helped me a lot! :-)

@eml-nx
Copy link

eml-nx commented Apr 8, 2015

Que galán te viste!

@aagp
Copy link

aagp commented Jul 18, 2018

Hola, ¿me podrías ayudar? Estoy tratando de instalar tesserwrap pero no me deja:
λ python server.py
Traceback (most recent call last):
File "server.py", line 10, in <module>
import tesserwrap
ImportError: No module named tesserwrap

λ pip install tesserwrap
Collecting tesserwrap
Using cached https://files.pythonhosted.org/packages/04/92/4c2134fc465d576c05d4426bc2f1ba7871652d78d3d913bec0bffe0afe8b/tesserwrap-0.1.6.tar.gz
Complete output from command python setup.py egg_info:
"ld" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
"ld" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "c:\users\usuario\appdata\local\temp\pip-install-xcu3ya\tesserwrap\setup.py", line 45, in <module>
extra_lib_paths)
File "c:\users\usuario\appdata\local\temp\pip-install-xcu3ya\tesserwrap\setup.py", line 30, in find_closest_libname
"Cannot find Tesseract via ldconfig, confirm it is installed.")
Exception: Cannot find Tesseract via ldconfig, confirm it is installed.

----------------------------------------

Command "python setup.py egg_info" failed with error code 1 in c:\users\usuario\appdata\local\temp\pip-install-xcu3ya\tesserwrap\

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment