Skip to content

Instantly share code, notes, and snippets.

@Bilio
Bilio / clean_html.py
Created July 17, 2018 01:02
Clean html with python lxml.html Cleaner
import codecs
import sys
from lxml import etree
from lxml.html.clean import Cleaner
def sanitize(dirty_html):
cleaner = Cleaner(page_structure=True,
meta=True,
@Bilio
Bilio / test.py
Created June 4, 2018 17:51 — forked from christianroman/test.py
Bypass Captcha using 10 lines of code with Python, OpenCV & Tesseract OCR engine
import cv2.cv as cv
import tesseract
gray = cv.LoadImage('captcha.jpeg', cv.CV_LOAD_IMAGE_GRAYSCALE)
cv.Threshold(gray, gray, 231, 255, cv.CV_THRESH_BINARY)
api = tesseract.TessBaseAPI()
api.Init(".","eng",tesseract.OEM_DEFAULT)
api.SetVariable("tessedit_char_whitelist", "0123456789abcdefghijklmnopqrstuvwxyz")
api.SetPageSegMode(tesseract.PSM_SINGLE_WORD)
tesseract.SetCvImage(gray,api)
print api.GetUTF8Text()