# Wuvist/process_scan_pages.py

## process_scan_pages.py
from PIL import Image, ImageOps
import cv2
import numpy as np
import pytesseract

# Location of the Tesseract executable that pytesseract invokes
# (a Windows install path; adjust it for other machines).
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

# Target page size in pixels: a scan that is wider/taller than this is
# center-cropped to it along that dimension before further processing.
crop_w = 1310
crop_h = 2020

# Convert an angle from radians to degrees
def DegreeTrans(theta):
    """Convert an angle from radians to degrees.

    Args:
        theta: Angle in radians.

    Returns:
        The same angle expressed in degrees.
    """
    return theta / np.pi * 180

# Rotate the image counter-clockwise by `degree` degrees (original size is kept)
def rotateImage(src, degree):
    """Rotate an image about its centre while keeping its original size.

    Args:
        src: Image array; only its first two dimensions (height, width)
            are read here, the pixels are handed to cv2.warpAffine.
        degree: Rotation angle in degrees, forwarded to
            cv2.getRotationMatrix2D (positive means counter-clockwise).

    Returns:
        The rotated image, same width and height as ``src``; areas that the
        rotation uncovers are filled with white.
    """
    height, width = src.shape[:2]
    centre = (width / 2.0, height / 2.0)
    # Affine matrix for a pure rotation (scale factor 1) about the centre.
    matrix = cv2.getRotationMatrix2D(centre, degree, 1)
    return cv2.warpAffine(
        src,
        matrix,
        (width, height),
        borderValue=(255, 255, 255),
    )

# Compute the skew angle with the Hough transform
def CalcDegree(srcImage):
    """Estimate the skew of a page from near-horizontal Hough lines.

    Args:
        srcImage: Image array, either single-channel (2-D) or multi-channel
            in the order expected by cv2.COLOR_BGR2GRAY.

    Returns:
        The mean deviation from horizontal, in degrees, of the detected
        near-horizontal lines, or 0 when no such line is found.
    """
    # A 2-D array is already grayscale; cvtColor(BGR2GRAY) would reject it.
    if srcImage.ndim == 2:
        gray = srcImage
    else:
        gray = cv2.cvtColor(srcImage, cv2.COLOR_BGR2GRAY)
    # In the Python binding the 4th positional parameter of Canny is the
    # output array, so the Sobel aperture has to be passed by keyword.
    edges = cv2.Canny(gray, 50, 200, apertureSize=3)

    # Detect straight lines with the Hough transform.
    # The 4th argument is the accumulator threshold: the larger it is, the
    # more precise the detection. If it is too small, some pages (e.g.
    # ones dominated by vertical lines) cannot be corrected.
    lines = cv2.HoughLines(edges, 1, np.pi / 180, 300)
    if lines is None:
        return 0

    theta_total = 0
    count = 0
    # Flatten to (rho, theta) pairs regardless of which axis wraps them.
    for rho, theta in lines.reshape(-1, 2):
        # Vertical extent of a 2000-unit segment along the line; only lines
        # that are almost horizontal contribute to the rotation angle.
        y0 = np.sin(theta) * rho
        dy = 1000 * np.cos(theta)
        y1 = int(round(y0 + dy))
        y2 = int(round(y0 - dy))
        if abs(y1 - y2) < 100:
            theta_total += theta
            count += 1

    if count == 0:
        return 0
    # Averaging all accepted angles gives a steadier rotation than using a
    # single line. theta is the angle of the line's normal, hence the -90.
    average = theta_total / count
    return DegreeTrans(average) - 90


def save(image, fname):
    """Deskew a page, save it as a grayscale image and OCR it into a PDF.

    Args:
        image: PIL image of the page.
        fname: Path of the output image; the searchable PDF produced by
            Tesseract is written next to it as ``fname + '.pdf'``.
    """
    # CalcDegree converts with a 3-channel colour conversion, so bring
    # grayscale / palette / CMYK / alpha inputs to plain RGB first.
    if image.mode != 'RGB':
        image = image.convert('RGB')
    pixels = np.array(image)
    # NOTE(review): the array is in RGB order while CalcDegree converts with
    # COLOR_BGR2GRAY; this only swaps the red/blue weights of the grayscale
    # used for edge detection - confirm that is acceptable.
    degree = CalcDegree(pixels)
    print(fname, degree)
    deskewed = rotateImage(pixels, degree)

    Image.fromarray(deskewed).convert('L').save(fname)
    # Tesseract reads the image back from the file that was just written.
    pdf = pytesseract.image_to_pdf_or_hocr(fname, extension='pdf')
    with open(fname + '.pdf', 'wb') as f:
        f.write(pdf)


def _centered_crop_box(width, height, max_width, max_height):
    """Return the centered (left, top, right, bottom) box that trims a
    width x height image to at most max_width x max_height."""
    box_w = min(width, max_width)
    box_h = min(height, max_height)
    # Integer arithmetic keeps the box on whole pixels even when the margin
    # is odd, instead of handing crop() fractional coordinates.
    left = (width - box_w) // 2
    top = (height - box_h) // 2
    return (left, top, left + box_w, top + box_h)


# Process pages 1.jpg .. 263.jpg into book_<n>.jpg (+ .pdf).
for i in range(1, 264):
    fname = str(i) + ".jpg"
    out_fname = "book_" + str(i) + ".jpg"
    with Image.open(fname) as im:
        w, h = im.size
        if w <= crop_w and h <= crop_h:
            # Nothing to trim: report the page and process it as is
            # (no white border is added in this case).
            print(fname)
            save(im, out_fname)
            continue
        # Trim whichever dimension exceeds the target size, then pad the
        # result with a 20-pixel white border.
        box = _centered_crop_box(w, h, crop_w, crop_h)
        im1 = ImageOps.expand(
            im.crop(box), border=20, fill=(255, 255, 255))
        save(im1, out_fname)
	from PIL import Image, ImageOps
	import cv2
	import numpy as np
	import pytesseract

	# Location of the Tesseract executable that pytesseract invokes
	# (a Windows install path; adjust it for other machines).
	pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

	# Target page size in pixels: a scan that is wider/taller than this is
	# center-cropped to it along that dimension before further processing.
	crop_w = 1310
	crop_h = 2020

	# Convert an angle from radians to degrees
	def DegreeTrans(theta):
		"""Convert an angle from radians to degrees.

		Args:
			theta: Angle in radians.

		Returns:
			The same angle expressed in degrees.
		"""
		return theta / np.pi * 180

	# Rotate the image counter-clockwise by `degree` degrees (original size is kept)
	def rotateImage(src, degree):
		"""Rotate an image about its centre while keeping its original size.

		Args:
			src: Image array; only its first two dimensions (height, width)
				are read here, the pixels are handed to cv2.warpAffine.
			degree: Rotation angle in degrees, forwarded to
				cv2.getRotationMatrix2D (positive means counter-clockwise).

		Returns:
			The rotated image, same width and height as ``src``; areas that
			the rotation uncovers are filled with white.
		"""
		height, width = src.shape[:2]
		centre = (width / 2.0, height / 2.0)
		# Affine matrix for a pure rotation (scale factor 1) about the centre.
		matrix = cv2.getRotationMatrix2D(centre, degree, 1)
		return cv2.warpAffine(
			src,
			matrix,
			(width, height),
			borderValue=(255, 255, 255),
		)

	# Compute the skew angle with the Hough transform
	def CalcDegree(srcImage):
		"""Estimate the skew of a page from near-horizontal Hough lines.

		Args:
			srcImage: Image array, either single-channel (2-D) or
				multi-channel in the order expected by cv2.COLOR_BGR2GRAY.

		Returns:
			The mean deviation from horizontal, in degrees, of the detected
			near-horizontal lines, or 0 when no such line is found.
		"""
		# A 2-D array is already grayscale; cvtColor(BGR2GRAY) would reject it.
		if srcImage.ndim == 2:
			gray = srcImage
		else:
			gray = cv2.cvtColor(srcImage, cv2.COLOR_BGR2GRAY)
		# In the Python binding the 4th positional parameter of Canny is the
		# output array, so the Sobel aperture has to be passed by keyword.
		edges = cv2.Canny(gray, 50, 200, apertureSize=3)

		# Detect straight lines with the Hough transform.
		# The 4th argument is the accumulator threshold: the larger it is,
		# the more precise the detection. If it is too small, some pages
		# (e.g. ones dominated by vertical lines) cannot be corrected.
		lines = cv2.HoughLines(edges, 1, np.pi / 180, 300)
		if lines is None:
			return 0

		theta_total = 0
		count = 0
		# Flatten to (rho, theta) pairs regardless of which axis wraps them.
		for rho, theta in lines.reshape(-1, 2):
			# Vertical extent of a 2000-unit segment along the line; only
			# lines that are almost horizontal contribute to the angle.
			y0 = np.sin(theta) * rho
			dy = 1000 * np.cos(theta)
			y1 = int(round(y0 + dy))
			y2 = int(round(y0 - dy))
			if abs(y1 - y2) < 100:
				theta_total += theta
				count += 1

		if count == 0:
			return 0
		# Averaging all accepted angles gives a steadier rotation than using
		# a single line. theta is the angle of the line's normal, hence -90.
		average = theta_total / count
		return DegreeTrans(average) - 90


	def save(image, fname):
		"""Deskew a page, save it as a grayscale image and OCR it into a PDF.

		Args:
			image: PIL image of the page.
			fname: Path of the output image; the searchable PDF produced by
				Tesseract is written next to it as ``fname + '.pdf'``.
		"""
		# CalcDegree converts with a 3-channel colour conversion, so bring
		# grayscale / palette / CMYK / alpha inputs to plain RGB first.
		if image.mode != 'RGB':
			image = image.convert('RGB')
		pixels = np.array(image)
		# NOTE(review): the array is in RGB order while CalcDegree converts
		# with COLOR_BGR2GRAY; this only swaps the red/blue weights of the
		# grayscale used for edge detection - confirm that is acceptable.
		degree = CalcDegree(pixels)
		print(fname, degree)
		deskewed = rotateImage(pixels, degree)

		Image.fromarray(deskewed).convert('L').save(fname)
		# Tesseract reads the image back from the file that was just written.
		pdf = pytesseract.image_to_pdf_or_hocr(fname, extension='pdf')
		with open(fname + '.pdf', 'wb') as f:
			f.write(pdf)


	def _centered_crop_box(width, height, max_width, max_height):
		"""Return the centered (left, top, right, bottom) box that trims a
		width x height image to at most max_width x max_height."""
		box_w = min(width, max_width)
		box_h = min(height, max_height)
		# Integer arithmetic keeps the box on whole pixels even when the
		# margin is odd, instead of handing crop() fractional coordinates.
		left = (width - box_w) // 2
		top = (height - box_h) // 2
		return (left, top, left + box_w, top + box_h)


	# Process pages 1.jpg .. 263.jpg into book_<n>.jpg (+ .pdf).
	for i in range(1, 264):
		fname = str(i) + ".jpg"
		out_fname = "book_" + str(i) + ".jpg"
		with Image.open(fname) as im:
			w, h = im.size
			if w <= crop_w and h <= crop_h:
				# Nothing to trim: report the page and process it as is
				# (no white border is added in this case).
				print(fname)
				save(im, out_fname)
				continue
			# Trim whichever dimension exceeds the target size, then pad
			# the result with a 20-pixel white border.
			box = _centered_crop_box(w, h, crop_w, crop_h)
			im1 = ImageOps.expand(
				im.crop(box), border=20, fill=(255, 255, 255))
			save(im1, out_fname)