Created
December 25, 2023 15:23
-
-
Save Wuvist/2fd232d0417954d5141f50bd3d319c27 to your computer and use it in GitHub Desktop.
Python Script to process scanned book pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PIL import Image, ImageOps | |
import cv2 | |
import numpy as np | |
import pytesseract | |
# Path of the Tesseract executable that pytesseract will invoke.
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

# Size in pixels (width, height) that over-sized scans are centre-cropped to
# by the page loop at the bottom of this script.
crop_w, crop_h = 1310, 2020
def DegreeTrans(theta):
    """Convert an angle from radians to degrees.

    Args:
        theta: Angle in radians.

    Returns:
        The angle expressed in degrees.
    """
    return theta / np.pi * 180
def rotateImage(src, degree):
    """Rotate an image counter-clockwise about its centre, keeping its size.

    Args:
        src: Image array; its first two dimensions are read as
            (height, width).
        degree: Rotation angle in degrees.

    Returns:
        The rotated image, same width and height as ``src``. Areas exposed
        by the rotation are filled with white.
    """
    height, width = src.shape[:2]
    # The rotation pivots around the centre of the image.
    centre = (width / 2.0, height / 2.0)
    # 2-D affine matrix for a pure rotation (scale factor 1).
    matrix = cv2.getRotationMatrix2D(centre, degree, 1)
    # Apply the affine transform with a white background fill.
    return cv2.warpAffine(src, matrix, (width, height),
                          borderValue=(255, 255, 255))
def CalcDegree(srcImage):
    """Estimate the skew angle of a page with the Hough line transform.

    Edges are detected with Canny, straight lines are found with
    ``cv2.HoughLines``, and the ``theta`` values of all nearly horizontal
    lines are averaged.

    Args:
        srcImage: Image array, either 3/4-channel colour or an already
            single-channel (2-D) grayscale image.

    Returns:
        The average line angle in degrees minus 90, suitable for passing to
        ``rotateImage``. Returns ``0`` when no line, or no nearly horizontal
        line, is detected.
    """
    # A 2-D array is already grayscale; cvtColor would reject it.
    if srcImage.ndim == 2:
        gray = srcImage
    else:
        gray = cv2.cvtColor(srcImage, cv2.COLOR_BGR2GRAY)

    # apertureSize has to be passed by keyword: in the Python binding the
    # fourth positional parameter of cv2.Canny is the output array.
    edges = cv2.Canny(gray, 50, 200, apertureSize=3)

    # Detect straight lines with the Hough transform.
    # The fourth argument is the accumulator threshold: the larger it is,
    # the more precise the detection. If it is too small, some pages
    # (e.g. vertical ones) cannot be corrected.
    lines = cv2.HoughLines(edges, 1, np.pi / 180, 300)
    if lines is None:
        return 0

    theta_total = 0
    matches = 0
    for line in lines:
        for rho, theta in line:
            # Vertical extent of a 2000-pixel segment lying on this line.
            a = np.cos(theta)
            y0 = np.sin(theta) * rho
            y1 = int(round(y0 + 1000 * a))
            y2 = int(round(y0 - 1000 * a))
            # Keep only nearly horizontal lines (small vertical extent);
            # those are the ones that describe the page skew.
            if abs(y1 - y2) < 100:
                theta_total += theta
                matches += 1

    if matches == 0:
        return 0

    # Averaging over all accepted lines gives a better rotation result.
    return DegreeTrans(theta_total / matches) - 90
def save(image, fname):
    """Deskew a page image, save it as grayscale and OCR it into a PDF.

    Args:
        image: PIL image of the page.
        fname: Path the deskewed grayscale image is written to. The OCR
            result is written next to it as ``fname + '.pdf'``.
    """
    pixels = np.array(image)
    skew = CalcDegree(pixels)
    print(fname, skew)
    deskewed = rotateImage(pixels, skew)
    Image.fromarray(deskewed).convert('L').save(fname)
    # Tesseract reads the saved image back from disk; the returned PDF data
    # is written out in binary mode.
    pdf_data = pytesseract.image_to_pdf_or_hocr(fname, extension='pdf')
    with open(fname + '.pdf', 'w+b') as out:
        out.write(pdf_data)
# Process the scanned pages 1.jpg .. 263.jpg: centre-crop every dimension
# that exceeds the crop size, add a white border, then deskew and OCR.
for i in range(1, 264):
    fname = str(i) + ".jpg"
    out_fname = "book_" + str(i) + ".jpg"
    # The context manager closes the source file once the page is processed.
    with Image.open(fname) as im:
        w, h = im.size
        too_wide = w > crop_w
        too_tall = h > crop_h
        if too_wide or too_tall:
            # Trim equally from both sides, only along the over-sized axes.
            left = (w - crop_w) / 2 if too_wide else 0
            top = (h - crop_h) / 2 if too_tall else 0
            page = ImageOps.expand(
                im.crop((left, top, w - left, h - top)),
                border=20, fill=(255, 255, 255))
        else:
            # Page already fits: report it and keep it uncropped, unbordered.
            print(fname)
            page = im
        save(page, out_fname)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment