Skip to content

Instantly share code, notes, and snippets.

@knzm
Last active July 20, 2019 05:52
Show Gist options
  • Save knzm/c1bc01860ec22086c02713f40a33e8f7 to your computer and use it in GitHub Desktop.
Save knzm/c1bc01860ec22086c02713f40a33e8f7 to your computer and use it in GitHub Desktop.
build tesserocr for Python 2.7 on Windows
REM Build driver: fetches tesserocr, drops in the customised setup.py and
REM configure.py that sit next to this script, then builds a wheel.
REM
REM The three variables below are read by configure.py from the environment.
REM BUILD_TARGET_32=0 keeps the " Win64" generator suffix on a win-amd64 Python.
SET BUILD_TARGET_32=0
REM UTF8_PATCH=1 enables the patch that prepends a UTF-8 BOM to
REM ccmain/equationdetect.cpp in the libtesseract sources.
SET UTF8_PATCH=1
REM DEBUG=0 keeps configure.py logging at INFO level instead of DEBUG.
SET DEBUG=0
REM Check out the v2.2.2 ref of tesserocr.
git clone -b v2.2.2 https://github.com/sirfz/tesserocr.git
REM Replace/add the build scripts inside the fresh clone.
copy setup.py tesserocr\setup.py
copy configure.py tesserocr\configure.py
cd tesserocr
REM Create the "tesserocr" package directory; setup.py builds the extension as
REM tesserocr._tesserocr and __init__.py re-exports everything from it.
mkdir tesserocr
echo from ._tesserocr import * > tesserocr\__init__.py
REM configure.py builds tesseract/leptonica and writes configure.json,
REM which setup.py reads when building the wheel.
python configure.py
python setup.py bdist_wheel
import logging
import sys
import os
import re
import shutil
import subprocess
import codecs
import json
from distutils.util import strtobool
from distutils.util import get_platform
import yaml
# Root logger shared by every step of this script; records go to stderr.
_LOGGER = logging.getLogger()
# DEBUG=<true value> in the environment switches to verbose logging.
_LOGGER.setLevel(
    logging.DEBUG if strtobool(os.environ.get('DEBUG', '0')) else logging.INFO)
_LOGGER.addHandler(logging.StreamHandler(sys.stderr))

# Versions used when LEPTONICA_VERSION / TESSERACT_VERSION are not set.
LEPTONICA_DEFAULT_VERSION = '1.74.4'
TESSERACT_DEFAULT_VERSION = '3.5.2'
def version_to_int(version):
    """Encode a version string as the integer used for TESSERACT_VERSION.

    The first dotted version found in ``version`` (for example the
    ``3.05.02`` in ``"tesseract 3.05.02"``) is reduced to exactly three
    groups: extra groups are dropped and missing ones are padded with zero.
    Each group is printed as two decimal digits and the concatenation is read
    as a hexadecimal literal, so ``"3.05.02"`` becomes ``0x030502`` and can be
    compared against constants such as ``0x040000``.

    :param version: text containing a version number.
    :returns: the encoded version as an ``int``.
    :raises ValueError: if ``version`` contains no version number at all.
    """
    # Prefer a dotted version; fall back to a bare number ("4") only when no
    # dotted one exists, so every string that used to work keeps its result.
    match = (re.search(r'((?:\d+\.)+\d+)', version)
             or re.search(r'\d+', version))
    if match is None:
        raise ValueError('no version number found in {!r}'.format(version))
    # Pad with zeros and keep the first three groups only.
    version_groups = (match.group().split('.') + [0, 0])[:3]
    version_str = "{:02}{:02}{:02}".format(*map(int, version_groups))
    return int(version_str, 16)
def generate_cppan_config(build_dir, generator, tesseract_cppan_version, leptonica_version):
    """Write the ``cppan.yml`` that drives the cppan build and return its path.

    :param build_dir: existing directory that receives ``cppan.yml``.
    :param generator: CMake generator name placed under ``local_settings``.
    :param tesseract_cppan_version: version requested for the libtesseract
        and tesseract cppan packages.
    :param leptonica_version: version requested for the leptonica package.
    :returns: path of the written ``cppan.yml``.
    """
    # YAML nesting is carried by indentation.  With every key at column 0,
    # "local_settings" and "projects" would be empty and their settings would
    # turn into unrelated top-level keys.
    # NOTE(review): the nesting below is reconstructed from the key names and
    # from the "<build_dir>/build/cppan-build-*" layout the rest of this
    # script reads -- confirm against the cppan.yml reference.
    cppan_config_tmpl = """\
local_settings:
    cppan_dir: cppan
    build_dir_type: local
    build_dir: build
    build:
        generator: {generator}
projects:
    dummy:
        files: dummy.cpp
        dependencies:
            pvt.cppan.demo.danbloomberg.leptonica: {leptonica_version}
            pvt.simonflueckiger.tesseract.libtesseract: {tesseract_cppan_version}
            pvt.simonflueckiger.tesseract.tesseract: {tesseract_cppan_version}
"""
    cppan_yaml_path = os.path.join(build_dir, 'cppan.yml')
    with open(cppan_yaml_path, 'w') as fp:
        fp.write(cppan_config_tmpl.format(
            generator=generator,
            leptonica_version=leptonica_version,
            tesseract_cppan_version=tesseract_cppan_version))
    return cppan_yaml_path
def generate_project_files(build_dir, tesseract_cppan_version):
    """Run ``cppan --generate`` for the tesseract package inside ``build_dir``.

    ``subprocess.check_call`` raises ``CalledProcessError`` when cppan exits
    with a non-zero status.
    """
    target = 'pvt.simonflueckiger.tesseract.tesseract-{}'.format(tesseract_cppan_version)
    argv = ['cppan', '--generate']
    argv.append(target)
    _LOGGER.debug("cmd: {}".format(argv))
    subprocess.check_call(argv, cwd=build_dir)
def build_tesseract_exe(build_dir, tesseract_cppan_version):
    """Run ``cppan --build-packages`` for the tesseract package in ``build_dir``.

    ``subprocess.check_call`` raises ``CalledProcessError`` when cppan exits
    with a non-zero status.
    """
    target = 'pvt.simonflueckiger.tesseract.tesseract-{}'.format(tesseract_cppan_version)
    argv = ['cppan', '--build-packages']
    argv.append(target)
    _LOGGER.debug("cmd: {}".format(argv))
    subprocess.check_call(argv, cwd=build_dir)
def get_tesseract_version_from_exe(build_dir):
    """Ask the freshly built ``tesseract.exe`` for its version.

    Runs ``<build_dir>/bin/tesseract.exe -v`` and extracts the first
    ``tesseract X.Y.Z`` occurrence from what it prints.

    :param build_dir: build directory whose ``bin`` folder holds the
        executable.
    :returns: the ``X.Y.Z`` string, or ``None`` when the output does not
        contain one.
    :raises subprocess.CalledProcessError: if the executable exits with a
        non-zero status.
    """
    bin_dir = os.path.join(build_dir, 'bin')
    cmd = [os.path.join(bin_dir, 'tesseract.exe'), '-v']
    _LOGGER.debug("cmd: {}".format(cmd))
    # Capture stderr as well: some tesseract releases print the "-v" banner
    # there, which would otherwise leave ``output`` empty.
    output = subprocess.check_output(cmd, cwd=bin_dir, stderr=subprocess.STDOUT)
    # check_output returns bytes on Python 3; the str pattern below would
    # raise TypeError on it.  On Python 2 the value is already ``str``.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')
    _LOGGER.debug("version string: {}".format(output))
    m = re.search(r"tesseract ([0-9]+\.[0-9]+\.[0-9]+)", output)
    if m is None:
        return None
    return m.group(1)
def build_dummy_exe(build_dir):
    """Create a trivial ``dummy.cpp`` in ``build_dir`` and build it with cppan.

    The source is an empty ``main``; it is built by running
    ``cppan --build .`` in ``build_dir``.  ``subprocess.check_call`` raises
    ``CalledProcessError`` when cppan exits with a non-zero status.
    """
    # Write the placeholder translation unit.
    source_path = os.path.join(build_dir, 'dummy.cpp')
    with open(source_path, 'w') as fp:
        fp.write('int main(int argc, char *argv[]) { return 0; }\n')

    # Build it in place.
    argv = ['cppan', '--build', '.']
    _LOGGER.debug("cmd: {}".format(argv))
    subprocess.check_call(argv, cwd=build_dir)
def get_magic_number_for_build_dir(root_dir):
    """Return the name of the single sub-directory of ``root_dir``.

    The name appears to be a hash-like key chosen by cppan (presumably
    derived from architecture, compiler, debug/release, ...); this script
    only needs it to locate the generated files.

    :param root_dir: directory expected to contain exactly one sub-directory
        (plain files are ignored).
    :returns: the sub-directory name.
    :raises RuntimeError: if ``root_dir`` holds zero or several
        sub-directories.
    """
    subdirs = sorted(
        name for name in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, name)))
    # An explicit error instead of ``assert``: asserts vanish under ``-O`` and
    # give no hint about what was actually found.
    if len(subdirs) != 1:
        raise RuntimeError(
            'expected exactly one sub-directory in {}, found {}: {}'.format(
                root_dir, len(subdirs), subdirs))
    return subdirs[0]
def get_cache_var(contents, key):
    """Extract the value of ``set_cache_var(<key> <value>)`` from ``contents``.

    :param contents: text of a generated CMake file.
    :param key: exact variable name to look for.
    :returns: the text between the key and the first closing parenthesis
        that follows it, or ``None`` when the key is not present.
    """
    # Escape the key so it is matched literally and regex metacharacters in
    # it cannot match unrelated variable names.
    pattern = r"set_cache_var\({} (.+?)\)".format(re.escape(key))
    m = re.search(pattern, contents)
    if not m:
        return None
    return m.group(1)
class Patcher(object):
    """Base class for optional source patches.

    Subclasses override ``enabled`` to say whether the patch is wanted and
    ``_apply`` to perform it.
    """

    @classmethod
    def enabled(cls):
        """Tell whether this patch should run; the base class never does."""
        return False

    def _apply(self):
        """Perform the patch; must be provided by subclasses."""
        raise NotImplementedError

    def apply(self):
        """Run the patch when enabled.

        Returns ``False`` when the patch is disabled.  Otherwise returns what
        ``_apply`` returned, with ``None`` reported as ``True``.
        """
        if self.enabled():
            outcome = self._apply()
            if outcome is None:
                return True
            return outcome
        return False
class Utf8Patcher(Patcher):
    """Prepend a UTF-8 BOM to ``ccmain/equationdetect.cpp``.

    Enabled through the ``UTF8_PATCH`` environment variable.
    """

    def __init__(self, source_path):
        # Root of the libtesseract source tree that contains "ccmain".
        self.source_path = source_path

    @classmethod
    def enabled(cls):
        """Return True when ``UTF8_PATCH`` is set to a true value."""
        flag = os.environ.get('UTF8_PATCH', '0')
        return bool(strtobool(flag))

    def _apply(self):
        """Add the BOM in place.

        Returns ``True`` when the file was rewritten and ``False`` when it
        already started with a BOM.
        """
        target = os.path.join(self.source_path, "ccmain/equationdetect.cpp")
        bom = codecs.BOM_UTF8
        with open(target, "r+b") as fp:
            data = fp.read()
            if not data.startswith(bom):
                # Rewrite from the start: BOM first, then the old bytes.
                fp.seek(0)
                fp.write(bom)
                fp.write(data)
                return True
        return False
def apply_patches(build_dir):
    """Patch the downloaded libtesseract sources and drop stale cppan caches.

    The libtesseract source directory is read from the generated
    ``CMakeLists.txt`` under ``<build_dir>/build/cppan-build-tesseract``.
    When at least one patcher changed something, the cppan ``lnk`` and
    ``obj`` storage folders in the user's home directory are deleted.

    :param build_dir: build directory in which project files were generated.
    :returns: ``True`` if any patch was applied, ``False`` otherwise.
    :raises RuntimeError: if the libtesseract source directory cannot be
        found in the CMake file.
    """
    tesseract_build_dir = os.path.join(build_dir, 'build', 'cppan-build-tesseract')
    build_number = get_magic_number_for_build_dir(tesseract_build_dir)
    cmakefile_path = os.path.join(
        tesseract_build_dir, build_number,
        'cppan', 'CMakeLists.txt')
    with open(cmakefile_path, 'r') as fp:
        contents = fp.read()
    libtesseract_dir = get_cache_var(contents, 'pvt_simonflueckiger_tesseract_libtesseract_DIR')
    if not libtesseract_dir:
        raise RuntimeError('cannot detect libtesseract source codes')
    _LOGGER.info("libtesseract_dir: {}".format(libtesseract_dir))
    patchers = [
        Utf8Patcher(libtesseract_dir),
    ]
    # Every patcher must run, so do not short-circuit with any().
    dirty_bit = False
    for patcher in patchers:
        if patcher.apply():
            dirty_bit = True
    if dirty_bit:
        # Delete the lnk and obj folders.
        for cache_dir in ("~/.cppan/storage/lnk", "~/.cppan/storage/obj"):
            cache_path = os.path.expanduser(cache_dir)
            # A folder that is not there needs no cleaning; rmtree would
            # otherwise abort the whole build.
            if not os.path.isdir(cache_path):
                _LOGGER.debug("nothing to delete at {}".format(cache_path))
                continue
            _LOGGER.info("deleting {}".format(cache_path))
            shutil.rmtree(cache_path)
    return dirty_bit
def prepare_headers_for_leptonica(dest_dir, src_dir, build_dir):
    """Copy the leptonica ``*.h`` files into a newly created ``dest_dir``.

    Headers are taken from ``src_dir`` first and then from ``build_dir``
    (generated headers); sub-directories are not searched.  ``dest_dir`` is
    created with ``os.makedirs`` and therefore must not exist yet.
    """
    headers = []
    # Sources first, generated headers second; each folder in sorted order.
    for folder in (src_dir, build_dir):
        headers.extend(
            os.path.join(folder, entry)
            for entry in sorted(os.listdir(folder))
            if entry.endswith('.h') and os.path.isfile(os.path.join(folder, entry)))

    os.makedirs(dest_dir)
    for header in headers:
        shutil.copy(header, dest_dir)
def prepare_headers_for_libtesseract(dest_dir, src_dir, build_dir):
    """Copy the libtesseract ``*.h`` files into a newly created ``dest_dir``.

    The folders to scan are the public include directories listed in
    ``<src_dir>/cppan.yml``, followed by ``build_dir`` for generated headers.
    After copying, the ``#include <cstdint>`` line of the copied ``host.h``
    is commented out.  ``dest_dir`` must not exist yet.
    """
    # Public include directories come from the package's own cppan.yml.
    with open(os.path.join(src_dir, 'cppan.yml'), 'r') as fp:
        package_cfg = yaml.safe_load(fp)
    folders = [
        os.path.normpath(os.path.join(src_dir, relative))
        for relative in package_cfg['include_directories']['public']]
    # Generated headers live in the build directory.
    folders.append(build_dir)

    headers = []
    for folder in folders:
        for entry in sorted(os.listdir(folder)):
            candidate = os.path.join(folder, entry)
            if os.path.isfile(candidate) and entry.endswith('.h'):
                headers.append(candidate)

    os.makedirs(dest_dir)
    for header in headers:
        shutil.copy(header, dest_dir)

    # host.h needs patching: disable its <cstdint> include.
    host_header = os.path.join(dest_dir, 'host.h')
    with open(host_header, 'r+') as fp:
        patched = fp.read().replace('#include <cstdint>', '// #include <cstdint>')
        fp.truncate(0)
        fp.seek(0)
        fp.write(patched)
def get_cppan_build_dir(src_dir, build_number):
    """Map a cppan source directory to its matching build directory.

    ``src_dir`` must end in ``/hh/hh/hhhh`` (hexadecimal digits).  Those
    three components joined together form the package hash.  The result is
    ``src_dir`` with its first ``/src/`` replaced by ``/obj/``, followed by
    ``build/<build_number>/cppan/<package hash>``.

    Returns ``None`` when ``src_dir`` does not end in that pattern.
    """
    tail = re.search('/[0-9a-fA-F]{2}/[0-9a-fA-F]{2}/[0-9a-fA-F]{4}$', src_dir)
    if tail is None:
        return None
    package_hash = ''.join(tail.group(0).split('/'))
    obj_dir = src_dir.replace('/src/', '/obj/', 1)
    return os.path.join(obj_dir, 'build', build_number, 'cppan', package_hash)
def get_directories_from_cmakefile(build_dir):
    """Locate the leptonica and libtesseract source and build directories.

    Reads the ``CMakeLists.txt`` generated for the dummy project (named after
    the last component of ``build_dir``) and returns a dict with the keys
    ``leptonica_src_dir``, ``leptonica_build_dir``, ``libtesseract_src_dir``
    and ``libtesseract_build_dir``.

    Raises ``RuntimeError`` when a package directory is missing from the
    CMake file or does not have the expected layout.
    """
    project_name = os.path.split(build_dir)[-1]
    dummy_build_dir = os.path.join(build_dir, 'build', 'cppan-build-{}'.format(project_name))
    build_number = get_magic_number_for_build_dir(dummy_build_dir)
    _LOGGER.info("dummy_build_dir: {}".format(dummy_build_dir))
    _LOGGER.info("build_number: {}".format(build_number))

    cmakefile_path = os.path.join(dummy_build_dir, build_number, 'cppan', 'CMakeLists.txt')
    with open(cmakefile_path, 'r') as fp:
        cmake_text = fp.read()

    # Source locations as recorded by cppan in the CMake file.
    leptonica_root = get_cache_var(cmake_text, 'pvt_cppan_demo_danbloomberg_leptonica_DIR')
    if not leptonica_root:
        raise RuntimeError('cannot detect leptonica source codes')
    libtesseract_root = get_cache_var(cmake_text, 'pvt_simonflueckiger_tesseract_libtesseract_DIR')
    if not libtesseract_root:
        raise RuntimeError('cannot detect libtesseract source codes')

    # Matching build locations derived from the source paths.
    leptonica_build = get_cppan_build_dir(leptonica_root, build_number)
    if not leptonica_build:
        raise RuntimeError('unexpected leptonica source location directory name')
    libtesseract_build = get_cppan_build_dir(libtesseract_root, build_number)
    if not libtesseract_build:
        raise RuntimeError('unexpected libtesseract source location directory name')

    result = {}
    # leptonica keeps its sources in a "src" sub-folder; libtesseract's
    # source dir is the package root itself.
    result['leptonica_src_dir'] = os.path.join(leptonica_root, 'src')
    result['leptonica_build_dir'] = leptonica_build
    result['libtesseract_src_dir'] = libtesseract_root
    result['libtesseract_build_dir'] = libtesseract_build
    return result
def prepare_tesseract_env(build_dir, generator, tesseract_cppan_version, tesseract_version, leptonica_version, patch_func=None):
    """Build tesseract with cppan and collect what setup.py needs.

    Steps, in order: wipe and recreate ``build_dir``, write ``cppan.yml``,
    generate project files, apply source patches, build the tesseract
    packages, read the version back from the built executable, build the
    dummy project, and copy the leptonica/libtesseract headers into
    ``<build_dir>/include``.

    ``tesseract_version`` is only a fallback for when the executable does not
    report a version.  ``patch_func`` is accepted but not used by this
    function.

    Returns ``(dll_files, config)``: the paths of the ``*.dll`` files found
    in ``<build_dir>/bin`` and a dict of build settings (include dirs,
    library dirs, library names, Cython compile-time environment).
    """
    rule = "----------------------------------------"

    def open_section(title):
        # Title followed by a rule line.
        _LOGGER.info(title)
        _LOGGER.info(rule)

    def close_section(message):
        # Rule line, closing message, then a blank separator.
        _LOGGER.info(rule)
        _LOGGER.info(message)
        _LOGGER.info("")

    # Start from an empty build directory.
    _LOGGER.info("Cleaning build_dir: {}".format(build_dir))
    if os.path.exists(build_dir):
        shutil.rmtree(build_dir)
    os.makedirs(build_dir)

    # cppan configuration file.
    _LOGGER.info("Generating cppan.yml")
    cppan_yaml_path = generate_cppan_config(
        build_dir, generator, tesseract_cppan_version, leptonica_version)
    _LOGGER.debug("create {}".format(cppan_yaml_path))
    _LOGGER.info("")

    open_section("Generating project files")
    generate_project_files(build_dir, tesseract_cppan_version)
    close_section("Generating project files done")

    open_section("Patching...")
    apply_patches(build_dir)
    close_section("Patching done")

    # Builds tesseract.exe.
    open_section("Building packages")
    build_tesseract_exe(build_dir, tesseract_cppan_version)
    close_section("Building packages done")

    open_section("Checking version")
    detected = get_tesseract_version_from_exe(build_dir)
    if not detected:
        _LOGGER.warning('Unable to extract tesseract version from executable! Using env variable TESSERACT_VERSION')
    else:
        # The executable's own version wins over the requested one.
        tesseract_version = detected
        _LOGGER.info("extracted tesseract version from executable: {}".format(detected))
    tesseract_version_number = version_to_int(tesseract_version)
    close_section("Checking version done")

    # Builds dummy.exe.
    open_section("Building objects")
    build_dummy_exe(build_dir)
    close_section("Building objects done")

    # Locate sources and generated files for the header copy below.
    dirs = get_directories_from_cmakefile(build_dir)
    leptonica_src_dir = dirs['leptonica_src_dir']
    leptonica_build_dir = dirs['leptonica_build_dir']
    libtesseract_src_dir = dirs['libtesseract_src_dir']
    libtesseract_build_dir = dirs['libtesseract_build_dir']
    _LOGGER.info("leptonica_src_dir: {}".format(leptonica_src_dir))
    _LOGGER.info("leptonica_build_dir: {}".format(leptonica_build_dir))
    _LOGGER.info("libtesseract_src_dir: {}".format(libtesseract_src_dir))
    _LOGGER.info("libtesseract_build_dir: {}".format(libtesseract_build_dir))

    # Work out the link configuration from what landed in bin/.
    bin_dir = os.path.join(build_dir, 'bin')
    files = sorted(os.listdir(bin_dir))
    _LOGGER.debug("files: {}".format(files))
    libraries = []
    tesseract_dll_files = []
    for entry in files:
        full_path = os.path.join(build_dir, 'bin', entry)
        if not os.path.isfile(full_path):
            continue
        if entry.endswith('.lib'):
            # Library names are passed without the extension.
            libraries.append(os.path.splitext(entry)[0])
        if entry.endswith('.dll'):
            tesseract_dll_files.append(full_path)

    include_root = os.path.join(build_dir, 'include')
    prepare_headers_for_leptonica(
        os.path.join(build_dir, 'include', 'leptonica'),
        leptonica_src_dir,
        leptonica_build_dir)
    prepare_headers_for_libtesseract(
        os.path.join(build_dir, 'include', 'tesseract'),
        libtesseract_src_dir,
        libtesseract_build_dir)

    config = {
        'include_dirs': [include_root],
        'library_dirs': [os.path.join(build_dir, 'bin')],
        'libraries': libraries,
        'cython_compile_time_env': {
            'TESSERACT_VERSION': tesseract_version_number,
        },
    }
    if tesseract_version_number >= 0x040000:
        # tesseract >= 4.00 requires c++11 compiler support
        config['extra_compile_args'] = ['-std=c++11']
    return tesseract_dll_files, config
def get_generator():
    """Return the CMake generator name for the running Python's platform.

    The base name comes from ``GENERATOR_BASE`` (default
    ``Visual Studio 15 2017``).  On ``win-amd64`` the suffix `` Win64`` is
    appended unless ``BUILD_TARGET_32`` is set to a true value.  On any
    platform other than ``win-amd64`` and ``win32`` an error is logged and
    the base name is still returned.
    """
    generator = os.environ.get('GENERATOR_BASE', 'Visual Studio 15 2017')
    platform = get_platform()
    if platform == 'win32':
        _LOGGER.debug('building for Win32')
    elif platform != 'win-amd64':
        _LOGGER.error('platform not supported')
    elif strtobool(os.environ.get('BUILD_TARGET_32', '0')):
        # 64-bit Python, but a 32-bit build was requested explicitly.
        _LOGGER.debug('building for Win32')
    else:
        _LOGGER.debug('building for Win64')
        generator += ' Win64'
    return generator
def main():
    """Build tesseract and write ``configure.json`` next to this script.

    Versions come from ``LEPTONICA_VERSION``, ``TESSERACT_VERSION`` and
    ``CPPAN_TESSERACT_VERSION`` (the last one defaults to the tesseract
    version).  The JSON file holds the DLL paths and the build arguments
    returned by ``prepare_tesseract_env``.
    """
    env = os.environ
    top_dir = os.path.dirname(os.path.abspath(__file__))
    build_dir = os.path.join(top_dir, 'build', 'tesseract_build')
    leptonica_version = env.get('LEPTONICA_VERSION', LEPTONICA_DEFAULT_VERSION)
    tesseract_version = env.get('TESSERACT_VERSION', TESSERACT_DEFAULT_VERSION)
    tesseract_cppan_version = env.get('CPPAN_TESSERACT_VERSION', tesseract_version)
    generator = get_generator()

    rule = "----------------------------------------"
    _LOGGER.info(rule)
    _LOGGER.info("leptonica version: {}".format(leptonica_version))
    _LOGGER.info("tesseract version: {}".format(tesseract_version))
    _LOGGER.info("tesseract cppan version: {}".format(tesseract_cppan_version))
    _LOGGER.info('generator: {}'.format(generator))
    _LOGGER.info(rule)

    dll_files, build_args = prepare_tesseract_env(
        build_dir, generator,
        tesseract_cppan_version=tesseract_cppan_version,
        tesseract_version=tesseract_version,
        leptonica_version=leptonica_version)

    # Persist the result for setup.py.
    obj = {
        'dll_files': dll_files,
        'build_args': build_args,
    }
    output_path = os.path.join(top_dir, "configure.json")
    with open(output_path, "w") as fp:
        json.dump(obj, fp)
    _LOGGER.info("configuration:")
    _LOGGER.info(json.dumps(obj, indent=2))


if __name__ == '__main__':
    main()
import codecs
import os
import re
import shutil
import json
from setuptools import setup, Extension
from Cython.Distutils import build_ext
here = os.path.abspath(os.path.dirname(__file__))


def load_build_config(config_path):
    """Read ``configure.json`` and return ``(dll_files, build_args)``.

    Returns ``([], {})`` when ``config_path`` does not exist.  Library names
    are converted to the native ``str`` type, because distutils rejects
    anything else with "'libraries' must be a list of strings".
    """
    if not os.path.exists(config_path):
        return [], {}
    with open(config_path) as fp:
        obj = json.load(fp)
    dll_files = obj['dll_files']
    args = obj['build_args']
    if "libraries" in args:
        # json yields ``unicode`` on Python 2 (needs encoding to ``str``) but
        # already ``str`` on Python 3, where encoding would produce bytes and
        # trigger the very error this line is meant to avoid.
        args["libraries"] = [
            lib if isinstance(lib, str) else lib.encode('utf-8')
            for lib in args["libraries"]]
    return dll_files, args


# Settings produced by configure.py; both stay empty when it has not run.
config_file = os.path.join(here, "configure.json")
tesseract_dll_files, build_args = load_build_config(config_file)
class BuildTesseract(build_ext):
    """``build_ext`` that applies configure.json settings and ships DLLs.

    Each entry of the module-level ``build_args`` is set as an attribute on
    the command, and the DLLs listed on an extension's ``dlls`` attribute are
    copied next to the built extension module.
    """

    def initialize_options(self):
        build_ext.initialize_options(self)
        # Overlay the values collected by configure.py on the defaults.
        for k, v in build_args.items():
            setattr(self, k, v)

    def build_extension(self, ext):
        """Copy the extension's DLLs beside its output, then build it.

        :raises NotImplementedError: for ``*.dll`` wildcards and for
            relative DLL paths, neither of which is supported.
        """
        if hasattr(ext, 'dlls'):
            dll_dest_dir = os.path.dirname(self.get_ext_fullpath(ext.name))
            # shutil.SameFileError only exists on Python 3.4+; older versions
            # raise its base class shutil.Error for the same situation.
            same_file_error = getattr(shutil, 'SameFileError', shutil.Error)
            for dll_name_pattern in ext.dlls:
                dll_src_dir, dll_name = os.path.split(dll_name_pattern)
                if dll_name == '*.dll':
                    raise NotImplementedError('not implemented')
                if not os.path.isabs(dll_name_pattern):
                    # how to handle relative path???
                    raise NotImplementedError('not implemented')
                # Without the directory, shutil.copy would create a *file*
                # named like the destination directory.
                if not os.path.isdir(dll_dest_dir):
                    os.makedirs(dll_dest_dir)
                try:
                    shutil.copy(dll_name_pattern, dll_dest_dir)
                except same_file_error:
                    # The DLL is already in place.
                    pass
        return build_ext.build_extension(self, ext)
class ExtensionWithDLL(Extension):
    """``Extension`` that also carries a list of DLLs to ship with it.

    The optional ``dlls`` keyword is stored on the instance and removed from
    the keyword arguments before they reach ``Extension.__init__``.
    """

    def __init__(self, name, sources, *args, **kw):
        if "dlls" in kw:
            self.dlls = kw.pop("dlls")
        else:
            self.dlls = []
        Extension.__init__(self, name, sources, *args, **kw)
def read_file(path):
    """Return the contents of ``path``, resolved relative to this script."""
    full_path = os.path.join(here, path)
    with codecs.open(full_path, 'r') as fp:
        text = fp.read()
    return text
# find_version from pip https://github.com/pypa/pip/blob/1.5.6/setup.py#L33
def find_version(path):
    """Return the ``__version__`` string assigned in the file at ``path``.

    Raises ``RuntimeError`` when no ``__version__ = '...'`` assignment is
    found at the start of a line.
    """
    version_re = r"^__version__ = ['\"]([^'\"]*)['\"]"
    found = re.search(version_re, read_file(path), re.MULTILINE)
    if found is not None:
        return found.group(1)
    raise RuntimeError("Unable to find version string.")
# The single extension module, carrying the DLLs that must ship with it.
tesserocr_extension = ExtensionWithDLL(
    "tesserocr._tesserocr",
    sources=["tesserocr.pyx"],
    language="c++",
    dlls=tesseract_dll_files,
)

setup(
    name='tesserocr',
    # Version and long description are read from files next to this script.
    version=find_version('tesserocr.pyx'),
    description='A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython',
    long_description=read_file('README.rst'),
    url='https://github.com/sirfz/tesserocr',
    author='Fayez Zouheiry',
    author_email='iamfayez@gmail.com',
    license='MIT',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'Topic :: Multimedia :: Graphics :: Capture :: Scanners',
        'Topic :: Multimedia :: Graphics :: Graphics Conversion',
        'Topic :: Scientific/Engineering :: Image Recognition',
        'License :: OSI Approved :: MIT License',
        'Operating System :: POSIX',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.2',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: Implementation :: CPython',
        'Programming Language :: Python :: Implementation :: PyPy',
        'Programming Language :: Cython'
    ],
    keywords='Tesseract,tesseract-ocr,OCR,optical character recognition,PIL,Pillow,Cython',
    # Custom build_ext applies configure.json settings and copies the DLLs.
    cmdclass={'build_ext': BuildTesseract},
    ext_modules=[tesserocr_extension],
    packages=['tesserocr'],
    test_suite='tests'
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment