|
import logging |
|
import sys |
|
import os |
|
import re |
|
import shutil |
|
import subprocess |
|
import codecs |
|
import json |
|
|
|
from distutils.util import strtobool |
|
from distutils.util import get_platform |
|
|
|
import yaml |
|
|
|
# Root logger used by the whole script. The DEBUG environment variable
# (parsed with strtobool, default '0') selects DEBUG verbosity, otherwise
# INFO is used; records are written to stderr.
_LOGGER = logging.getLogger()
_LOGGER.setLevel(
    logging.DEBUG if strtobool(os.environ.get('DEBUG', '0')) else logging.INFO)
_LOGGER.addHandler(logging.StreamHandler(sys.stderr))
|
|
|
# Fallback versions: main() uses these when the LEPTONICA_VERSION /
# TESSERACT_VERSION environment variables are not set.
LEPTONICA_DEFAULT_VERSION = '1.74.4'

TESSERACT_DEFAULT_VERSION = '3.5.2'
|
|
|
|
|
def version_to_int(version):
    """Convert a version string into a hex-coded integer.

    The first three numeric groups of the version are each rendered as (at
    least) two decimal digits, and the concatenated digit string is then read
    as a hexadecimal number, e.g. ``'3.5.2'`` -> ``0x030502`` and
    ``'4.0.0'`` -> ``0x040000``.

    Extra groups are ignored (``'1.2.3.4'`` -> ``0x010203``) and missing
    groups are treated as zero (``'4.0'`` and ``'4'`` -> ``0x040000``).
    Surrounding text such as ``'tesseract 3.05.01'`` is tolerated.

    :param version: string containing a version number
    :return: the integer described above
    :raises ValueError: if ``version`` contains no digits at all
    """
    # Prefer a dotted version; only when none is present fall back to a bare
    # number so that a version with both minor groups dropped still works.
    match = re.search(r'((?:\d+\.)+\d+)', version)
    if match is None:
        match = re.search(r'\d+', version)
    if match is None:
        raise ValueError('no version number found in {!r}'.format(version))

    # Pad with zero groups, then keep exactly major, minor and patch.
    groups = (match.group().split('.') + ['0', '0'])[:3]
    digits = "{:02}{:02}{:02}".format(*(int(group) for group in groups))

    return int(digits, 16)
|
|
|
|
|
def generate_cppan_config(build_dir, generator, tesseract_cppan_version, leptonica_version):
    """Write the ``cppan.yml`` configuration file into ``build_dir``.

    The file declares a ``dummy`` project (built from ``dummy.cpp``) that
    depends on leptonica, libtesseract and the tesseract executable package.

    :param build_dir: existing directory that receives ``cppan.yml``
    :param generator: value written to the ``generator`` setting
    :param tesseract_cppan_version: version used for both tesseract packages
    :param leptonica_version: version used for the leptonica package
    :return: path of the written ``cppan.yml``
    """
    # NOTE(review): the nesting of this YAML document is reconstructed from
    # the key hierarchy (two spaces per level) -- confirm against the
    # original template.
    template = """\
local_settings:
  cppan_dir: cppan
  build_dir_type: local
  build_dir: build
  build:
    generator: {generator}

projects:
  dummy:
    files: dummy.cpp
    dependencies:
      pvt.cppan.demo.danbloomberg.leptonica: {leptonica_version}
      pvt.simonflueckiger.tesseract.libtesseract: {tesseract_cppan_version}
      pvt.simonflueckiger.tesseract.tesseract: {tesseract_cppan_version}
"""

    rendered = template.format(
        generator=generator,
        tesseract_cppan_version=tesseract_cppan_version,
        leptonica_version=leptonica_version)

    cppan_yaml_path = os.path.join(build_dir, 'cppan.yml')
    with open(cppan_yaml_path, 'w') as out:
        out.write(rendered)

    return cppan_yaml_path
|
|
|
|
|
def generate_project_files(build_dir, tesseract_cppan_version):
    """Run ``cppan --generate`` for the tesseract package inside ``build_dir``.

    :param build_dir: working directory for the cppan invocation
    :param tesseract_cppan_version: version suffix of the cppan package name
    :raises subprocess.CalledProcessError: if cppan exits with a non-zero code
    """
    cmd = [
        'cppan',
        '--generate',
        'pvt.simonflueckiger.tesseract.tesseract-{}'.format(tesseract_cppan_version),
    ]
    _LOGGER.debug("cmd: {}".format(cmd))
    subprocess.check_call(cmd, cwd=build_dir)
|
|
|
|
|
def build_tesseract_exe(build_dir, tesseract_cppan_version):
    """Run ``cppan --build-packages`` for the tesseract package.

    :param build_dir: working directory for the cppan invocation
    :param tesseract_cppan_version: version suffix of the cppan package name
    :raises subprocess.CalledProcessError: if cppan exits with a non-zero code
    """
    cmd = [
        'cppan',
        '--build-packages',
        'pvt.simonflueckiger.tesseract.tesseract-{}'.format(tesseract_cppan_version),
    ]
    _LOGGER.debug("cmd: {}".format(cmd))
    subprocess.check_call(cmd, cwd=build_dir)
|
|
|
|
|
def get_tesseract_version_from_exe(build_dir):
    """Ask the freshly built ``tesseract.exe`` for its version.

    Runs ``<build_dir>/bin/tesseract.exe -v`` and extracts the first
    ``tesseract X.Y.Z`` occurrence from what it prints.

    :param build_dir: build directory containing ``bin/tesseract.exe``
    :return: the ``X.Y.Z`` version string, or ``None`` if none was printed
    :raises subprocess.CalledProcessError: if the executable exits non-zero
    """
    bin_dir = os.path.join(build_dir, 'bin')
    cmd = [os.path.join(bin_dir, 'tesseract.exe'), '-v']
    _LOGGER.debug("cmd: {}".format(cmd))

    # Merge stderr into the captured output so the banner is found whichever
    # stream it is written to.
    # NOTE(review): tesseract 3.x is believed to print its version to stderr
    # -- confirm.
    output = subprocess.check_output(cmd, cwd=bin_dir, stderr=subprocess.STDOUT)

    # On Python 3 check_output returns bytes, which a str regex pattern
    # cannot search; decode first (on Python 2 the output is already ``str``).
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')

    _LOGGER.debug("version string: {}".format(output))

    m = re.search(r"tesseract ([0-9]+\.[0-9]+\.[0-9]+)", output)
    if m is None:
        return None
    return m.group(1)
|
|
|
|
|
def build_dummy_exe(build_dir):
    """Create a trivial ``dummy.cpp`` in ``build_dir`` and build it with cppan.

    :param build_dir: directory that receives ``dummy.cpp`` and is used as
        the working directory for ``cppan --build .``
    :raises subprocess.CalledProcessError: if cppan exits with a non-zero code
    """
    # the dummy program only exists to give cppan something to build
    dummy_source = os.path.join(build_dir, 'dummy.cpp')
    with open(dummy_source, 'w') as out:
        out.write('int main(int argc, char *argv[]) { return 0; }\n')

    build_cmd = ['cppan', '--build', '.']
    _LOGGER.debug("cmd: {}".format(build_cmd))
    subprocess.check_call(build_cmd, cwd=build_dir)
|
|
|
|
|
def get_magic_number_for_build_dir(root_dir):
    """Return the name of the single sub-directory of ``root_dir``.

    The name is the "magic number": some kind of hash key that should be
    related to the architecture, compiler selection, debug/release, etc.

    :param root_dir: directory expected to contain exactly one sub-directory
        (plain files are ignored)
    :return: name (not path) of that sub-directory
    :raises RuntimeError: if there is no sub-directory or more than one
    """
    subdirs = sorted(
        name for name in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, name)))

    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O``, which would turn a broken layout into an IndexError or
    # a silently arbitrary choice.
    if len(subdirs) != 1:
        raise RuntimeError(
            'expected exactly one sub-directory in {}, found {}: {}'.format(
                root_dir, len(subdirs), subdirs))

    return subdirs[0]
|
|
|
|
|
def get_cache_var(contents, key):
    """Look up the value of a ``set_cache_var(<key> <value>)`` entry.

    :param contents: text to search (the callers pass a CMakeLists.txt body)
    :param key: variable name; matched literally
    :return: the text between the key and the first following ``)``, or
        ``None`` if the key is not present
    """
    # Escape the key so that regex metacharacters in it (e.g. ".") cannot
    # match unrelated variable names.
    pattern = r"set_cache_var\({} (.+?)\)".format(re.escape(key))
    m = re.search(pattern, contents)
    if not m:
        return None
    return m.group(1)
|
|
|
|
|
class Patcher(object):
    """Base class for optional patches applied to the downloaded sources.

    Subclasses override :meth:`enabled` to opt in and :meth:`_apply` to do
    the actual work; callers use :meth:`apply`.
    """

    @classmethod
    def enabled(cls):
        """Return whether this patch should run; the base class never does."""
        return False

    def _apply(self):
        """Perform the patch. Must be overridden by subclasses."""
        raise NotImplementedError

    def apply(self):
        """Run the patch if it is enabled.

        :return: ``False`` when the patcher is disabled; otherwise the value
            returned by :meth:`_apply`, with ``None`` reported as ``True``
        """
        if self.enabled():
            outcome = self._apply()
            if outcome is None:
                return True
            return outcome
        return False
|
|
|
|
|
class Utf8Patcher(Patcher):
    """Prepend a UTF-8 byte order mark to ``ccmain/equationdetect.cpp``.

    Enabled only when the ``UTF8_PATCH`` environment variable is truthy
    (as parsed by ``strtobool``; defaults to ``'0'``).
    """

    def __init__(self, source_path):
        """Remember ``source_path``, the directory containing ``ccmain/``."""
        self.source_path = source_path

    @classmethod
    def enabled(cls):
        """Return ``True`` when ``UTF8_PATCH`` is set to a truthy value."""
        flag = os.environ.get('UTF8_PATCH', '0')
        return bool(strtobool(flag))

    def _apply(self):
        """Add the BOM if it is missing.

        :return: ``True`` if the file was rewritten, ``False`` if it already
            started with the BOM
        """
        target = os.path.join(self.source_path, "ccmain/equationdetect.cpp")
        with open(target, "r+b") as fp:
            original = fp.read()
            if not original.startswith(codecs.BOM_UTF8):
                # rewrite from the start: BOM first, then the old contents
                fp.seek(0)
                fp.write(codecs.BOM_UTF8)
                fp.write(original)
                return True
        return False
|
|
|
|
|
def apply_patches(build_dir):
    """Apply all enabled source patches to the libtesseract sources.

    The libtesseract source directory is read from the
    ``pvt_simonflueckiger_tesseract_libtesseract_DIR`` cache variable in the
    generated ``CMakeLists.txt`` of the tesseract cppan build. If any patch
    changed something, the ``lnk`` and ``obj`` folders under
    ``~/.cppan/storage`` are deleted.

    :param build_dir: top-level build directory
    :return: ``True`` if at least one patcher reported a change
    :raises RuntimeError: if the libtesseract source directory cannot be found
    """
    tesseract_build_dir = os.path.join(build_dir, 'build', 'cppan-build-tesseract')
    build_number = get_magic_number_for_build_dir(tesseract_build_dir)

    cmakefile_path = os.path.join(
        tesseract_build_dir, build_number,
        'cppan', 'CMakeLists.txt')
    with open(cmakefile_path, 'r') as fp:
        contents = fp.read()

    libtesseract_dir = get_cache_var(contents, 'pvt_simonflueckiger_tesseract_libtesseract_DIR')
    if not libtesseract_dir:
        raise RuntimeError('cannot detect libtesseract source codes')

    _LOGGER.info("libtesseract_dir: {}".format(libtesseract_dir))

    patchers = [
        Utf8Patcher(libtesseract_dir),
    ]

    dirty_bit = False
    for patcher in patchers:
        if patcher.apply():
            dirty_bit = True

    if dirty_bit:
        # delete the lnk and obj folders
        for folder in ("~/.cppan/storage/lnk", "~/.cppan/storage/obj"):
            cache_dir = os.path.expanduser(folder)
            # A folder that does not exist is already "deleted"; rmtree
            # would raise on it and abort the whole build.
            if not os.path.isdir(cache_dir):
                _LOGGER.debug("nothing to delete at {}".format(cache_dir))
                continue
            _LOGGER.info("deleting {}".format(cache_dir))
            shutil.rmtree(cache_dir)

    return dirty_bit
|
|
|
|
|
def prepare_headers_for_leptonica(dest_dir, src_dir, build_dir):
    """Copy the leptonica ``.h`` files into a newly created ``dest_dir``.

    Headers are taken from the top level of ``src_dir`` and then from the top
    level of ``build_dir`` (generated headers); sub-directories are not
    searched. Because the build headers are copied last, they overwrite a
    source header of the same name.

    :param dest_dir: directory to create (``os.makedirs`` raises if it
        already exists) and fill
    :param src_dir: leptonica source directory
    :param build_dir: leptonica build directory holding generated headers
    """
    headers = []
    for folder in (src_dir, build_dir):
        for entry in sorted(os.listdir(folder)):
            candidate = os.path.join(folder, entry)
            if os.path.isfile(candidate) and entry.endswith('.h'):
                headers.append(candidate)

    os.makedirs(dest_dir)
    for header in headers:
        shutil.copy(header, dest_dir)
|
|
|
|
|
def prepare_headers_for_libtesseract(dest_dir, src_dir, build_dir):
    """Copy the libtesseract ``.h`` files into a newly created ``dest_dir``.

    The public include directories are read from
    ``include_directories.public`` in ``<src_dir>/cppan.yml``; the ``.h``
    files at the top level of each one are copied, followed by the generated
    headers at the top level of ``build_dir``. Finally the copied ``host.h``
    is edited to comment out ``#include <cstdint>``.

    :param dest_dir: directory to create (``os.makedirs`` raises if it
        already exists) and fill
    :param src_dir: libtesseract source directory containing ``cppan.yml``
    :param build_dir: libtesseract build directory holding generated headers
    """
    with open(os.path.join(src_dir, 'cppan.yml'), 'r') as fp:
        cppan_cfg = yaml.safe_load(fp)

    # headers from the libtesseract source tree first ...
    search_dirs = [
        os.path.normpath(os.path.join(src_dir, public_dir))
        for public_dir in cppan_cfg['include_directories']['public']
    ]
    # ... then the generated header files
    search_dirs.append(build_dir)

    headers = []
    for folder in search_dirs:
        for entry in sorted(os.listdir(folder)):
            candidate = os.path.join(folder, entry)
            if os.path.isfile(candidate) and entry.endswith('.h'):
                headers.append(candidate)

    os.makedirs(dest_dir)
    for header in headers:
        shutil.copy(header, dest_dir)

    # need to patch the host.h
    with open(os.path.join(dest_dir, 'host.h'), 'r+') as fp:
        patched = fp.read().replace('#include <cstdint>', '// #include <cstdint>')
        fp.truncate(0)
        fp.seek(0)
        fp.write(patched)
|
|
|
|
|
def get_cppan_build_dir(src_dir, build_number):
    """Derive the cppan build directory that belongs to a source directory.

    ``src_dir`` must end in three hexadecimal path components of 2, 2 and 4
    digits (``.../ab/cd/ef12``); joined together they form the package hash.
    The result is ``src_dir`` with its ``/src/`` component swapped for
    ``/obj/``, followed by ``build/<build_number>/cppan/<package_hash>``.

    :param src_dir: forward-slash separated package source directory
    :param build_number: name of the build configuration directory
    :return: the build directory path, or ``None`` if ``src_dir`` does not
        end in the expected hash components
    """
    m = re.search(r'/[0-9a-fA-F]{2}/[0-9a-fA-F]{2}/[0-9a-fA-F]{4}$', src_dir)
    if not m:
        return None
    package_hash = m.group(0).replace('/', '')

    # Swap the *last* "/src/" component: it is the one directly in front of
    # the hash directories. Replacing the first occurrence would rewrite an
    # unrelated parent directory that happens to be called "src".
    head, sep, tail = src_dir.rpartition('/src/')
    obj_dir = head + '/obj/' + tail if sep else src_dir

    return os.path.join(
        obj_dir,
        'build', build_number,
        'cppan', package_hash)
|
|
|
|
|
def get_directories_from_cmakefile(build_dir):
    """Locate the leptonica and libtesseract source and build directories.

    The directories are read from the cache variables in the generated
    ``CMakeLists.txt`` of the dummy project build, which lives under
    ``<build_dir>/build/cppan-build-<last component of build_dir>``.

    :param build_dir: top-level build directory
    :return: dict with the keys ``leptonica_src_dir``, ``leptonica_build_dir``,
        ``libtesseract_src_dir`` and ``libtesseract_build_dir``
    :raises RuntimeError: if a cache variable is missing or a source
        directory does not have the expected layout
    """
    project_name = os.path.split(build_dir)[-1]
    dummy_build_dir = os.path.join(
        build_dir, 'build', 'cppan-build-{}'.format(project_name))
    build_number = get_magic_number_for_build_dir(dummy_build_dir)

    _LOGGER.info("dummy_build_dir: {}".format(dummy_build_dir))
    _LOGGER.info("build_number: {}".format(build_number))

    with open(os.path.join(dummy_build_dir, build_number, 'cppan', 'CMakeLists.txt'), 'r') as fp:
        cmake_text = fp.read()

    # top-level source directories, as recorded by cppan
    leptonica_top_dir = get_cache_var(cmake_text, 'pvt_cppan_demo_danbloomberg_leptonica_DIR')
    if not leptonica_top_dir:
        raise RuntimeError('cannot detect leptonica source codes')

    libtesseract_top_dir = get_cache_var(cmake_text, 'pvt_simonflueckiger_tesseract_libtesseract_DIR')
    if not libtesseract_top_dir:
        raise RuntimeError('cannot detect libtesseract source codes')

    # matching build directories
    leptonica_build_dir = get_cppan_build_dir(leptonica_top_dir, build_number)
    if not leptonica_build_dir:
        raise RuntimeError('unexpected leptonica source location directory name')

    libtesseract_build_dir = get_cppan_build_dir(libtesseract_top_dir, build_number)
    if not libtesseract_build_dir:
        raise RuntimeError('unexpected libtesseract source location directory name')

    # leptonica headers live in a "src" sub-directory, libtesseract's do not
    return dict(
        leptonica_src_dir=os.path.join(leptonica_top_dir, 'src'),
        leptonica_build_dir=leptonica_build_dir,
        libtesseract_src_dir=libtesseract_top_dir,
        libtesseract_build_dir=libtesseract_build_dir,
    )
|
|
|
|
|
def prepare_tesseract_env(build_dir, generator, tesseract_cppan_version, tesseract_version, leptonica_version, patch_func=None):
    """Build tesseract with cppan and collect the resulting build settings.

    Steps, in order: recreate ``build_dir`` from scratch, write ``cppan.yml``,
    generate the project files, apply the source patches, build the tesseract
    package, query the executable for its version, build the dummy project,
    and copy the leptonica / libtesseract headers to ``<build_dir>/include``.

    :param build_dir: build directory; any existing content is removed
    :param generator: generator name written to ``cppan.yml``
    :param tesseract_cppan_version: cppan package version of tesseract
    :param tesseract_version: version to assume when the executable does not
        report one
    :param leptonica_version: cppan package version of leptonica
    :param patch_func: accepted but not used by this function
    :return: tuple ``(tesseract_dll_files, config)``: the paths of the
        ``.dll`` files in ``<build_dir>/bin`` and a dict with the keys
        ``include_dirs``, ``library_dirs``, ``libraries``,
        ``cython_compile_time_env`` and, for tesseract >= 4.00,
        ``extra_compile_args``
    """
    rule = "----------------------------------------"

    def begin_section(title):
        # section header: title followed by a rule
        _LOGGER.info(title)
        _LOGGER.info(rule)

    def end_section(title):
        # section footer: rule, title, then an empty line
        _LOGGER.info(rule)
        _LOGGER.info(title)
        _LOGGER.info("")

    _LOGGER.info("Cleaning build_dir: {}".format(build_dir))

    # start from an empty build directory
    if os.path.exists(build_dir):
        shutil.rmtree(build_dir)
    os.makedirs(build_dir)

    # create cppan.yml cppan configuration file
    _LOGGER.info("Generating cppan.yml")
    cppan_yaml_path = generate_cppan_config(
        build_dir, generator, tesseract_cppan_version, leptonica_version)
    _LOGGER.debug("create {}".format(cppan_yaml_path))

    _LOGGER.info("")
    begin_section("Generating project files")
    generate_project_files(build_dir, tesseract_cppan_version)
    end_section("Generating project files done")

    begin_section("Patching...")
    apply_patches(build_dir)
    end_section("Patching done")

    # build tesseract.exe
    begin_section("Building packages")
    build_tesseract_exe(build_dir, tesseract_cppan_version)
    end_section("Building packages done")

    begin_section("Checking version")
    detected_version = get_tesseract_version_from_exe(build_dir)
    if not detected_version:
        _LOGGER.warning('Unable to extract tesseract version from executable! Using env variable TESSERACT_VERSION')
    else:
        # the version reported by the executable takes precedence
        tesseract_version = detected_version
        _LOGGER.info("extracted tesseract version from executable: {}".format(detected_version))
    tesseract_version_number = version_to_int(tesseract_version)
    end_section("Checking version done")

    # build dummy.exe
    begin_section("Building objects")
    build_dummy_exe(build_dir)
    end_section("Building objects done")

    # prepare header files
    dirs = get_directories_from_cmakefile(build_dir)
    leptonica_src_dir = dirs['leptonica_src_dir']
    leptonica_build_dir = dirs['leptonica_build_dir']
    libtesseract_src_dir = dirs['libtesseract_src_dir']
    libtesseract_build_dir = dirs['libtesseract_build_dir']

    _LOGGER.info("leptonica_src_dir: {}".format(leptonica_src_dir))
    _LOGGER.info("leptonica_build_dir: {}".format(leptonica_build_dir))
    _LOGGER.info("libtesseract_src_dir: {}".format(libtesseract_src_dir))
    _LOGGER.info("libtesseract_build_dir: {}".format(libtesseract_build_dir))

    # figure out our configuration
    bin_dir = os.path.join(build_dir, 'bin')
    files = sorted(os.listdir(bin_dir))
    _LOGGER.debug("files: {}".format(files))

    regular_files = [
        (name, path)
        for name, path in ((name, os.path.join(bin_dir, name)) for name in files)
        if os.path.isfile(path)
    ]
    # import libraries are referenced by stem, DLLs by full path
    libraries = [
        os.path.splitext(name)[0]
        for name, _ in regular_files if name.endswith('.lib')
    ]
    tesseract_dll_files = [
        path for name, path in regular_files if name.endswith('.dll')
    ]

    include_root = os.path.join(build_dir, 'include')
    prepare_headers_for_leptonica(
        os.path.join(include_root, 'leptonica'),
        leptonica_src_dir,
        leptonica_build_dir)
    prepare_headers_for_libtesseract(
        os.path.join(include_root, 'tesseract'),
        libtesseract_src_dir,
        libtesseract_build_dir)

    config = {
        'include_dirs': [include_root],
        'library_dirs': [bin_dir],
        'libraries': libraries,
        'cython_compile_time_env': {
            'TESSERACT_VERSION': tesseract_version_number,
        },
    }

    if tesseract_version_number >= 0x040000:
        # tesseract >= 4.00 requires c++11 compiler support
        config['extra_compile_args'] = ['-std=c++11']

    return tesseract_dll_files, config
|
|
|
|
|
def get_generator():
    """Return the generator name for the current platform.

    The base name comes from the ``GENERATOR_BASE`` environment variable
    (default ``'Visual Studio 15 2017'``). On ``win-amd64`` the suffix
    ``' Win64'`` is appended unless ``BUILD_TARGET_32`` is truthy. On any
    platform other than ``win-amd64`` / ``win32`` an error is logged and the
    base name is returned unchanged.
    """
    generator = os.environ.get('GENERATOR_BASE', 'Visual Studio 15 2017')
    platform = get_platform()

    if platform == 'win32':
        _LOGGER.debug('building for Win32')
    elif platform != 'win-amd64':
        _LOGGER.error('platform not supported')
    elif strtobool(os.environ.get('BUILD_TARGET_32', '0')):
        # 64-bit host explicitly asked to produce a 32-bit build
        _LOGGER.debug('building for Win32')
    else:
        _LOGGER.debug('building for Win64')
        generator += ' Win64'

    return generator
|
|
|
|
|
def main():
    """Build tesseract next to this script and write ``configure.json``.

    Versions are taken from the ``LEPTONICA_VERSION``, ``TESSERACT_VERSION``
    and ``CPPAN_TESSERACT_VERSION`` environment variables; the last one
    defaults to the tesseract version. The build runs in
    ``build/tesseract_build`` below this file's directory, and the resulting
    DLL list and build arguments are dumped as JSON to ``configure.json`` in
    this file's directory.
    """
    rule = "----------------------------------------"

    top_dir = os.path.dirname(os.path.abspath(__file__))
    build_dir = os.path.join(top_dir, 'build', 'tesseract_build')

    env = os.environ
    leptonica_version = env.get('LEPTONICA_VERSION', LEPTONICA_DEFAULT_VERSION)
    tesseract_version = env.get('TESSERACT_VERSION', TESSERACT_DEFAULT_VERSION)
    tesseract_cppan_version = env.get('CPPAN_TESSERACT_VERSION', tesseract_version)
    generator = get_generator()

    _LOGGER.info(rule)
    _LOGGER.info("leptonica version: {}".format(leptonica_version))
    _LOGGER.info("tesseract version: {}".format(tesseract_version))
    _LOGGER.info("tesseract cppan version: {}".format(tesseract_cppan_version))
    _LOGGER.info('generator: {}'.format(generator))
    _LOGGER.info(rule)

    dll_files, build_args = prepare_tesseract_env(
        build_dir, generator,
        tesseract_cppan_version=tesseract_cppan_version,
        tesseract_version=tesseract_version,
        leptonica_version=leptonica_version)

    obj = {'dll_files': dll_files, 'build_args': build_args}
    configure_path = os.path.join(top_dir, "configure.json")
    with open(configure_path, "w") as fp:
        json.dump(obj, fp)

    _LOGGER.info("configuration:")
    _LOGGER.info(json.dumps(obj, indent=2))
|
|
|
|
|
# Run the build only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()