# Prefix the value of the imageFilename attribute with "page/" in every XML
# file under page/ (first occurrence per line, edited in place).
# The optional \(page/\)\{0,1\} group consumes a prefix that is already
# there, so running this a second time does not produce "page/page/...".
sed -i 's,imageFilename="\(page/\)\{0,1\},imageFilename="page/,' page/*.xml
# Run the ocrd CLI's "workspace init" subcommand in the current directory.
ocrd workspace init
# tei2textpages.py
import re
import sys

import lxml.etree as ET

# Path of the TEI file to process, taken from the first command-line argument.
tei_file = sys.argv[1]

# Decode explicitly as UTF-8: the declaration removed below names UTF-8, and
# the platform default encoding (e.g. cp1252 on Windows) would garble
# non-ASCII text.
with open(tei_file, 'r', encoding='utf-8') as f:
    tei_xml = f.read()

# Remove the XML declaration (only this exact spelling is matched).
# NOTE(review): presumably needed because the text is parsed from a str
# further down - confirm against the rest of the script.
tei_xml = re.sub(r'<\?xml version="1.0" encoding="UTF-8"\?>', '', tei_xml)
#!/bin/bash

# Run the Windows ocrd.exe and strip carriage returns from its output.
#
# Arguments: all arguments are forwarded unchanged to ocrd.exe.
# Output:    ocrd.exe's stdout with every '\r' removed.
# Returns:   the exit status of ocrd.exe.
ocrd () {
    "C:/Users/kb/AppData/Local/Programs/Python/Python39/Scripts/ocrd.exe" "$@" | tr -d '\r'
    # A pipeline reports the status of its last command (tr), which would
    # hide a failing ocrd.exe; hand back the first command's status instead.
    return "${PIPESTATUS[0]}"
}
layouteval () { | |
set -x | |
"/cygdrive/c/Users/kb/Downloads/LayoutEvalCmd_1-9-106/LayoutEvalCmd 1.9/layoutevalcmd-1-9-106.exe" \ | |
-printWarnings \ | |
-csvValues \ |
#!/bin/bash
# Abort as soon as a command exits with a non-zero status.
set -e

# Initial values of the script's settings.
ADD_TO_WORKSPACE=true
INPUT_FILE_GROUP=OCR-D-OCR-TESS
OUTPUT_FILE_GROUP=TSV
# Starts out as the directory the script is invoked from.
DIRECTORY=$PWD
# Starts out empty.
# NOTE(review): presumably set later in the script - not visible here.
PPN=
#!/usr/bin/env bash

# Reset environment variables that could interfere with normal usage.
unset GREP_OPTIONS

# Put all utility functions here.
# Make a temporary file.
#
# The mktemp template is the basename of the running script ($0) followed
# by ".XXXXXXX". Whatever mktemp prints (the created path) goes to stdout.
git_extra_mktemp() {
    mktemp -t "$(basename "$0")".XXXXXXX
}
time ocrd process --overwrite "tesserocr-segment -P find_tables true -I OCR-D-IMG -O TESS" -g PHYS_0020 | |
12:45:37.784 INFO ocrd.task_sequence.run_tasks - Start processing task 'tesserocr-segment -I OCR-D-IMG -O TESS -p '{"find_tables": true, "dpi": 0, "padding": 4, "shrink_polygons": false, "block_polygons": false, "find_staves": false, "sparse_text": false}'' | |
12:45:38.758 INFO processor.TesserocrSegment - INPUT FILE 0 / PHYS_0020 | |
12:45:38.835 INFO processor.TesserocrSegment - Page 'PHYS_0020' images will use 300 DPI from image meta-data | |
12:45:38.835 INFO processor.TesserocrSegment - Processing page 'PHYS_0020' | |
libpng warning: iCCP: profile 'ICC Profile': 'desc': ICC profile tag start not a multiple of 4 | |
libpng warning: iCCP: profile 'ICC Profile': 'rXYZ': ICC profile tag start not a multiple of 4 | |
libpng warning: iCCP: profile 'ICC Profile': 'gXYZ': ICC profile tag start not a multiple of 4 | |
libpng warning: iCCP: profile 'ICC Profile': 'bXYZ': ICC profile tag start not a multiple of 4 | |
libpng warning: iCCP: prof |
<?xml version="1.0" encoding="UTF-8"?> | |
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xsi:schemaLocation="info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-0.xsd http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-6.xsd http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version17/mets.v1-7.xsd http://www.loc.gov/mix/v10 http://www.loc.gov/standards/mix/mix10/mix10.xsd"> | |
<mets:metsHdr CREATEDATE="2017-11-30T16:18:26"> | |
<mets:agent OTHERTYPE="SOFTWARE" ROLE="CREATOR" TYPE="OTHER"> | |
<mets:name>DFG-Koordinierungsprojekt zur Weiterentwicklung von Verfahren der Optical Character Recognition (OCR-D)</mets:name> | |
<mets:note>OCR-D</mets:note> | |
</mets:agent> | |
<mets:agent TYPE="OTHER" OTHERTYPE="SOFTWARE" ROLE="OTHER" OTHERROLE="preprocessing/optimization/binarization"> | |
<mets:name>ocrd-cis-ocropy-binarize v0.1.5</mets:name> | |
<mets:note xmlns: |