Ggf. Anpassen der Bildschirmauflösung in der virtuellen Maschine
xrandr --output VGA-1 --mode 1280x800
(1280x800 durch gewünschte Bildschirmauflösung ersetzen)
Installation des Texteditors Sublime Text
import click | |
import json | |
import requests | |
import os | |
from tqdm import tqdm | |
def get_text(obj, sub_entries=None): | |
if sub_entries is None: |
convert -density 300 -depth 8 -alpha Off -limit area 1 foo.pdf foo_%04d.tif |
const ocrd = { | |
default: [ | |
"\uF1AC \u00AD \u00AC \u00BD \u00C0 \u00C3 \u00C4 \u00C6 \u00E0 \u00E3 \u00E4 \u00E6 \u0101 \u023A \u2C65 \uE42C", | |
"\uEFA1 \uF500 \uF532 \u0253 \uF524 \u00C7 \u00E7 \u0107 \uEEC4 \uEEC5 \uF501 \uF502 \uF517 \uF520 \uF522 \uF531", | |
"\uF50A \uF51B \u00C8 \u00C9 \u00CB \u00E8 \u00E9 \u00EB \u0113 \u0118 \u0119 \u0256 \u0247 \u1EBD \u204A \uE4E1", | |
"\uF158 \uF219 \uF515 \uFB00 \uFB01 \uFB02 \uFB03 \uA7A0 \uA7A1 \uF504 \uF505 \uF506 \uF521 \uF525 \u00CD \u00ED", | |
"\u00EF \u0129 \u012B \u0133 \uA76D \uF220 \uF533 \uEBE3 \uA742 \uA743 \uA7A2 \uA7A3 \u0141 \u0142 \uF4F9 \uF50B", | |
"\uE5B8 \uF519 \u00D1 \u00F1 \uA7A4 \uA7A5 \uE1DC \uE5DC \u00D2 \u00D5 \u00D6 \u00D8 \u00F2 \u00F5 \u00F6 {shift}" | |
], | |
shift: [ |
:Start
@Echo off
REM Batch OCR: run Tesseract on every image matching _SourcePath and write
REM one result file per image (named after the image) into _OutputPath.
REM Adjust the five settings below; enter the values WITHOUT surrounding
REM quotes - the loop adds the quotes so that paths with spaces work.
Set _SourcePath=C:\path\to\images\*.tif
Set _OutputPath=C:\path\to\output\
Set _Tesseract=C:\path\to\tesseract\tesseract.exe
Set _TesseractLang=lang
Set _TesseractOutputFormat=alto
:Convert
REM Inside a batch file the FOR variable must be written %%A (not &&A).
REM %%~nA expands to the image's file name without path and extension and is
REM passed to Tesseract as the output base name.
For %%A in ("%_SourcePath%") Do (
    Echo Processing %%A...
    "%_Tesseract%" -l %_TesseractLang% "%%A" "%_OutputPath%\%%~nA" %_TesseractOutputFormat%
)
:End
C
Python
Python
Python
Python
Python
Python
C++
C
#!/usr/bin/env python | |
# Copyright 2017 Google Inc. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# |
Programmatic access to the digitised collections and digitised newspapers of the Staatsbibliothek zu Berlin - Preußischer Kulturbesitz (SBB) is currently possible via two distinct APIs.
Retrieval of metadata for objects in the digitised collections is provided via the Open Archives Initiative Protocol for Metadata Harvesting (OAI-PMH) standard. A wide range of OAI-PMH client applications in numerous programming languages is freely available on the web.
The base URL for the OAI-PMH endpoint of the digitised collections of the SBB is
REM Walk the current directory tree and transform every .hocr file with the
REM hocr2text.xsl stylesheet using saxon9he.jar.
REM %%~nG is the file name without path and extension, so each resulting .txt
REM is written relative to the current working directory, not next to its source.
FOR /R %%G IN (*.hocr) DO java -jar saxon9he.jar -s:"%%G" -xsl:hocr2text.xsl -o:"%%~nG.txt"