This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Directory that holds the OCR text files to be searched.
val ocrDir = file("/data/texts/ocr")

// Entries of `ocrDir` whose `extL` equals "txt" (presumably the lower-cased
// file extension -- confirm against the file helper library in use).
// Declared as a `def`, so the directory is listed anew on every call.
def all = ocrDir.children(_.extL == "txt")
// all.size
/** Searches all OCR text files (see `all`) for the given words and prints one
  * keyword-in-context line per hit, followed by the file's `base` name.
  *
  * The search is case-insensitive: both the file contents and the search words
  * are lower-cased, and the printed context is the lower-cased text. Every line
  * is laid out so that the hit starts in column `contentsMaxChars`; the context
  * is cut at the first full stop found before the hit and at the first full
  * stop found after the start of the hit. Line feeds and characters outside
  * the accepted set are replaced by spaces.
  *
  * The work runs on a newly started background thread; this method returns
  * immediately. The thread prints "Done." when all files have been processed.
  *
  * @param words            the words (or word fragments) to look for; empty strings are ignored
  * @param contentsMaxChars number of context characters shown on either side of the hit start;
  *                         the context part of each line is `2 * contentsMaxChars + word.length + 1`
  *                         characters wide
  * @param maxFileName      maximum number of characters of the file's `base` name to print
  */
def findOCR2(words: List[String], contentsMaxChars: Int = 72, maxFileName: Int = 72): Unit = {
  // Non-ASCII characters (plus the apostrophe) that are kept verbatim in the output.
  val keptChars = "äöüßÄÖÜáàéèíìóòúù'ÁÀÉÈÍÌÓÒÚÙ"

  // The contents are searched in lower case, so the needles must be lower case too.
  // An empty needle would match at every index without ever advancing, so drop it.
  val needles = words.map(_.toLowerCase).filter(_.nonEmpty)

  // Renders the hit of `word` found at index `pos` of `textL` as a fixed-width context string.
  def contextLine(textL: String, word: String, pos: Int): String = {
    val from   = math.max(0, pos - contentsMaxChars)
    val until  = math.min(textL.length, pos + math.max(word.length, contentsMaxChars))
    // Replacing a line feed by a space keeps all character positions intact.
    val window = textL.substring(from, until).replace('\n', ' ')

    // Pad both sides so that a full-width slice around the hit always exists.
    val pad     = " " * contentsMaxChars
    val padded  = pad + window + pad
    // Position of *this* hit inside `padded`. It is computed rather than searched for,
    // because the window may contain an earlier occurrence of the same word.
    val hit     = contentsMaxChars + (pos - from)
    val centred = padded.substring(hit - contentsMaxChars, hit + word.length + contentsMaxChars)
    // From here on the hit starts at index `contentsMaxChars` of `centred`.

    // If the first full stop lies before the hit, drop everything up to and including it.
    val firstStop = centred.indexOf(".")
    val cutLeft   = if (firstStop >= 0 && firstStop < contentsMaxChars) firstStop + 1 else 0
    val tail      = centred.substring(cutLeft)
    val wordAt    = contentsMaxChars - cutLeft

    // Cut after the first full stop that follows the start of the hit.
    val nextStop = tail.indexOf(".", wordAt + 1)
    val sentence = if (nextStop < 0) tail else tail.substring(0, nextStop + 1)

    // Re-align the hit to column `contentsMaxChars` and pad to the fixed line width.
    val aligned = (" " * cutLeft) + sentence
    val fixed   = aligned + (" " * (contentsMaxChars * 2 + word.length + 1 - aligned.length))

    fixed.map { ch =>
      if ((ch >= ' ' && ch < 128) || keptChars.contains(ch)) ch else ' '
    }
  }

  val worker = new Thread {
    override def run(): Unit = {
      all.sorted(File.NameOrdering).foreach { f =>
        // Read the complete file; `available()` plus a single `read` may return only part of it.
        val bytes     = java.nio.file.Files.readAllBytes(f.toPath)
        val contentsL = new String(bytes, java.nio.charset.StandardCharsets.UTF_8).toLowerCase
        val label     = f.base.take(maxFileName)

        for (word <- needles) {
          // Successive, non-overlapping occurrences of `word`.
          Iterator
            .iterate(contentsL.indexOf(word))(prev => contentsL.indexOf(word, prev + word.length))
            .takeWhile(_ >= 0)
            .foreach(pos => println(contextLine(contentsL, word, pos) + label))
        }
      }
      println("\nDone.")
    }
  }
  // Start only after construction, so `this` does not escape from the Thread's initializer.
  worker.start()
}
// Example run: look for the fragment "operationali", with 64 context characters
// on either side of each hit and at most 70 characters of the file name.
findOCR2(List("operationali"), contentsMaxChars = 64, maxFileName = 70)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment