Skip to content

Instantly share code, notes, and snippets.

@Sciss
Created March 6, 2022 14:33
Show Gist options
  • Save Sciss/a5be9243a666e57dad2c9ee03b718111 to your computer and use it in GitHub Desktop.
Save Sciss/a5be9243a666e57dad2c9ee03b718111 to your computer and use it in GitHub Desktop.
// Directory that is scanned for OCR text files.
val ocrDir = file("/data/texts/ocr")
// Children of `ocrDir` whose `extL` equals "txt" (presumably the lower-cased
// file extension -- confirm against the file helper library in use).
// Declared as a `def`, so the directory is listed again on every access.
def all = ocrDir.children(child => child.extL == "txt")
// all.size
/** Searches every file returned by `all` for the given words and prints each
  * hit as one keyword-in-context line, followed by the base name of the file.
  *
  * The search runs on a newly started background thread, so this method
  * returns immediately. When all files have been processed, the thread prints
  * a blank line followed by "Done.".
  *
  * Matching is done against the lower-cased file contents; the printed context
  * is therefore lower-case as well. Each context line is laid out so that the
  * hit always starts at column `contentsMaxChars`, and the line is padded to
  * `2 * contentsMaxChars + word.length + 1` characters before the file name.
  * The context is cut at the first period found before the hit and at the
  * first period found after the start of the hit.
  *
  * @param words            search terms; they are lower-cased before matching,
  *                         and empty terms are ignored
  * @param contentsMaxChars maximum number of context characters shown on either
  *                         side of the start of a hit
  * @param maxFileName      maximum number of characters of the file's base name
  *                         appended to each output line
  */
def findOCR2(words: List[String], contentsMaxChars: Int = 72, maxFileName: Int = 72): Unit = {
  // Characters outside the printable ASCII range that are still passed through.
  val extraChars = "äöüßÄÖÜáàéèíìóòúù'ÁÀÉÈÍÌÓÒÚÙ"
  val pad        = " " * contentsMaxChars
  // The contents are lower-cased, so the needles must be lower-case too.
  // An empty needle matches at every index and would never advance the scan.
  val needles    = words.filter(_.nonEmpty).map(_.toLowerCase)

  // Builds the fixed-width context line for the hit of `word` at index `i`.
  def formatHit(contentsL: String, word: String, i: Int): String = {
    val from   = math.max(0, i - contentsMaxChars)
    val until  = math.min(contentsL.length, i + math.max(word.length, contentsMaxChars))
    // Newlines are replaced one-for-one, so offsets into the window stay valid.
    val window = contentsL.substring(from, until).replace('\n', ' ')
    val padded = pad + window + pad
    // The hit offset is derived from `i` rather than searched for again:
    // a second search would find an earlier occurrence inside the same window.
    val hit    = contentsMaxChars + (i - from)
    // In `text0` the hit starts exactly at index `contentsMaxChars`.
    val text0  = padded.substring(hit - contentsMaxChars, hit + word.length + contentsMaxChars)

    // Drop everything up to and including the first period that precedes the hit.
    val dotBefore = text0.indexOf(".")
    val (text1, hit1) =
      if (dotBefore < 0 || dotBefore >= contentsMaxChars) (text0, contentsMaxChars)
      else (text0.substring(dotBefore + 1), contentsMaxChars - (dotBefore + 1))

    // Cut after the first period that follows the start of the hit.
    val dotAfter = text1.indexOf(".", hit1 + 1)
    val text2    = if (dotAfter < 0) text1 else text1.substring(0, dotAfter + 1)

    // Re-align so the hit starts at column `contentsMaxChars`, then pad to full width.
    val text3 = (" " * (contentsMaxChars - hit1)) + text2
    val text4 = text3 + (" " * (contentsMaxChars * 2 + word.length + 1 - text3.length))

    // Blank out control characters and anything not explicitly allowed.
    text4.map { ch =>
      if (ch >= ' ' && ch < 128) ch
      else if (extraChars.contains(ch)) ch
      else ' '
    }
  }

  val worker = new Thread(new Runnable {
    def run(): Unit = {
      all.sorted(File.NameOrdering).foreach { f =>
        // println(f.name)
        // Read the complete file; `available()` plus a single `read` is not
        // guaranteed to deliver the whole contents.
        val bytes     = java.nio.file.Files.readAllBytes(f.toPath)
        val contentsL = new String(bytes, "UTF-8").toLowerCase
        for (word <- needles) {
          var i = contentsL.indexOf(word)
          while (i >= 0) {
            println(formatHit(contentsL, word, i) + f.base.take(maxFileName))
            i = contentsL.indexOf(word, i + word.length)
          }
        }
      }
      println("\nDone.")
    }
  })
  // Start only after construction, instead of from inside the constructor.
  worker.start()
}
// Example run: look for the fragment "operationali" with 64 characters of
// context and at most 70 characters of file name per output line.
findOCR2(words = List("operationali"), contentsMaxChars = 64, maxFileName = 70)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment