Skip to content

Instantly share code, notes, and snippets.

@fintanmm
Last active October 13, 2023 15:02
Show Gist options
  • Save fintanmm/65b0533f5165d1ce9f3e1a299fec10cf to your computer and use it in GitHub Desktop.
Save fintanmm/65b0533f5165d1ce9f3e1a299fec10cf to your computer and use it in GitHub Desktop.
Simple TikaOCR CLI made with jbang
///usr/bin/env jbang "$0" "$@" ; exit $?
//DEPS info.picocli:picocli:4.7.5
//DEPS org.apache.tika:tika-core:2.9.0
//DEPS org.apache.tika:tika-parsers-standard-package:2.9.0
//DEPS dev.langchain4j:langchain4j:0.23.0
//DEPS dev.langchain4j:langchain4j-local-ai:0.23.0
//DEPS ch.qos.reload4j:reload4j:1.2.19
//DEPS me.tongfei:progressbar:0.10.0
//DEPS io.vavr:vavr:0.10.4
//DEPS io.vavr:vavr-render:0.9.0
import dev.langchain4j.data.message.AiMessage;
import dev.langchain4j.model.StreamingResponseHandler;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.model.input.Prompt;
import dev.langchain4j.model.input.PromptTemplate;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.model.input.structured.StructuredPrompt;
import dev.langchain4j.model.input.structured.StructuredPromptProcessor;
import dev.langchain4j.model.localai.LocalAiChatModel;
import dev.langchain4j.model.localai.LocalAiStreamingLanguageModel;
import dev.langchain4j.model.output.Response;
import io.vavr.control.Option;
import io.vavr.control.Try;
import me.tongfei.progressbar.ProgressBar;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import picocli.CommandLine;
import picocli.CommandLine.Command;
import picocli.CommandLine.Parameters;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.util.Arrays.asList;
@Command(name = "TikaOCR", mixinStandardHelpOptions = true, version = "TikaOCR 0.2", description = "Simple TikaOCR CLI made with jbang")
class TikaOCR implements Runnable {
static final Logger Log = Logger.getLogger(TikaOCR.class);
private final Tika tika = new Tika();
private final Metadata metadata = new Metadata();
private final ContentHandler contentHandler = new BodyContentHandler();
private static final LocalAiStreamingLanguageModel localAiStreamingLanguageModel = new LocalAiStreamingLanguageModel("http://localhost:4891/v1", "Wizard v1.1", 0.28, 0.95, 50, Duration.ofSeconds(720), true, true);
private static final LocalAiChatModel localModel = new LocalAiChatModel("http://localhost:4891/v1", "Wizard v1.1", 0.28, 0.95, 50, Duration.ofSeconds(30), 1, true, true);
@Parameters(index = "0..*", description = "The files whose OCR is to preformed on.", arity = "1")
private static List<File> files;
public static void main(String... args) {
BasicConfigurator.configure();
new CommandLine(new TikaOCR()).execute(args);
}
@Override
public void run() {
ProgressBar progressBar = new ProgressBar("Processing files", 3L * files.size());
files.parallelStream().forEach(file -> tikaProcess(file, progressBar));
progressBar.close();
}
public void callAi(CreateDocumentPrompt createDocumentPrompt) {
Prompt prompt = StructuredPromptProcessor.toPrompt(createDocumentPrompt);
localAiStreamingLanguageModel.generate(prompt, new StreamingResponseHandler<String>() {
@Override
public void onNext(String token) {
Log.debug("New token: '" + token + "'");
}
@Override
public void onComplete(Response<String> response) {
Log.debug("Streaming completed: " + response);
}
@Override
public void onError(Throwable error) {
error.printStackTrace();
}
});
}
public void tikaProcess(File file, ProgressBar progressBar) {
Try.of(() -> {
Log.debug("Processing file: " + file.getAbsolutePath());
String text = tika.parseToString(Files.newInputStream(file.toPath()), metadata);
Option.of(metadata.get("X-TIKA:content")).onEmpty(() -> metadata.set("X-TIKA:content", text));
tika.getParser().parse(Files.newInputStream(file.toPath()), contentHandler, metadata, new ParseContext());
progressBar.step();
return text;
}).andThenTry(text -> {
Log.debug("Writing to file: " + file.getAbsolutePath());
String outputFileName = file.getName().replaceFirst("[.][^.]+$", "") + ".txt";
File outputFile = new File(file.getParentFile(), outputFileName);
Log.info("Writing to file: " + outputFile.getAbsolutePath());
Files.writeString(Path.of(outputFile.getAbsolutePath()), text);
progressBar.step();
}).andThenTry(text -> {
Log.debug("Calling AI");
CreateDocumentPrompt createDocumentPrompt = new CreateDocumentPrompt();
createDocumentPrompt.name = file.getName();
createDocumentPrompt.content = text;
callAi(createDocumentPrompt);
// String outputFileName = file.getName().replaceFirst("[.][^.]+$", "") + ".ai.txt";
// File outputFile = new File(file.getParentFile(), outputFileName);
// Files.writeString(Path.of(outputFile.getAbsolutePath()), s);
progressBar.step();
}).onFailure(e -> {
Log.debug(e.getMessage());
progressBar.step();
});
}
@StructuredPrompt({
"Please review the attached document for spelling mistakes and provide a summary. Additionally, return a dictionary or list of all the spelling mistakes found in the document. ",
"The document is named {{name}}.",
"The document is attached to this prompt: {{content}}",
"Structure your answer in the following way:",
"Name: ...",
"Summary: ...",
"Mistakes: ...",
})
static class CreateDocumentPrompt {
private String name;
private String summary;
private String content;
private List<String> mistakes;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment