Last active
October 13, 2023 15:02
-
-
Save fintanmm/65b0533f5165d1ce9f3e1a299fec10cf to your computer and use it in GitHub Desktop.
Simple TikaOCR CLI made with jbang
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
///usr/bin/env jbang "$0" "$@" ; exit $?
//DEPS info.picocli:picocli:4.7.5
//DEPS org.apache.tika:tika-core:2.9.0
//DEPS org.apache.tika:tika-parsers-standard-package:2.9.0
//DEPS dev.langchain4j:langchain4j:0.23.0
//DEPS dev.langchain4j:langchain4j-local-ai:0.23.0
//DEPS ch.qos.reload4j:reload4j:1.2.19
//DEPS me.tongfei:progressbar:0.10.0
//DEPS io.vavr:vavr:0.10.4
//DEPS io.vavr:vavr-render:0.9.0
import dev.langchain4j.data.message.AiMessage;
import dev.langchain4j.model.StreamingResponseHandler;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.model.input.Prompt;
import dev.langchain4j.model.input.PromptTemplate;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.model.input.structured.StructuredPrompt;
import dev.langchain4j.model.input.structured.StructuredPromptProcessor;
import dev.langchain4j.model.localai.LocalAiChatModel;
import dev.langchain4j.model.localai.LocalAiStreamingLanguageModel;
import dev.langchain4j.model.output.Response;
import io.vavr.control.Option;
import io.vavr.control.Try;
import me.tongfei.progressbar.ProgressBar;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import picocli.CommandLine;
import picocli.CommandLine.Command;
import picocli.CommandLine.Parameters;
import java.io.File;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.util.Arrays.asList;
@Command(name = "TikaOCR", mixinStandardHelpOptions = true, version = "TikaOCR 0.2", description = "Simple TikaOCR CLI made with jbang") | |
class TikaOCR implements Runnable { | |
static final Logger Log = Logger.getLogger(TikaOCR.class); | |
private final Tika tika = new Tika(); | |
private final Metadata metadata = new Metadata(); | |
private final ContentHandler contentHandler = new BodyContentHandler(); | |
private static final LocalAiStreamingLanguageModel localAiStreamingLanguageModel = new LocalAiStreamingLanguageModel("http://localhost:4891/v1", "Wizard v1.1", 0.28, 0.95, 50, Duration.ofSeconds(720), true, true); | |
private static final LocalAiChatModel localModel = new LocalAiChatModel("http://localhost:4891/v1", "Wizard v1.1", 0.28, 0.95, 50, Duration.ofSeconds(30), 1, true, true); | |
@Parameters(index = "0..*", description = "The files whose OCR is to preformed on.", arity = "1") | |
private static List<File> files; | |
public static void main(String... args) { | |
BasicConfigurator.configure(); | |
new CommandLine(new TikaOCR()).execute(args); | |
} | |
@Override | |
public void run() { | |
ProgressBar progressBar = new ProgressBar("Processing files", 3L * files.size()); | |
files.parallelStream().forEach(file -> tikaProcess(file, progressBar)); | |
progressBar.close(); | |
} | |
public void callAi(CreateDocumentPrompt createDocumentPrompt) { | |
Prompt prompt = StructuredPromptProcessor.toPrompt(createDocumentPrompt); | |
localAiStreamingLanguageModel.generate(prompt, new StreamingResponseHandler<String>() { | |
@Override | |
public void onNext(String token) { | |
Log.debug("New token: '" + token + "'"); | |
} | |
@Override | |
public void onComplete(Response<String> response) { | |
Log.debug("Streaming completed: " + response); | |
} | |
@Override | |
public void onError(Throwable error) { | |
error.printStackTrace(); | |
} | |
}); | |
} | |
public void tikaProcess(File file, ProgressBar progressBar) { | |
Try.of(() -> { | |
Log.debug("Processing file: " + file.getAbsolutePath()); | |
String text = tika.parseToString(Files.newInputStream(file.toPath()), metadata); | |
Option.of(metadata.get("X-TIKA:content")).onEmpty(() -> metadata.set("X-TIKA:content", text)); | |
tika.getParser().parse(Files.newInputStream(file.toPath()), contentHandler, metadata, new ParseContext()); | |
progressBar.step(); | |
return text; | |
}).andThenTry(text -> { | |
Log.debug("Writing to file: " + file.getAbsolutePath()); | |
String outputFileName = file.getName().replaceFirst("[.][^.]+$", "") + ".txt"; | |
File outputFile = new File(file.getParentFile(), outputFileName); | |
Log.info("Writing to file: " + outputFile.getAbsolutePath()); | |
Files.writeString(Path.of(outputFile.getAbsolutePath()), text); | |
progressBar.step(); | |
}).andThenTry(text -> { | |
Log.debug("Calling AI"); | |
CreateDocumentPrompt createDocumentPrompt = new CreateDocumentPrompt(); | |
createDocumentPrompt.name = file.getName(); | |
createDocumentPrompt.content = text; | |
callAi(createDocumentPrompt); | |
// String outputFileName = file.getName().replaceFirst("[.][^.]+$", "") + ".ai.txt"; | |
// File outputFile = new File(file.getParentFile(), outputFileName); | |
// Files.writeString(Path.of(outputFile.getAbsolutePath()), s); | |
progressBar.step(); | |
}).onFailure(e -> { | |
Log.debug(e.getMessage()); | |
progressBar.step(); | |
}); | |
} | |
@StructuredPrompt({ | |
"Please review the attached document for spelling mistakes and provide a summary. Additionally, return a dictionary or list of all the spelling mistakes found in the document. ", | |
"The document is named {{name}}.", | |
"The document is attached to this prompt: {{content}}", | |
"Structure your answer in the following way:", | |
"Name: ...", | |
"Summary: ...", | |
"Mistakes: ...", | |
}) | |
static class CreateDocumentPrompt { | |
private String name; | |
private String summary; | |
private String content; | |
private List<String> mistakes; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment