Skip to content

Instantly share code, notes, and snippets.

@ice09
Last active October 25, 2023 08:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ice09/ae5409de706fbd5e08183f7632b44434 to your computer and use it in GitHub Desktop.
Save ice09/ae5409de706fbd5e08183f7632b44434 to your computer and use it in GitHub Desktop.
Downloads three posts from Vitalik's blog and creates embeddings with langchain4j, which can then be queried with OpenAI GPT (see https://hackmd.io/@alculexum/embedding4j)
//DEPS dev.langchain4j:langchain4j:0.23.0
//DEPS dev.langchain4j:langchain4j-open-ai:0.23.0
//DEPS dev.langchain4j:langchain4j-embeddings-all-minilm-l6-v2:0.23.0
//DEPS commons-io:commons-io:2.14.0
//DEPS org.apache.commons:commons-text:1.10.0
//DEPS org.jsoup:jsoup:1.16.1
//DEPS org.slf4j:slf4j-simple:2.0.9
package dev.indus340;
import dev.langchain4j.chain.ConversationalRetrievalChain;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentType;
import dev.langchain4j.data.document.parser.TextDocumentParser;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.memory.chat.MessageWindowChatMemory;
import dev.langchain4j.model.embedding.AllMiniLmL6V2EmbeddingModel;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.model.openai.OpenAiChatModel;
import dev.langchain4j.retriever.EmbeddingStoreRetriever;
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.EmbeddingStoreIngestor;
import dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore;
import org.apache.commons.io.IOUtils;
import org.apache.commons.text.WordUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Scanner;
/**
 * Command-line demo: downloads a fixed set of blog posts, embeds their text into an
 * in-memory embedding store and then answers questions typed on standard input twice,
 * once through a retrieval chain backed by the embeddings and once with the plain chat
 * model, so that both answers can be compared side by side.
 */
public class TalkToEmbeddedButerin {

    /** URLs of the blog posts that are downloaded and embedded. */
    private static final String[] BLOG_POST_URLS = new String[] {
        // Should Ethereum be okay with enshrining more things in the protocol?
        "https://vitalik.ca/general/2023/09/30/enshrinement.html",
        // What do I think about Community Notes?
        "https://vitalik.ca/general/2023/08/16/communitynotes.html",
        // What do I think about biometric proof of personhood?
        "https://vitalik.ca/general/2023/07/24/biometric.html"
    };

    /** Width (in characters) at which model answers are wrapped for console output. */
    private static final int WRAP_WIDTH = 80;

    /**
     * Entry point.
     *
     * @param args exactly one element is expected: the OpenAI API key
     * @throws IOException if one of the blog posts cannot be downloaded
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 1) {
            System.out.println("OPENAI_API_KEY has to be provided as first argument.");
            return;
        }
        String filesContent = downloadPostsContent();
        interactWithEmbeddedDocuments(args[0], filesContent);
    }

    /**
     * Embeds {@code content} and runs an interactive question loop on standard input.
     * The loop ends when the user enters {@code q} (case-insensitive) or when standard
     * input reaches end-of-file.
     *
     * @param openaiKey API key handed to the OpenAI chat model
     * @param content   plain text that is split, embedded and used for retrieval
     */
    private static void interactWithEmbeddedDocuments(String openaiKey, String content) {
        EmbeddingModel embeddingModel = new AllMiniLmL6V2EmbeddingModel();
        EmbeddingStore<TextSegment> embeddingStore = new InMemoryEmbeddingStore<>();
        EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder()
                // the Devoxx sample used 500, for these texts 1000 seems better.
                .documentSplitter(DocumentSplitters.recursive(1000, 0))
                .embeddingModel(embeddingModel)
                .embeddingStore(embeddingStore)
                .build();

        System.out.println("creating embeddings");
        // Encode explicitly as UTF-8: the single-argument IOUtils.toInputStream(String)
        // is deprecated and falls back to the platform default charset, which can
        // corrupt non-ASCII characters on platforms whose default is not UTF-8.
        // NOTE(review): assumes TextDocumentParser decodes as UTF-8 by default — confirm.
        Document document = new TextDocumentParser(DocumentType.TXT)
                .parse(IOUtils.toInputStream(content, StandardCharsets.UTF_8));
        ingestor.ingest(document);

        OpenAiChatModel assistant = OpenAiChatModel.builder()
                .modelName("gpt-3.5-turbo")
                .apiKey(openaiKey)
                .timeout(Duration.ofMinutes(5))
                .build();

        ConversationalRetrievalChain chain = ConversationalRetrievalChain.builder()
                .chatLanguageModel(assistant)
                .retriever(EmbeddingStoreRetriever.from(embeddingStore, embeddingModel))
                // Be careful when enabling the chat memory window below: to compare
                // embeddings against plain GPT fairly, chat memory has to stay
                // deactivated, because the plain GPT call uses the model directly
                // (not the chain) and therefore has no chat memory.
                // NOTE(review): verify whether the chain applies a default chat memory
                // when none is configured explicitly.
                //.chatMemory(MessageWindowChatMemory.withMaxMessages(20))
                .build();

        // try-with-resources closes the scanner even if a model call throws.
        try (Scanner scanner = new Scanner(System.in)) {
            while (true) {
                System.out.print("\nWhat's your question: ");
                // End-of-file (e.g. piped input, Ctrl-D) terminates the session instead
                // of raising NoSuchElementException from nextLine().
                if (!scanner.hasNextLine()) {
                    break;
                }
                String question = scanner.nextLine().trim();
                // Check the quit command BEFORE querying, so that "q" itself is never
                // sent to the (billed) models.
                if (question.equalsIgnoreCase("q")) {
                    break;
                }
                // Nothing to ask; prompt again rather than sending an empty message.
                if (question.isEmpty()) {
                    continue;
                }
                System.out.println("\n### With embeddings\n"
                        + WordUtils.wrap(chain.execute(question), WRAP_WIDTH));
                System.out.println("\n### Plain GPT\n"
                        + WordUtils.wrap(assistant.generate(question), WRAP_WIDTH));
            }
        }
    }

    /**
     * Downloads every configured blog post and returns the visible text of all pages,
     * with posts separated by a blank line.
     *
     * @return the concatenated body text of all posts
     * @throws IOException if fetching any of the pages fails
     */
    private static String downloadPostsContent() throws IOException {
        StringBuilder allPostsCombined = new StringBuilder();
        for (String url : BLOG_POST_URLS) {
            System.out.println("downloading " + url);
            org.jsoup.nodes.Document document = Jsoup.connect(url).get();
            Element postBody = document.body();
            // Separate posts so the last word of one post does not fuse with the
            // first word of the next one.
            if (allPostsCombined.length() > 0) {
                allPostsCombined.append("\n\n");
            }
            allPostsCombined.append(postBody.text());
        }
        return allPostsCombined.toString();
    }
}
@ice09
Copy link
Author

ice09 commented Oct 20, 2023

Can be run with JBang:

jbang https://gist.github.com/ice09/ae5409de706fbd5e08183f7632b44434 <OPENAI_API_KEY>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment