# Get an API key here: https://platform.openai.com/account/api-keys
Sys.setenv(OPENAI_API_KEY = "YOUR_API_KEY_HERE")
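# Safer sketch (an assumption, not part of the original gist): store the key in
# ~/.Renviron as OPENAI_API_KEY=... instead of hardcoding it in the script, then
# simply verify the session can see it:
stopifnot(nzchar(Sys.getenv("OPENAI_API_KEY")))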
### Make a text "database" to search:
library(tm)
library(dplyr)
library(corpus)
library(rjson)
library(stringi)
library(reticulate)
library(openai)
# Check if Python is available; some help setting this up is here: https://rpubs.com/eR_ic/transfoRmers
reticulate::py_available()
# Importing 🤗 transformers into R session (source: https://rpubs.com/eR_ic/transfoRmers)
transformers <- reticulate::import("transformers")
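# If the import above fails, the Python package is probably missing; a hedged
# sketch of one way to install it into reticulate's active environment
# (assumes pip is available):
# reticulate::py_install("transformers", pip = TRUE)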
# read the news articles, source of news: https://www.kaggle.com/competitions/learn-ai-bbc/data
BBC.News.Train <- read.csv("BBC_News_Train.csv")
# Search based on this example: http://www.dataperspective.info/2017/11/information-retrieval-document-search-using-vector-space-model-in-r.html
### Function to pick documents related to your query:
doc.picker <- function(query, docs) {
  n.total <- nrow(docs)  # number of documents, with the query as the last row
  ds <- DataframeSource(docs)
  my.corpus <- Corpus(ds)
  # remove punctuation, numbers, uppercase and additional spaces:
  my.corpus <- tm_map(my.corpus, removePunctuation)
  my.corpus <- tm_map(my.corpus, removeNumbers)
  my.corpus <- tm_map(my.corpus, content_transformer(tolower))
  my.corpus <- tm_map(my.corpus, stripWhitespace)
  # create a term-document matrix in a memory-efficient (sparse) format:
  term.doc.matrix.stm <- TermDocumentMatrix(my.corpus)
  # constructing the Vector Space Model:
  get.tf.idf.weights <- function(tf.vec) {
    # compute tf-idf weights from a term frequency vector
    n.docs <- length(tf.vec)
    doc.frequency <- length(tf.vec[tf.vec > 0])
    weights <- rep(0, length(tf.vec))
    weights[tf.vec > 0] <- (1 + log2(tf.vec[tf.vec > 0])) * log2(n.docs / doc.frequency)
    return(weights)
  }
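  # Worked toy example (illustrative values, not from the original gist): a term
  # appearing in 2 of 4 documents gets idf = log2(4/2) = 1, so:
  # get.tf.idf.weights(c(0, 1, 2, 0))  # -> 0, 1, 2, 0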
  # tf-idf weight of words in each document:
  tfidf.matrix <- t(apply(term.doc.matrix.stm, 1,
                          FUN = function(row) get.tf.idf.weights(row)))
  colnames(tfidf.matrix) <- colnames(term.doc.matrix.stm)
  # normalize every document vector to unit length (cosine similarity prep):
  tfidf.matrix <- scale(tfidf.matrix, center = FALSE,
                        scale = sqrt(colSums(tfidf.matrix^2)))
  # split the query (last column) from the rest of the documents:
  query.vector <- tfidf.matrix[, n.total]
  tfidf.matrix <- tfidf.matrix[, 1:(n.total - 1)]
  # score the documents: cosine similarity between the query and each document
  doc.scores <- t(query.vector) %*% tfidf.matrix
  # collect results:
  results.df <- data.frame(doc = docs[1:(n.total - 1), ]$doc_id,
                           score = t(doc.scores))
  # return the docs, ranked by score:
  results.df[order(results.df$score, decreasing = TRUE), ]
}
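# Minimal usage sketch (toy data, an assumption for illustration; note that
# doc.picker expects the query itself appended as the LAST row of `docs`):
# toy <- data.frame(doc_id = c(1, 2, 9999),
#                   text = c("stock markets fell sharply",
#                            "the team won the league cup final",
#                            "league cup final"),
#                   stringsAsFactors = FALSE)
# doc.picker(query = "league cup final", docs = toy)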
# Perform a very poor news corpus search
news.search <- function(query, docs = BBC.News.Train) {
  # read the document corpus and append the search query as the final "document":
  corpus.df <- data.frame(doc_id = c(docs[, 1], 9999),
                          text = c(docs[, 2], query),
                          dmeta1 = c(docs[, 3], query),
                          stringsAsFactors = FALSE)
  # rank all docs in the news library against the query:
  ranked.docs <- doc.picker(query = query, docs = corpus.df)
  # return the text of the three best-matching articles:
  corpus.df[corpus.df[, 1] %in% ranked.docs[1:3, 1], 2]
}
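# Quick sanity check (sketch; the query string is just an example):
# substr(news.search("premier league football")[1], 1, 80)  # peek at the top hit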
# ask OpenAI's GPT a question, and provide it documents as context:
news.search.gpt <- function(query) {
  question <- query
  context <- news.search(query = query)
  # this is what we will ask GPT-3.5:
  prompt <- paste0("You are a chatbot answering a question about the news.
    You are provided with both a question, and for context some related news articles from the BBC.
    Use the context provided in formulating an answer.
    The user question is: ", question, ".
    The following information is the context for answering the question:")
  # here we mush together all the context:
  full.prompt <- paste(prompt, paste(context[1], context[2], context[3]))
  # get an answer from OpenAI:
  answer <- create_completion(
    model = "text-davinci-003",
    prompt = full.prompt,
    max_tokens = 500
  )
  list(GPT_response = answer$choices, actual_context_provided = context)
}
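# Note (an assumption about model limits, not stated in the gist):
# text-davinci-003 has a roughly 4k-token context window, so three full articles
# plus the question can overflow it; a crude character-based cap inside the
# function is one workaround, e.g.:
# context <- substr(context, 1, 4000)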
### Hugging Face implementation; this uses an open-source language model, not OpenAI:
news.search.hf <- function(query) {
  # specify the task: extractive question answering with a small SQuAD2-tuned model
  reader <- transformers$pipeline(task = "question-answering",
                                  model = "deepset/minilm-uncased-squad2")
  searched <- news.search(query = query)
  context <- paste(searched[1], searched[2], searched[3])
  outputs <- reader(question = query, context = context)
  outputs
}
# get an answer from a different AI model:
answer <- news.search.hf("Why did Alex Ferguson want Thierry Henry punished in 2004?")
answer # oh wow, not great...
# Get an answer from OpenAI with our context:
news.search.gpt("Why did Alex Ferguson want Thierry Henry punished in 2004?") # context-aware implementation
# Get an answer from vanilla GPT-3.5 without context:
create_completion(model = "text-davinci-003", prompt = "Why did Alex Ferguson want Thierry Henry punished in 2004?", max_tokens = 500)
# get an answer from a different AI model:
answer <- news.search.hf("What kind of celebrity-initiated efforts were made to raise money for the Asian Christmas Day tsunami aid effort?")
answer # oh wow, not great...
# Get an answer from OpenAI with our context:
news.search.gpt("What kind of celebrity-initiated efforts were made to raise money for the Asian Christmas Day tsunami aid effort?") # context-aware implementation
# Get an answer from vanilla GPT-3.5 without context:
create_completion(model = "text-davinci-003", prompt = "What kind of celebrity-initiated efforts were made to raise money for the Asian Christmas Day tsunami aid effort?", max_tokens = 500)