# Script setup: credentials, packages, the Python bridge and the news corpus.

# The openai package authenticates with the OPENAI_API_KEY environment
# variable. Get an API key from your OpenAI account and set it outside of this
# script (for example in ~/.Renviron) so the secret is never stored in code.
# NOTE(review): the original `Sys.setenv(` call was truncated and carried no
# arguments, so it is replaced by a check instead of a guessed assignment.
if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
  warning("OPENAI_API_KEY is not set; OpenAI requests will fail.",
          call. = FALSE)
}

# The functions below call tm (DataframeSource, Corpus, tm_map,
# TermDocumentMatrix) and openai (create_completion) without a namespace
# prefix, so both packages have to be attached.
library(tm)
library(openai)

### Make a text "database" to search:
# Python must be available to reticulate for the import below to succeed.
# Importing 🤗 transformers into the R session:
transformers <- reticulate::import("transformers")
# Read the news articles (BBC news training set, expected in the working
# directory). stringsAsFactors = FALSE keeps the article text as character
# on R versions older than 4.0 as well.
BBC.News.Train <- read.csv("BBC_News_Train.csv", stringsAsFactors = FALSE)
# The search below is a TF-IDF vector space model.
### Function to pick documents related to your query:
#' Rank documents against a query with a TF-IDF vector space model.
#'
#' The query is not vectorised separately: the caller must append it to
#' `docs` as the LAST row so that it shares one vocabulary with the corpus.
#'
#' @param query The search string. Not read by the body (the query text is
#'   taken from the last row of `docs`); kept so existing calls keep working.
#' @param docs A data.frame accepted by `DataframeSource()`, with a `doc_id`
#'   column, whose last row holds the query.
#' @return A data.frame with columns `doc` (the `doc_id` values) and `score`
#'   (dot product of the length-normalised TF-IDF vectors), sorted by
#'   decreasing score. The query row itself is not included.
doc.picker <- function(query, docs) {
  n.rows <- nrow(docs)
  if (n.rows < 2) {
    stop("`docs` must hold at least one document plus the query row.",
         call. = FALSE)
  }

  my.corpus <- Corpus(DataframeSource(docs))
  # remove punctuation
  my.corpus <- tm_map(my.corpus, removePunctuation)
  # remove numbers, uppercase, additional spaces
  my.corpus <- tm_map(my.corpus, removeNumbers)
  my.corpus <- tm_map(my.corpus, content_transformer(tolower))
  my.corpus <- tm_map(my.corpus, stripWhitespace)

  # Terms in rows, documents in columns. apply() needs a dense matrix, so the
  # sparse term-document matrix is converted explicitly.
  term.doc.matrix <- as.matrix(TermDocumentMatrix(my.corpus))

  # Compute tfidf weights from the term frequency vector of one term
  # (one entry per document).
  get.tf.idf.weights <- function(tf.vec) {
    n.docs <- length(tf.vec)
    present <- tf.vec > 0
    doc.frequency <- sum(present)
    weights <- rep(0, n.docs)
    weights[present] <-
      (1 + log2(tf.vec[present])) * log2(n.docs / doc.frequency)
    weights
  }

  # apply() returns one column per term, hence the transpose back to
  # terms x documents.
  tfidf.matrix <- t(apply(term.doc.matrix, 1, FUN = get.tf.idf.weights))
  colnames(tfidf.matrix) <- colnames(term.doc.matrix)

  # Normalise every document vector to unit length. A document without any
  # weighted term has norm 0; dividing by 1 instead keeps its score at 0
  # rather than NaN.
  col.norms <- sqrt(colSums(tfidf.matrix^2))
  col.norms[col.norms == 0] <- 1
  tfidf.matrix <- scale(tfidf.matrix, center = FALSE, scale = col.norms)

  # split the query (last column) from the rest of the documents;
  # drop = FALSE keeps a matrix even when the corpus has a single document
  query.vector <- tfidf.matrix[, n.rows]
  tfidf.matrix <- tfidf.matrix[, seq_len(n.rows - 1), drop = FALSE]

  # score the documents
  doc.scores <- t(query.vector) %*% tfidf.matrix

  # collect results
  results.df <- data.frame(doc = docs$doc_id[seq_len(n.rows - 1)],
                           score = t(doc.scores))

  # return the docs, ranked best match first
  results.df <- results.df[order(results.df$score, decreasing = TRUE), ]
  results.df
}
#' Perform a very poor news corpus search.
#'
#' NOTE(review): the original name of this function was lost from the file;
#' `news.search` is a reconstruction. Rename it if other code expects a
#' different name.
#'
#' @param query The search string.
#' @param docs A data.frame whose first column holds document ids, second
#'   column the article text and third column per-article metadata. Defaults
#'   to `BBC.News.Train`.
#' @return A character vector with the text of the (up to) three best-scoring
#'   articles, in the order in which they appear in `docs`.
news.search <- function(query, docs = BBC.News.Train) {
  # Read the document corpus and append the search term as the last row
  # (id 9999), which is where doc.picker() expects to find the query.
  corpus <- data.frame(doc_id = c(docs[, 1], 9999),
                       text = c(as.character(docs[, 2]), query),
                       dmeta1 = c(as.character(docs[, 3]), query),
                       stringsAsFactors = FALSE)
  # search all docs in the news library:
  ranked <- doc.picker(query = query, docs = corpus)
  top.ids <- head(ranked$doc, 3)
  # Select from `docs`, not `corpus`, so the query row can never be returned.
  as.character(docs[docs[, 1] %in% top.ids, 2])
}

#' Ask OpenAI's GPT a question, and provide it documents as context.
#'
#' NOTE(review): the original name of this function was lost from the file;
#' `news.search.gpt` is a reconstruction.
#'
#' @param query The user question.
#' @param model Name of the completions model to use.
#'   NOTE(review): OpenAI has retired "text-davinci-003"; confirm and pass a
#'   current completions model (e.g. "gpt-3.5-turbo-instruct").
#' @return A list with `GPT_response` (the `choices` element of the API
#'   response) and `actual_context_provided` (the article texts sent along).
news.search.gpt <- function(query, model = "text-davinci-003") {
  question <- query
  context <- news.search(query = query)
  # this is what we will ask GPT:
  prompt <- paste0(
    "you are a chatbot anwsering a question about the news,\n",
    "you are provided with both a question, and for context some related news articles from the BBC,\n",
    "use the context provided in formulating an anwser.\n",
    "The user question is:", question, ".\n",
    "The following information is the context for anwsering the question:"
  )
  # here we mush together all the context:
  full.prompt <- paste(prompt, paste(context[1], context[2], context[3]))
  # Get an answer from OpenAI:
  anwser <- create_completion(
    model = model,
    prompt = full.prompt,
    max_tokens = 500
  )
  list(GPT_response = anwser$choices, actual_context_provided = context)
}

### Hugging Face implementation, this uses open source language models,
### not OpenAI:
#' Answer a question with an extractive question-answering model.
#'
#' @param query The user question.
#' @return Whatever the transformers question-answering pipeline returns for
#'   the question and the concatenated search results.
news.seach.hf <- function(query) {
  # Specify task
  reader <- transformers$pipeline(task = "question-answering",
                                  model = "deepset/minilm-uncased-squad2")
  searched <- news.search(query = query)
  context <- paste0(searched[1], searched[2], searched[3])
  outputs <- reader(question = query, context = context)
  outputs
}

### Demo: compare the three approaches on two questions.
question.ferguson <-
  "Why did Alex Ferguson want Thierry Henry punished in 2004?"
question.tsunami <-
  "What kind of celebrity innitiated efforts where made to raise money for the asian christmas day tsunami aid effort?"

# get an answer from a different AI model:
anwser <- news.seach.hf(question.ferguson)
anwser # o wow not great...
# Get an answer from OpenAI with our context:
news.search.gpt(question.ferguson) # context aware implementation
# Get an answer from vanilla GPT 3.5 without context:
create_completion(model = "text-davinci-003", prompt = question.ferguson, max_tokens = 500)

# get an answer from a different AI model:
anwser <- news.seach.hf(question.tsunami)
anwser # o wow not great...
# Get an answer from OpenAI with our context:
news.search.gpt(question.tsunami) # context aware implementation
# Get an answer from vanilla GPT 3.5 without context:
create_completion(model = "text-davinci-003", prompt = question.tsunami, max_tokens = 500)
