# Get an API key here: https://platform.openai.com/account/api-keys
Sys.setenv(OPENAI_API_KEY = "YOUR_API_KEY_HERE")
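# Safer sketch (an assumption, not part of the original gist): store the key in
# ~/.Renviron as OPENAI_API_KEY=... instead of hardcoding it in the script, then
# simply verify the session can see it:
stopifnot(nzchar(Sys.getenv("OPENAI_API_KEY")))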
### Make a text "database" to search:
library(tm)
library(dplyr)
library(corpus)
library(rjson)
library(stringi)
library(reticulate)
library(openai)
# Check if Python is available; some help setting this up is here: https://rpubs.com/eR_ic/transfoRmers
reticulate::py_available()
# Importing 🤗 transformers into R session (source: https://rpubs.com/eR_ic/transfoRmers)
transformers <- reticulate::import("transformers")
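# If the import above fails, the Python package is probably missing; a hedged
# sketch of one way to install it into reticulate's active environment
# (assumes pip is available):
# reticulate::py_install("transformers", pip = TRUE)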
# read the news articles, source of news: https://www.kaggle.com/competitions/learn-ai-bbc/data
BBC.News.Train <- read.csv("BBC_News_Train.csv")
# Search based on this example: http://www.dataperspective.info/2017/11/information-retrieval-document-search-using-vector-space-model-in-r.html
### Function to pick documents related to your query:
doc.picker <- function(query, docs) {
  n.total <- nrow(docs)  # number of documents, with the query as the last row
  ds <- DataframeSource(docs)
  my.corpus <- Corpus(ds)
  # remove punctuation, numbers, uppercase and additional spaces:
  my.corpus <- tm_map(my.corpus, removePunctuation)
  my.corpus <- tm_map(my.corpus, removeNumbers)
  my.corpus <- tm_map(my.corpus, content_transformer(tolower))
  my.corpus <- tm_map(my.corpus, stripWhitespace)
  # create a term-document matrix in a memory-efficient (sparse) format:
  term.doc.matrix.stm <- TermDocumentMatrix(my.corpus)
  # constructing the Vector Space Model:
  get.tf.idf.weights <- function(tf.vec) {
    # compute tf-idf weights from a term frequency vector
    n.docs <- length(tf.vec)
    doc.frequency <- length(tf.vec[tf.vec > 0])
    weights <- rep(0, length(tf.vec))
    weights[tf.vec > 0] <- (1 + log2(tf.vec[tf.vec > 0])) * log2(n.docs / doc.frequency)
    return(weights)
  }
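  # Worked toy example (illustrative values, not from the original gist): a term
  # appearing in 2 of 4 documents gets idf = log2(4/2) = 1, so:
  # get.tf.idf.weights(c(0, 1, 2, 0))  # -> 0, 1, 2, 0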
  # tf-idf weight of words in each document:
  tfidf.matrix <- t(apply(term.doc.matrix.stm, 1,
                          FUN = function(row) get.tf.idf.weights(row)))
  colnames(tfidf.matrix) <- colnames(term.doc.matrix.stm)
  # normalize every document vector to unit length (cosine similarity prep):
  tfidf.matrix <- scale(tfidf.matrix, center = FALSE,
                        scale = sqrt(colSums(tfidf.matrix^2)))
  # split the query (last column) from the rest of the documents:
  query.vector <- tfidf.matrix[, n.total]
  tfidf.matrix <- tfidf.matrix[, 1:(n.total - 1)]
  # score the documents: cosine similarity between the query and each document
  doc.scores <- t(query.vector) %*% tfidf.matrix
  # collect results:
  results.df <- data.frame(doc = docs[1:(n.total - 1), ]$doc_id,
                           score = t(doc.scores))
  # return the docs, ranked by score:
  results.df[order(results.df$score, decreasing = TRUE), ]
}
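# Minimal usage sketch (toy data, an assumption for illustration; note that
# doc.picker expects the query itself appended as the LAST row of `docs`):
# toy <- data.frame(doc_id = c(1, 2, 9999),
#                   text = c("stock markets fell sharply",
#                            "the team won the league cup final",
#                            "league cup final"),
#                   stringsAsFactors = FALSE)
# doc.picker(query = "league cup final", docs = toy)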
# Perform a very poor news corpus search
news.search <- function(query, docs = BBC.News.Train) {
  # read the document corpus and append the search query as the final "document":
  corpus.df <- data.frame(doc_id = c(docs[, 1], 9999),
                          text = c(docs[, 2], query),
                          dmeta1 = c(docs[, 3], query),
                          stringsAsFactors = FALSE)
  # rank all docs in the news library against the query:
  ranked.docs <- doc.picker(query = query, docs = corpus.df)
  # return the text of the three best-matching articles:
  corpus.df[corpus.df[, 1] %in% ranked.docs[1:3, 1], 2]
}
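# Quick sanity check (sketch; the query string is just an example):
# substr(news.search("premier league football")[1], 1, 80)  # peek at the top hit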
# ask OpenAI's GPT a question, and provide it documents as context:
news.search.gpt <- function(query) {
  question <- query
  context <- news.search(query = query)
  # this is what we will ask GPT-3.5:
  prompt <- paste0("You are a chatbot answering a question about the news.
    You are provided with both a question, and for context some related news articles from the BBC.
    Use the context provided in formulating an answer.
    The user question is: ", question, ".
    The following information is the context for answering the question:")
  # here we mush together all the context:
  full.prompt <- paste(prompt, paste(context[1], context[2], context[3]))
  # get an answer from OpenAI:
  answer <- create_completion(
    model = "text-davinci-003",
    prompt = full.prompt,
    max_tokens = 500
  )
  list(GPT_response = answer$choices, actual_context_provided = context)
}
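# Note (an assumption about model limits, not stated in the gist):
# text-davinci-003 has a roughly 4k-token context window, so three full articles
# plus the question can overflow it; a crude character-based cap inside the
# function is one workaround, e.g.:
# context <- substr(context, 1, 4000)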
### Hugging Face implementation; this uses an open-source language model, not OpenAI:
news.search.hf <- function(query) {
  # specify the task: extractive question answering with a small SQuAD2-tuned model
  reader <- transformers$pipeline(task = "question-answering",
                                  model = "deepset/minilm-uncased-squad2")
  searched <- news.search(query = query)
  context <- paste(searched[1], searched[2], searched[3])
  outputs <- reader(question = query, context = context)
  outputs
}
# get an answer from a different AI model:
answer <- news.search.hf("Why did Alex Ferguson want Thierry Henry punished in 2004?")
answer # oh wow, not great...
# Get an answer from OpenAI with our context:
news.search.gpt("Why did Alex Ferguson want Thierry Henry punished in 2004?") # context-aware implementation
# Get an answer from vanilla GPT-3.5 without context:
create_completion(model = "text-davinci-003", prompt = "Why did Alex Ferguson want Thierry Henry punished in 2004?", max_tokens = 500)
# get an answer from a different AI model:
answer <- news.search.hf("What kind of celebrity-initiated efforts were made to raise money for the Asian Christmas Day tsunami aid effort?")
answer # oh wow, not great...
# Get an answer from OpenAI with our context:
news.search.gpt("What kind of celebrity-initiated efforts were made to raise money for the Asian Christmas Day tsunami aid effort?") # context-aware implementation
# Get an answer from vanilla GPT-3.5 without context:
create_completion(model = "text-davinci-003", prompt = "What kind of celebrity-initiated efforts were made to raise money for the Asian Christmas Day tsunami aid effort?", max_tokens = 500)