#!/usr/bin/env python
# coding: utf-8
# In this notebook we will automatically generate a set of evaluation questions based on wandb docs
import random
import wandb
import re
import openai
import os
from tqdm.auto import tqdm
import time
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownTextSplitter
PROJECT = "wandbot_synth"
ENTITY = "wandbot"
import openai
from getpass import getpass
def get_openai_key():
    if os.getenv("OPENAI_API_KEY") is None:
        if any(['VSCODE' in x for x in os.environ.keys()]):
            print('Please enter password in the VS Code prompt at the top of your VS Code window!')
        os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")
    openai.api_key = os.getenv("OPENAI_API_KEY")
    assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
    print("OpenAI API key configured")
cohere_api_key = ""
get_openai_key()
# # Answer Questions with WandBot
import time
from typing import Any, Dict, List
import json
import wandb
from wandb.sdk.lib.runid import generate_id
from wandb.integration.langchain import WandbTracer
from langchain import LLMChain
from langchain.chains import HypotheticalDocumentEmbedder, RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.callbacks import get_openai_callback
class VectorStoreRetrieverWithScore(VectorStoreRetriever):
    def get_relevant_documents(self, query: str) -> List[Document]:
        if self.search_type == "similarity":
            docs_and_scores = self.vectorstore.similarity_search_with_score(
                query, **self.search_kwargs
            )
            docs = []
            for doc, score in docs_and_scores:
                doc.metadata["score"] = score
                docs.append(doc)
        elif self.search_type == "mmr":
            docs = self.vectorstore.max_marginal_relevance_search(
                query, **self.search_kwargs
            )
        else:
            raise ValueError(f"search_type of {self.search_type} not allowed.")
        return docs
class FAISSWithScore(FAISS):
    def as_retriever(self) -> VectorStoreRetrieverWithScore:
        return VectorStoreRetrieverWithScore(
            vectorstore=self,
            search_type="similarity",
            search_kwargs={"k": 10},
        )
class RetrievalQAWithSourcesChainWithScore(RetrievalQAWithSourcesChain):
    reduce_k_below_max_tokens: bool = True
    max_tokens_limit: int = 2816

    def _get_docs(self, inputs: Dict[str, Any]) -> List[Document]:
        question = inputs[self.question_key]
        docs = self.retriever.get_relevant_documents(question)
        return self._reduce_tokens_below_limit(docs)
def load_artifacts(config):
    faiss_artifact = wandb.use_artifact(config.faiss_artifact, type="search_index")
    faiss_artifact_dir = faiss_artifact.download()
    hyde_prompt_artifact = wandb.use_artifact(
        config.hyde_prompt_artifact, type="prompt"
    )
    hyde_artifact_dir = hyde_prompt_artifact.download()
    hyde_prompt_file = f"{hyde_artifact_dir}/hyde_prompt.txt"
    chat_prompt_artifact = wandb.use_artifact(
        config.chat_prompt_artifact, type="prompt"
    )
    chat_artifact_dir = chat_prompt_artifact.download()
    chat_prompt_file = f"{chat_artifact_dir}/chat_prompt.txt"
    return {
        "faiss": faiss_artifact_dir,
        "hyde_prompt": hyde_prompt_file,
        "chat_prompt": chat_prompt_file,
    }
# In[10]:
import json
def parse_source_documents(source_documents):
    source_docs_dict = {}
    for i, source_doc in enumerate(source_documents):
        source_docs_dict[f"source_doc_{i}"] = {
            "page_content": source_doc.page_content,
            "metadata": source_doc.metadata["source"],
            "lookup_index": source_doc.lookup_index,
            "lookup_str": source_doc.lookup_str,
        }
    return json.dumps(source_docs_dict)
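# A quick, hypothetical illustration (the document below is made up, not from the
# wandb docs) of the structure parse_source_documents produces: a JSON string keyed
# "source_doc_{i}", one entry per retrieved chunk, carrying the page content,
# source URL and the langchain lookup fields.
_example_doc = Document(
    page_content="Use wandb.init() to start a run.",
    metadata={"source": "https://docs.wandb.ai/ref/python/init"},
)
print(parse_source_documents([_example_doc]))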
# In[11]:
# qa_chain.json
# In[12]:
from types import SimpleNamespace
# login to openai with your api key
get_openai_key()
wandbot_config = SimpleNamespace(
faiss_artifact="parambharat/wandb_docs_bot/faiss_store:latest",
hyde_prompt_artifact="parambharat/wandb_docs_bot/hyde_prompt:latest",
chat_prompt_artifact="parambharat/wandb_docs_bot/system_prompt:latest",
model_name="gpt-3.5-turbo",
eval_model = 'command-nightly',
temperature=0,
hyde_llm_temperature=0.3,
command_llm_temperature=0.0,
cohere_generate_cost_usd = 0.0000025 # cost per characters (not tokens), $0.0025 per generation unit (1000 chars)
)
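# Rough cost sanity check for the Cohere grader, using the per-character price
# assumed above: a ~4,000-character prompt plus completion comes to about
# 4000 * 0.0000025 = $0.01 per evaluation.
print(f"approx. cost per eval: ${4000 * wandbot_config.cohere_generate_cost_usd:.4f}")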
wandb.init(
name="synth_answer_generation_test",
project=PROJECT,
entity=ENTITY,
config=wandbot_config,
)
artifacts = load_artifacts(wandb.config)
# ### Prompts
# Load wandbot v1 prompts
# In[13]:
# LOAD DATA AND PROMPTS FROM ARTIFACTS
faiss_dir = artifacts["faiss"]
hyde_prompt_template = open(artifacts["hyde_prompt"]).read()
wandbot_v1_system_prompt_template = open(artifacts["chat_prompt"]).read()
human_message_prompt_template = "{question}"
# SETUP Hypothetical Document Embedder (HyDE)
hyde_messages = [
    SystemMessagePromptTemplate.from_template(hyde_prompt_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
hyde_prompt = ChatPromptTemplate.from_messages(hyde_messages)
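# Optional sanity check (assumes the hyde prompt template only expects a {question}
# variable): HyDE retrieval embeds a hypothetical answer generated from this prompt
# rather than the raw question, so rendering the prompt shows what the generator LLM
# will actually be asked.
# print(hyde_prompt.format_prompt(question="How do I log images to wandb?").to_string())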
# ### Alternative System Prompts
#
# Create alternate wandbot prompts to test
# In[14]:
system_prompts = {}
system_prompts["wandbot_v1_few_shot"] = wandbot_v1_system_prompt_template
system_prompts["wandbot_v1_zero_shot"]= """
As an AI assistant for the open source library wandb, your task is to answer questions based on
the given extracted parts of a long document and the question. You can provide a conversational
answer with a hyperlink to the documentation only if it is explicitly listed as a source in the context.
Provide a code block directly from the documentation wherever possible. If you do not know the answer,
you can say "Hmm, I'm not sure." If the question is not related to wandb or Weights & Biases, politely
inform the user that you can only answer questions related to wandb. The documentation for wandb can be
found at https://docs.wandb.ai.
Begin:
================
Question: {question}
================
{summaries}
================
Final Answer in Markdown:
"""
system_prompts["default_langchain_qa"]= """
Use the following pieces of context to answer the question at the end. If you don't know the answer,
just say that you don't know, don't try to make up an answer.
{summaries}
Question: {question}
Helpful Answer:
"""
# Get prompt token counts
# In[15]:
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4")
jj = enc.encode("hello world")
len(jj)
for k in system_prompts.keys():
print(f"{k} token count: {len(enc.encode(system_prompts[k]))}")
# ### Evaluation Prompt
# Cohere Command Grader Prompt
# In[16]:
human_prompt = "Human:"
assistant_prompt = "GRADER_RESPONSE:"
grade_command = """Grade the following WANDBOT_RESPONSE given the USER_QUESTION and SUPPORTING_DOCUMENTATION.
Grade the WANDBOT_RESPONSE based ONLY on its factual accuracy. It is OK if the WANDBOT_RESPONSE contains more information than in SUPPORTING_DOCUMENTATION, as long as it does not contain any conflicting statements.
Your GRADE should only be POSITIVE or NEGATIVE to indicate whether the WANDBOT_RESPONSE is accurate or not given the SUPPORTING_DOCUMENTATION, no other information is required.
If the WANDBOT_RESPONSE answers that there is no specific information provided in the context or that it doesn't know, then the GRADE is NEGATIVE.
Only respond with POSITIVE or NEGATIVE for GRADE."""
def command_eval_prompt_constructor(question, source_documents, answer, grade_command=grade_command):
evaluation_prompts_template = f"""As an experienced software quality assurance tester, you are evaluating the quality of the response (WANDBOT_RESPONSE) from a
Weights & Biasses (aka wandb, W&B) support bot called `wandbot`. Weights & Biasses is a machine learning ops (MLOps) python library and app.
Supporting documentation (SUPPORTING_DOCUMENTATION) is provided to help you assess the quality of the response. You job is to grade (GRADE) the response.
This is the example format of the input and a grade given to the `wandbot` support bot response:
=====================
USER_QUESTION: user question here
WANDBOT_RESPONSE: the response from the `wandbot` support bot here
SUPPORTING_DOCUMENTATION: retrieved documentation from the wandb docs here
{assistant_prompt} GRADE: POSITIVE or NEGATIVE here
=====================
this is a real examples:
=====================
USER_QUESTION: How do I create a wandb sweep?
WANDBOT_RESPONSE: To create a W&B Articfact, you can use the wandb.Artifact class like so ```artifact = wandb.Artifact(name='bicycle-dataset', type='dataset')```
SUPPORTING_DOCUMENTATION: '{{"source_doc_0": {{"page_content": "Use Weights & Biases Sweeps to automate hyperparameter search and explore the space of possible models. Create a sweep with a few lines of code.", "metadata": "https://docs.wandb.ai/guide", "lookup_index": 0, "lookup_str": ""}}}}'
{assistant_prompt} GRADE: NEGATIVE
=====================
{grade_command}
USER_QUESTION: {question}
WANDBOT_RESPONSE: {answer}
SUPPORTING_DOCUMENTATION: {source_documents}
{assistant_prompt} GRADE:"""
return evaluation_prompts_template
question = "what is wandb?"
answer = "Weights & Biases is a machine learning platform for teams."
source_documents = "[hey, ho]"
# print(command_eval_prompt_constructor(question, source_documents, answer, grade_command))
# Cohere Command prompt template
# In[17]:
from tokenizers import Tokenizer
eval_grader_prompt_template = command_eval_prompt_constructor("", "", "")
command_nightly_tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
prompt_enc = command_nightly_tokenizer.encode(eval_grader_prompt_template)
print(f"Command prompt template token count: {len(prompt_enc.ids)}")
# Claude Grader Prompt
# In[18]:
# def claude_eval_prompt_constructor(question, source_documents, answer):
# evaluation_prompts_template = f"""As an experienced software quality assurance tester, you are evaluating the quality of the response from a
# Weights & Biases (aka wandb, W&B) support bot called wandbot. Weights & Biases is a machine learning ops (MLOps) python library and app.
# Supporting documentation is provided to help you assess the quality of the response.
# Your feedback should only be "POSITIVE" or "NEGATIVE" to indicate whether the response is accurate or not,
# no other information is required. For example:
# {anthropic.HUMAN_PROMPT}
# =====================
# USER_QUESTION: What is wandb?
# SUPPORTING_DOCUMENTATION: '{{"source_doc_0": {{"page_content": "Weights & Biases is the machine learning platform for developers to build better models faster", "metadata": "https://docs.wandb.ai/guide", "lookup_index": 0, "lookup_str": ""}}}}'
# WANDBOT_RESPONSE: Weights & Biases is a machine learning platform for teams.
# {anthropic.AI_PROMPT}
# POSITIVE
# {anthropic.HUMAN_PROMPT}
# =====================
# USER_QUESTION: How do I create a wandb sweep?
# SUPPORTING_DOCUMENTATION: '{{"source_doc_0": {{"page_content": "Use Weights & Biases Sweeps to automate hyperparameter search and explore the space of possible models. Create a sweep with a few lines of code.", "metadata": "https://docs.wandb.ai/guide", "lookup_index": 0, "lookup_str": ""}}}}'
# WANDBOT_RESPONSE: To create a W&B Artifact, you can use the wandb.Artifact class like so ```artifact = wandb.Artifact(name='bicycle-dataset', type='dataset')```
# {anthropic.AI_PROMPT}
# NEGATIVE
# {anthropic.HUMAN_PROMPT}
# =====================
# USER_QUESTION: {question}
# SUPPORTING_DOCUMENTATION: {source_documents}
# WANDBOT_RESPONSE: {answer}
# {anthropic.AI_PROMPT}"""
# return evaluation_prompts_template
# question = "what is wandb?"
# answer = "Weights & Biases is a machine learning platform for teams."
# print(claude_eval_prompt_constructor(question, source_documents, answer))
# ### Load Embeddings and Vector Store
# In[19]:
base_embeddings = OpenAIEmbeddings()
embeddings = HypotheticalDocumentEmbedder(
    llm_chain=LLMChain(
        llm=ChatOpenAI(temperature=wandb.config.hyde_llm_temperature),
        prompt=hyde_prompt,
    ),
    base_embeddings=base_embeddings,
    verbose=True,
)
# LOAD FAISS VECTOR STORE
vector_store = FAISSWithScore.load_local(faiss_dir, embeddings)
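# Optional retrieval smoke test, kept commented out because it triggers a HyDE LLM
# call plus an embedding request: the custom retriever above should attach a
# similarity score to each returned document's metadata.
# for doc in vector_store.as_retriever().get_relevant_documents("How do I log an artifact?")[:3]:
#     print(doc.metadata.get("source"), doc.metadata.get("score"))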
# In[20]:
# LOAD QA CHAINS FOR EACH SYSTEM PROMPT
def load_qa_chain(system_prompt_template, vector_store=vector_store, chain_type="stuff"):
    qa_messages = [
        SystemMessagePromptTemplate.from_template(system_prompt_template, input_variables=["context", "question"]),
        HumanMessagePromptTemplate.from_template(human_message_prompt_template),
    ]
    qa_prompt = ChatPromptTemplate.from_messages(qa_messages)
    llm = ChatOpenAI(
        model_name=wandb.config.model_name,
        temperature=wandb.config.temperature,
        request_timeout=20,
    )
    qa_chain = RetrievalQAWithSourcesChainWithScore.from_chain_type(
        llm=llm,
        chain_type=chain_type,
        retriever=vector_store.as_retriever(),
        chain_type_kwargs={"prompt": qa_prompt},
        return_source_documents=True,
        verbose=True,
    )
    return qa_chain
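# Example usage, commented out to avoid extra API calls before the main loop below:
# build a single chain from one of the system prompts above and ask it one test
# question (the question text here is just an illustrative placeholder).
# _test_chain = load_qa_chain(system_prompts["wandbot_v1_zero_shot"])
# _test_response = _test_chain({"question": "How do I resume a crashed run?"}, return_only_outputs=False)
# print(_test_response["answer"])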
# Create timestamps
# In[21]:
import pandas as pd
import numpy as np
def generate_timestamps(n=10000, start_date='2023-03-01', end_date='2023-05-31'):
    # Range of datetimes with 1-second intervals
    rng = pd.date_range(start_date, end_date, freq='S')
    # Create weights for all datetimes
    weights = pd.Series(1, index=rng)
    # Decrease weights for weekends
    weights[rng.to_series().dt.dayofweek > 4] *= 0.5
    # Decrease weights for Easter Sunday (2023-04-09) and Easter Monday (2023-04-10)
    easter = pd.to_datetime('2023-04-09')
    weights[rng.to_series().between(easter, easter + pd.DateOffset(days=1))] *= 0.5
    easter_monday = pd.to_datetime('2023-04-10')
    weights[rng.to_series().between(easter_monday, easter_monday + pd.DateOffset(days=1))] *= 0.5
    # Increase weights for 8am-6pm on weekdays
    mask = ((rng.to_series().dt.hour >= 8) & (rng.to_series().dt.hour <= 18) & (rng.to_series().dt.dayofweek <= 4))
    weights[mask] *= 1.2
    # Increase weights for Tuesday, Wednesday, Thursday
    mask = ((rng.to_series().dt.dayofweek >= 1) & (rng.to_series().dt.dayofweek <= 3))
    weights[mask] *= 1.5
    # Normalize weights
    weights /= weights.sum()
    # Sample n datetimes using the weights
    sampled_datetimes = np.random.choice(rng, size=n, p=weights)
    # Sort the datetimes
    sampled_datetimes.sort()
    return sampled_datetimes
timestamps = generate_timestamps()
len(timestamps)
# In[22]:
import matplotlib.pyplot as plt
def plot_timestamps(timestamps):
    plt.figure(figsize=(10, 6))
    plt.hist(timestamps, bins=100, alpha=0.5, color='blue')
    plt.xlabel('Datetime')
    plt.ylabel('Frequency')
    plt.title('Distribution of Timestamps')
    plt.show()
timestamps = generate_timestamps()
# plot_timestamps(timestamps)
# In[23]:
# WANDB LOGGING CONFIG
wandb_config = {"project": PROJECT, "entity":ENTITY} # config for OpenAI autologger
table_cols = [
    "request_timestamp", "query_id", "query", "wandbot_answer", "retrieved_source_documents",
    "synth_user_feedback_signal", "elapsed_time_s",
    "prompt_tokens", "completion_tokens", "total_tokens",
    "answer_cost_usd", "successful_requests",
    "system_prompt_version", "system_prompt_template", "human_message_prompt_template", "hyde_prompt_template", "eval_prompt_template",
    "wandb_run_id", "wandbot_model", "wandbot_temperature", "hyde_llm_temperature",
    "eval_model", "eval_elapsed_time_s",
    "eval_total_chars", "eval_cost_usd", "eval_total_tokens", "eval_prompt_tokens", "eval_completion_tokens",
]
# ### Load Questions
# In[35]:
# import pandas as pd
# df = pd.read_csv('sythetic-user-questions_2023-05-14.csv')
# questions = df["question"].values
# questions = questions[:5]
# questions
artifact = wandb.use_artifact('wandbot/wandbot_synth/run-2cv1ao9n-generated_questions_table:v0', type='run_table')
# artifact_dir = artifact.download("data")
df = artifact.get("generated_questions_table").get_dataframe()
# with open('data/generated_questions_table.table.json') as f:
# js = json.load(f)
# columns = js['columns']
# data = js['data']
# df = pd.DataFrame(data, columns=columns)
questions = df["question"].values
# shuffle the questions
np.random.shuffle(questions)
print(len(questions))
df.head()
# Setup Evaluation Model
# In[28]:
import cohere
def calculate_eval_tokens(eval_grader_prompt, eval_completion):
    prompt_enc = command_nightly_tokenizer.encode(eval_grader_prompt)
    prompt_tokens_count = len(prompt_enc.ids)
    completion_enc = command_nightly_tokenizer.encode(eval_completion)
    completion_token_count = len(completion_enc.ids)
    completion_total_tokens = prompt_tokens_count + completion_token_count
    return completion_total_tokens, prompt_tokens_count, completion_token_count
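# Quick check of the token-counting helper against the empty grader template built
# earlier; note these counts come from the Cohere command-nightly tokenizer, not tiktoken.
_tot, _prompt_toks, _completion_toks = calculate_eval_tokens(eval_grader_prompt_template, "NEGATIVE")
print(f"eval template tokens: total={_tot}, prompt={_prompt_toks}, completion={_completion_toks}")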
co = cohere.Client(cohere_api_key)
# ## Run Synth WandBot
# Setup chain variants
# In[27]:
chains = {}
for system_prompt in system_prompts.keys():
chains[f"{system_prompt}"] = load_qa_chain(system_prompts[system_prompt], vector_store, chain_type="stuff")
# chains.keys()
# In[29]:
import traceback
import langchain

langchain.debug = False
for i_q, question in enumerate(questions):
    # USER QUERY
    query_id = generate_id(length=16)
    tstamp = timestamps[i_q]
    # RUN CHAIN
    system_prompt = random.choice(list(chains.keys()))
    qa_chain = chains[system_prompt]
    try:
        start_time = time.time()
        with get_openai_callback() as openai_cb:
            response = qa_chain(
                {"question": question},
                callbacks=[WandbTracer(wandb_config)],
                return_only_outputs=False,
            )
        end_time = time.time()
        elapsed_time = end_time - start_time
        answer = response["answer"]
        # RETRIEVED DOCUMENTS
        source_docs = response["source_documents"]
        source_documents = parse_source_documents(source_docs)
        # TOKEN METRICS
        prompts_tokens = openai_cb.prompt_tokens
        completion_tokens = openai_cb.completion_tokens
        total_tokens = openai_cb.total_tokens
        total_cost = openai_cb.total_cost
        successful_requests = openai_cb.successful_requests
        # GENERATE SYNTHETIC USER FEEDBACK
        eval_grader_prompt = command_eval_prompt_constructor(question, source_documents, answer)
        eval_start_time = time.time()
        response = co.generate(
            model=wandb.config.eval_model,
            prompt=eval_grader_prompt,
            max_tokens=50,
            temperature=wandb.config.command_llm_temperature,
            stop_sequences=["====================="],
            truncate="end",
        )
        eval_end_time = time.time()
        eval_elapsed_time = eval_end_time - eval_start_time
        eval_completion = response.generations[0].text
        # Get eval token counts and estimated cost
        eval_total_tokens, eval_prompt_tokens, eval_completion_tokens = calculate_eval_tokens(eval_grader_prompt, eval_completion)
        eval_total_chars = len(eval_grader_prompt) + len(eval_completion)
        eval_cost = eval_total_chars * wandb.config.cohere_generate_cost_usd
        synth_user_feedback = "POSITIVE" if "positive" in eval_completion.lower() else "NEGATIVE"
        # synth_user_feedback = "POSITIVE"
        # LOG TO WANDB
        wandb_table = wandb.Table(columns=table_cols)
        wandb_table.add_data(
            tstamp, query_id, question, answer, source_documents,
            synth_user_feedback, elapsed_time,
            prompts_tokens, completion_tokens, total_tokens,
            total_cost, successful_requests,
            system_prompt, system_prompts[system_prompt], human_message_prompt_template, hyde_prompt_template, eval_grader_prompt_template,
            wandb.run.id, wandb.config.model_name, wandb.config.temperature, wandb.config.hyde_llm_temperature,
            wandb.config.eval_model, eval_elapsed_time,
            eval_total_chars, eval_cost, eval_total_tokens, eval_prompt_tokens, eval_completion_tokens,
        )
        wandb.log({"logs/qa_with_eval": wandb_table})
    except Exception as e:
        print(f"Question {i_q}, Error occurred: {e}")
        traceback.print_exc()
        # break
    if i_q % 20 == 0:
        print(i_q)
# In[27]:
print("DONE!")
# In[ ]:
# config_table_cols = ["query_id","run_id", "system_prompt_template",
# "human_message_prompt_template", "hyde_prompt_template", "eval_grader_prompt_template"]
# config_table = wandb.Table(config_table_cols)
# config_table.add_data(query_id, wandb.run.id, system_prompts[system_prompt],
# human_message_prompt_template, hyde_prompt_template, eval_grader_prompt_template)
# wandb.log({"logs/config_table_test": config_table})
# In[ ]:
# # ANTHROPIC EVALUATION
# eval_model = "claude-v1.3-100k" # "claude-v1",
# # anthropic_api = "XXX"
# # client = anthropic.Client(api_key=anthropic_api)
# # max_tokens_to_sample = 100000
# eval_prompt_template = claude_eval_prompt_constructor("", "", "") # Just to log the eval prompt template
# # eval_prompt = claude_eval_prompt_constructor(question, source_documents, answer)
# eval_start_time = time.time()
# # resp = client.completion(
# # prompt=eval_prompt,
# # stop_sequences=[anthropic.HUMAN_PROMPT],
# # model=eval_model,
# # max_tokens_to_sample=max_tokens_to_sample,
# # )