#!/usr/bin/env python
# coding: utf-8
# In this notebook we will automatically generate a set of evaluation questions based on wandb docs
import random
import wandb
import re
import openai
import os
from tqdm.auto import tqdm
import time
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownTextSplitter
PROJECT = "wandbot_synth"
ENTITY = "wandbot"
import openai
from getpass import getpass
def get_openai_key():
    if os.getenv("OPENAI_API_KEY") is None:
        if any(['VSCODE' in x for x in os.environ.keys()]):
            print('Please enter password in the VS Code prompt at the top of your VS Code window!')
        os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")
    openai.api_key = os.getenv("OPENAI_API_KEY")
    assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "This doesn't look like a valid OpenAI API key"
    print("OpenAI API key configured")
cohere_api_key = ""
get_openai_key()
# # Answer Questions with WandBot
import time
from typing import Any, Dict, List
import json
import wandb
from wandb.sdk.lib.runid import generate_id
from wandb.integration.langchain import WandbTracer
from langchain import LLMChain
from langchain.chains import HypotheticalDocumentEmbedder, RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.callbacks import get_openai_callback
class VectorStoreRetrieverWithScore(VectorStoreRetriever):
    def get_relevant_documents(self, query: str) -> List[Document]:
        if self.search_type == "similarity":
            docs_and_scores = self.vectorstore.similarity_search_with_score(
                query, **self.search_kwargs
            )
            docs = []
            for doc, score in docs_and_scores:
                doc.metadata["score"] = score
                docs.append(doc)
        elif self.search_type == "mmr":
            docs = self.vectorstore.max_marginal_relevance_search(
                query, **self.search_kwargs
            )
        else:
            raise ValueError(f"search_type of {self.search_type} not allowed.")
        return docs
class FAISSWithScore(FAISS):
    def as_retriever(self) -> VectorStoreRetrieverWithScore:
        return VectorStoreRetrieverWithScore(
            vectorstore=self,
            search_type="similarity",
            search_kwargs={"k": 10},
        )
class RetrievalQAWithSourcesChainWithScore(RetrievalQAWithSourcesChain):
    reduce_k_below_max_tokens: bool = True
    max_tokens_limit: int = 2816

    def _get_docs(self, inputs: Dict[str, Any]) -> List[Document]:
        question = inputs[self.question_key]
        docs = self.retriever.get_relevant_documents(question)
        return self._reduce_tokens_below_limit(docs)
def load_artifacts(config):
    faiss_artifact = wandb.use_artifact(config.faiss_artifact, type="search_index")
    faiss_artifact_dir = faiss_artifact.download()
    hyde_prompt_artifact = wandb.use_artifact(
        config.hyde_prompt_artifact, type="prompt"
    )
    hyde_artifact_dir = hyde_prompt_artifact.download()
    hyde_prompt_file = f"{hyde_artifact_dir}/hyde_prompt.txt"
    chat_prompt_artifact = wandb.use_artifact(
        config.chat_prompt_artifact, type="prompt"
    )
    chat_artifact_dir = chat_prompt_artifact.download()
    chat_prompt_file = f"{chat_artifact_dir}/chat_prompt.txt"
    return {
        "faiss": faiss_artifact_dir,
        "hyde_prompt": hyde_prompt_file,
        "chat_prompt": chat_prompt_file,
    }
# In[10]:
import json
def parse_source_documents(source_documents):
    source_docs_dict = {}
    for i, source_doc in enumerate(source_documents):
        source_docs_dict[f"source_doc_{i}"] = {
            "page_content": source_doc.page_content,
            "metadata": source_doc.metadata["source"],
            "lookup_index": source_doc.lookup_index,
            "lookup_str": source_doc.lookup_str,
        }
    return json.dumps(source_docs_dict)
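# A quick, hypothetical illustration (the document below is made up, not from the
# wandb docs) of the structure parse_source_documents produces: a JSON string keyed
# "source_doc_{i}", one entry per retrieved chunk, carrying the page content,
# source URL and the langchain lookup fields.
_example_doc = Document(
    page_content="Use wandb.init() to start a run.",
    metadata={"source": "https://docs.wandb.ai/ref/python/init"},
)
print(parse_source_documents([_example_doc]))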
# In[11]:
# qa_chain.json
# In[12]:
from types import SimpleNamespace
# login to openai with your api key
get_openai_key()
wandbot_config = SimpleNamespace(
faiss_artifact="parambharat/wandb_docs_bot/faiss_store:latest",
hyde_prompt_artifact="parambharat/wandb_docs_bot/hyde_prompt:latest",
chat_prompt_artifact="parambharat/wandb_docs_bot/system_prompt:latest",
model_name="gpt-3.5-turbo",
eval_model = 'command-nightly',
temperature=0,
hyde_llm_temperature=0.3,
command_llm_temperature=0.0,
cohere_generate_cost_usd = 0.0000025 # cost per characters (not tokens), $0.0025 per generation unit (1000 chars)
)
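# Rough cost sanity check for the Cohere grader, using the per-character price
# assumed above: a ~4,000-character prompt plus completion comes to about
# 4000 * 0.0000025 = $0.01 per evaluation.
print(f"approx. cost per eval: ${4000 * wandbot_config.cohere_generate_cost_usd:.4f}")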
wandb.init(
name="synth_answer_generation_test",
project=PROJECT,
entity=ENTITY,
config=wandbot_config,
)
artifacts = load_artifacts(wandb.config)
# ### Prompts
# Load wandbot v1 prompts
# In[13]:
# LOAD DATA AND PROMPTS FROM ARTIFACTS
faiss_dir = artifacts["faiss"]
hyde_prompt_template = open(artifacts["hyde_prompt"]).read()
wandbot_v1_system_prompt_template = open(artifacts["chat_prompt"]).read()
human_message_prompt_template = "{question}"
# SETUP Hypothetical Document Embedder (HyDE)
hyde_messages = [
    SystemMessagePromptTemplate.from_template(hyde_prompt_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
hyde_prompt = ChatPromptTemplate.from_messages(hyde_messages)
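# Optional sanity check (assumes the hyde prompt template only expects a {question}
# variable): HyDE retrieval embeds a hypothetical answer generated from this prompt
# rather than the raw question, so rendering the prompt shows what the generator LLM
# will actually be asked.
# print(hyde_prompt.format_prompt(question="How do I log images to wandb?").to_string())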
# ### Alternative System Prompts
#
# Create alternate wandbot prompts to test
# In[14]:
system_prompts = {}
system_prompts["wandbot_v1_few_shot"] = wandbot_v1_system_prompt_template
system_prompts["wandbot_v1_zero_shot"]= """
As an AI assistant for the open source library wandb, your task is to answer questions based on
the given extracted parts of a long document and the question. You can provide a conversational
answer with a hyperlink to the documentation only if it is explicitly listed as a source in the context.
Provide a code block directly from the documentation wherever possible. If you do not know the answer,
you can say "Hmm, I'm not sure." If the question is not related to wandb or Weights & Biases, politely
inform the user that you can only answer questions related to wandb. The documentation for wandb can be
found at https://docs.wandb.ai.
Begin:
================
Question: {question}
================
{summaries}
================
Final Answer in Markdown:
"""
system_prompts["default_langchain_qa"]= """
Use the following pieces of context to answer the question at the end. If you don't know the answer,
just say that you don't know, don't try to make up an answer.
{summaries}
Question: {question}
Helpful Answer:
"""
# Get prompt token counts
# In[15]:
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4")
jj = enc.encode("hello world")
len(jj)
for k in system_prompts.keys():
print(f"{k} token count: {len(enc.encode(system_prompts[k]))}")
# ### Evaluation Prompt
# Cohere Command Grader Prompt
# In[16]:
human_prompt = "Human:"
assistant_prompt = "GRADER_RESPONSE:"
grade_command = """Grade the following WANDBOT_RESPONSE given the USER_QUESTION and SUPPORTING_DOCUMENTATION.
Grade the WANDBOT_RESPONSE based ONLY on its factual accuracy. It is OK if the WANDBOT_RESPONSE contains more information than in SUPPORTING_DOCUMENTATION, as long as it does not contain any conflicting statements.
Your GRADE should only be POSITIVE or NEGATIVE to indicate whether the WANDBOT_RESPONSE is accurate or not given the SUPPORTING_DOCUMENTATION, no other information is required.
If the WANDBOT_RESPONSE answers that there is no specific information provided in the context or that it doesn't know, then the GRADE is NEGATIVE.
Only respond with POSITIVE or NEGATIVE for GRADE."""
def command_eval_prompt_constructor(question, source_documents, answer, grade_command=grade_command):
evaluation_prompts_template = f"""As an experienced software quality assurance tester, you are evaluating the quality of the response (WANDBOT_RESPONSE) from a
Weights & Biasses (aka wandb, W&B) support bot called `wandbot`. Weights & Biasses is a machine learning ops (MLOps) python library and app.
Supporting documentation (SUPPORTING_DOCUMENTATION) is provided to help you assess the quality of the response. You job is to grade (GRADE) the response.
This is the example format of the input and a grade given to the `wandbot` support bot response:
=====================
USER_QUESTION: user question here
WANDBOT_RESPONSE: the response from the `wandbot` support bot here
SUPPORTING_DOCUMENTATION: retrieved documentation from the wandb docs here
{assistant_prompt} GRADE: POSITIVE or NEGATIVE here
=====================
this is a real examples:
=====================
USER_QUESTION: How do I create a wandb sweep?
WANDBOT_RESPONSE: To create a W&B Articfact, you can use the wandb.Artifact class like so ```artifact = wandb.Artifact(name='bicycle-dataset', type='dataset')```
SUPPORTING_DOCUMENTATION: '{{"source_doc_0": {{"page_content": "Use Weights & Biases Sweeps to automate hyperparameter search and explore the space of possible models. Create a sweep with a few lines of code.", "metadata": "https://docs.wandb.ai/guide", "lookup_index": 0, "lookup_str": ""}}}}'
{assistant_prompt} GRADE: NEGATIVE
=====================
{grade_command}
USER_QUESTION: {question}
WANDBOT_RESPONSE: {answer}
SUPPORTING_DOCUMENTATION: {source_documents}
{assistant_prompt} GRADE:"""
return evaluation_prompts_template
question = "what is wandb?"
answer = "Weights & Biases is a machine learning platform for teams."
source_documents = "[hey, ho]"
# print(command_eval_prompt_constructor(question, source_documents, answer, grade_command))
# Cohere Command prompt template
# In[17]:
from tokenizers import Tokenizer
eval_grader_prompt_template = command_eval_prompt_constructor("", "", "")
command_nightly_tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
prompt_enc = command_nightly_tokenizer.encode(eval_grader_prompt_template)
print(f"Command prompt template token count: {len(prompt_enc.ids)}")
# Claude Grader Prompt
# In[18]:
# def claude_eval_prompt_constructor(question, source_documents, answer):
# evaluation_prompts_template = f"""As an experienced software quality assurance tester, you are evaluating the quality of the response from a
# Weights & Biases (aka wandb, W&B) support bot called wandbot. Weights & Biases is a machine learning ops (MLOps) python library and app.
# Supporting documentation is provided to help you assess the quality of the response.
# Your feedback should only be "POSITIVE" or "NEGATIVE" to indicate whether the response is accurate or not,
# no other information is required. For example:
# {anthropic.HUMAN_PROMPT}
# =====================
# USER_QUESTION: What is wandb?
# SUPPORTING_DOCUMENTATION: '{{"source_doc_0": {{"page_content": "Weights & Biases is the machine learning platform for developers to build better models faster", "metadata": "https://docs.wandb.ai/guide", "lookup_index": 0, "lookup_str": ""}}}}'
# WANDBOT_RESPONSE: Weights & Biases is a machine learning platform for teams.
# {anthropic.AI_PROMPT}
# POSITIVE
# {anthropic.HUMAN_PROMPT}
# =====================
# USER_QUESTION: How do I create a wandb sweep?
# SUPPORTING_DOCUMENTATION: '{{"source_doc_0": {{"page_content": "Use Weights & Biases Sweeps to automate hyperparameter search and explore the space of possible models. Create a sweep with a few lines of code.", "metadata": "https://docs.wandb.ai/guide", "lookup_index": 0, "lookup_str": ""}}}}'
# WANDBOT_RESPONSE: To create a W&B Artifact, you can use the wandb.Artifact class like so ```artifact = wandb.Artifact(name='bicycle-dataset', type='dataset')```
# {anthropic.AI_PROMPT}
# NEGATIVE
# {anthropic.HUMAN_PROMPT}
# =====================
# USER_QUESTION: {question}
# SUPPORTING_DOCUMENTATION: {source_documents}
# WANDBOT_RESPONSE: {answer}
# {anthropic.AI_PROMPT}"""
# return evaluation_prompts_template
# question = "what is wandb?"
# answer = "Weights & Biases is a machine learning platform for teams."
# print(claude_eval_prompt_constructor(question, source_documents, answer))
# ### Load Embeddings and Vector Store
# In[19]:
base_embeddings = OpenAIEmbeddings()
embeddings = HypotheticalDocumentEmbedder(
    llm_chain=LLMChain(
        llm=ChatOpenAI(temperature=wandb.config.hyde_llm_temperature),
        prompt=hyde_prompt,
    ),
    base_embeddings=base_embeddings,
    verbose=True,
)
# LOAD FAISS VECTOR STORE
vector_store = FAISSWithScore.load_local(faiss_dir, embeddings)
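# Optional retrieval smoke test, kept commented out because it triggers a HyDE LLM
# call plus an embedding request: the custom retriever above should attach a
# similarity score to each returned document's metadata.
# for doc in vector_store.as_retriever().get_relevant_documents("How do I log an artifact?")[:3]:
#     print(doc.metadata.get("source"), doc.metadata.get("score"))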
# In[20]:
# LOAD QA CHAINS FOR EACH SYSTEM PROMPT
def load_qa_chain(system_prompt_template, vector_store=vector_store, chain_type="stuff"):
    qa_messages = [
        SystemMessagePromptTemplate.from_template(system_prompt_template, input_variables=["context", "question"]),
        HumanMessagePromptTemplate.from_template(human_message_prompt_template),
    ]
    qa_prompt = ChatPromptTemplate.from_messages(qa_messages)
    llm = ChatOpenAI(
        model_name=wandb.config.model_name,
        temperature=wandb.config.temperature,
        request_timeout=20,
    )
    qa_chain = RetrievalQAWithSourcesChainWithScore.from_chain_type(
        llm=llm,
        chain_type=chain_type,
        retriever=vector_store.as_retriever(),
        chain_type_kwargs={"prompt": qa_prompt},
        return_source_documents=True,
        verbose=True,
    )
    return qa_chain
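# Example usage, commented out to avoid extra API calls before the main loop below:
# build a single chain from one of the system prompts above and ask it one test
# question (the question text here is just an illustrative placeholder).
# _test_chain = load_qa_chain(system_prompts["wandbot_v1_zero_shot"])
# _test_response = _test_chain({"question": "How do I resume a crashed run?"}, return_only_outputs=False)
# print(_test_response["answer"])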
# Create timestamps
# In[21]:
import pandas as pd
import numpy as np
def generate_timestamps(n=10000, start_date='2023-03-01', end_date='2023-05-31'):
    # Range of datetimes with 1-second intervals
    rng = pd.date_range(start_date, end_date, freq='S')
    # Create weights for all datetimes
    weights = pd.Series(1, index=rng)
    # Decrease weights for weekends
    weights[rng.to_series().dt.dayofweek > 4] *= 0.5
    # Decrease weights for Easter Sunday (2023-04-09) and Easter Monday (2023-04-10)
    easter = pd.to_datetime('2023-04-09')
    weights[rng.to_series().between(easter, easter + pd.DateOffset(days=1))] *= 0.5
    easter_monday = pd.to_datetime('2023-04-10')
    weights[rng.to_series().between(easter_monday, easter_monday + pd.DateOffset(days=1))] *= 0.5
    # Increase weights for 8am-6pm on weekdays
    mask = ((rng.to_series().dt.hour >= 8) & (rng.to_series().dt.hour <= 18) & (rng.to_series().dt.dayofweek <= 4))
    weights[mask] *= 1.2
    # Increase weights for Tuesday, Wednesday, Thursday
    mask = ((rng.to_series().dt.dayofweek >= 1) & (rng.to_series().dt.dayofweek <= 3))
    weights[mask] *= 1.5
    # Normalize weights
    weights /= weights.sum()
    # Sample n datetimes using the weights
    sampled_datetimes = np.random.choice(rng, size=n, p=weights)
    # Sort the datetimes
    sampled_datetimes.sort()
    return sampled_datetimes
timestamps = generate_timestamps()
len(timestamps)
# In[22]:
import matplotlib.pyplot as plt
def plot_timestamps(timestamps):
    plt.figure(figsize=(10, 6))
    plt.hist(timestamps, bins=100, alpha=0.5, color='blue')
    plt.xlabel('Datetime')
    plt.ylabel('Frequency')
    plt.title('Distribution of Timestamps')
    plt.show()
timestamps = generate_timestamps()
# plot_timestamps(timestamps)
# In[23]:
# WANDB LOGGING CONFIG
wandb_config = {"project": PROJECT, "entity":ENTITY} # config for OpenAI autologger
table_cols = [
    "request_timestamp", "query_id", "query", "wandbot_answer", "retrieved_source_documents",
    "synth_user_feedback_signal", "elapsed_time_s",
    "prompt_tokens", "completion_tokens", "total_tokens",
    "answer_cost_usd", "successful_requests",
    "system_prompt_version", "system_prompt_template", "human_message_prompt_template", "hyde_prompt_template", "eval_prompt_template",
    "wandb_run_id", "wandbot_model", "wandbot_temperature", "hyde_llm_temperature",
    "eval_model", "eval_elapsed_time_s",
    "eval_total_chars", "eval_cost_usd", "eval_total_tokens", "eval_prompt_tokens", "eval_completion_tokens",
]
# ### Load Questions
# In[35]:
# import pandas as pd
# df = pd.read_csv('sythetic-user-questions_2023-05-14.csv')
# questions = df["question"].values
# questions = questions[:5]
# questions
artifact = wandb.use_artifact('wandbot/wandbot_synth/run-2cv1ao9n-generated_questions_table:v0', type='run_table')
# artifact_dir = artifact.download("data")
df = artifact.get("generated_questions_table").get_dataframe()
# with open('data/generated_questions_table.table.json') as f:
# js = json.load(f)
# columns = js['columns']
# data = js['data']
# df = pd.DataFrame(data, columns=columns)
questions = df["question"].values
# shuffle the questions
np.random.shuffle(questions)
print(len(questions))
df.head()
# Setup Evaluation Model
# In[28]:
import cohere
def calculate_eval_tokens(eval_grader_prompt, eval_completion):
    prompt_enc = command_nightly_tokenizer.encode(eval_grader_prompt)
    prompt_tokens_count = len(prompt_enc.ids)
    completion_enc = command_nightly_tokenizer.encode(eval_completion)
    completion_token_count = len(completion_enc.ids)
    completion_total_tokens = prompt_tokens_count + completion_token_count
    return completion_total_tokens, prompt_tokens_count, completion_token_count
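# Quick check of the token-counting helper against the empty grader template built
# earlier; note these counts come from the Cohere command-nightly tokenizer, not tiktoken.
_tot, _prompt_toks, _completion_toks = calculate_eval_tokens(eval_grader_prompt_template, "NEGATIVE")
print(f"eval template tokens: total={_tot}, prompt={_prompt_toks}, completion={_completion_toks}")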
co = cohere.Client(cohere_api_key)
# ## Run Synth WandBot
# Setup chain variants
# In[27]:
chains = {}
for system_prompt in system_prompts.keys():
chains[f"{system_prompt}"] = load_qa_chain(system_prompts[system_prompt], vector_store, chain_type="stuff")
# chains.keys()
# In[29]:
import traceback
import langchain

langchain.debug = False
for i_q, question in enumerate(questions):
    # USER QUERY
    query_id = generate_id(length=16)
    tstamp = timestamps[i_q]
    # RUN CHAIN
    system_prompt = random.choice(list(chains.keys()))
    qa_chain = chains[system_prompt]
    try:
        start_time = time.time()
        with get_openai_callback() as openai_cb:
            response = qa_chain(
                {"question": question},
                callbacks=[WandbTracer(wandb_config)],
                return_only_outputs=False,
            )
        end_time = time.time()
        elapsed_time = end_time - start_time
        answer = response["answer"]
        # RETRIEVED DOCUMENTS
        source_docs = response["source_documents"]
        source_documents = parse_source_documents(source_docs)
        # TOKEN METRICS
        prompts_tokens = openai_cb.prompt_tokens
        completion_tokens = openai_cb.completion_tokens
        total_tokens = openai_cb.total_tokens
        total_cost = openai_cb.total_cost
        successful_requests = openai_cb.successful_requests
        # GENERATE SYNTHETIC USER FEEDBACK
        eval_grader_prompt = command_eval_prompt_constructor(question, source_documents, answer)
        eval_start_time = time.time()
        response = co.generate(
            model=wandb.config.eval_model,
            prompt=eval_grader_prompt,
            max_tokens=50,
            temperature=wandb.config.command_llm_temperature,
            stop_sequences=["====================="],
            truncate="end",
        )
        eval_end_time = time.time()
        eval_elapsed_time = eval_end_time - eval_start_time
        eval_completion = response.generations[0].text
        # Get eval token counts and estimated cost
        eval_total_tokens, eval_prompt_tokens, eval_completion_tokens = calculate_eval_tokens(eval_grader_prompt, eval_completion)
        eval_total_chars = len(eval_grader_prompt) + len(eval_completion)
        eval_cost = eval_total_chars * wandb.config.cohere_generate_cost_usd
        synth_user_feedback = "POSITIVE" if "positive" in eval_completion.lower() else "NEGATIVE"
        # synth_user_feedback = "POSITIVE"
        # LOG TO WANDB
        wandb_table = wandb.Table(columns=table_cols)
        wandb_table.add_data(
            tstamp, query_id, question, answer, source_documents,
            synth_user_feedback, elapsed_time,
            prompts_tokens, completion_tokens, total_tokens,
            total_cost, successful_requests,
            system_prompt, system_prompts[system_prompt], human_message_prompt_template, hyde_prompt_template, eval_grader_prompt_template,
            wandb.run.id, wandb.config.model_name, wandb.config.temperature, wandb.config.hyde_llm_temperature,
            wandb.config.eval_model, eval_elapsed_time,
            eval_total_chars, eval_cost, eval_total_tokens, eval_prompt_tokens, eval_completion_tokens,
        )
        wandb.log({"logs/qa_with_eval": wandb_table})
    except Exception as e:
        print(f"Question {i_q}, Error occurred: {e}")
        traceback.print_exc()
        # break
    if i_q % 20 == 0:
        print(i_q)
# In[27]:
print("DONE!")
# In[ ]:
# config_table_cols = ["query_id","run_id", "system_prompt_template",
# "human_message_prompt_template", "hyde_prompt_template", "eval_grader_prompt_template"]
# config_table = wandb.Table(config_table_cols)
# config_table.add_data(query_id, wandb.run.id, system_prompts[system_prompt],
# human_message_prompt_template, hyde_prompt_template, eval_grader_prompt_template)
# wandb.log({"logs/config_table_test": config_table})
# In[ ]:
# # ANTHROPIC EVALUATION
# eval_model = "claude-v1.3-100k" # "claude-v1",
# # anthropic_api = "XXX"
# # client = anthropic.Client(api_key=anthropic_api)
# # max_tokens_to_sample = 100000
# eval_prompt_template = claude_eval_prompt_constructor("", "", "") # Just to log the eval prompt template
# # eval_prompt = claude_eval_prompt_constructor(question, source_documents, answer)
# eval_start_time = time.time()
# # resp = client.completion(
# # prompt=eval_prompt,
# # stop_sequences=[anthropic.HUMAN_PROMPT],
# # model=eval_model,
# # max_tokens_to_sample=max_tokens_to_sample,
# # )