Last active
February 9, 2023 20:02
-
-
Save kirshiyin89/a3a7dfd529a944ec3bcc53069ffaf652 to your computer and use it in GitHub Desktop.
Question Answering app with Python and Pinecone
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools
import os

import pandas as pd
import pinecone
from sentence_transformers import SentenceTransformer
# --- Pinecone connection ---------------------------------------------------
# Prefer the PINECONE_API_KEY environment variable so a real key never has to
# be pasted into the source; the tutorial placeholder remains the fallback.
api_key = os.environ.get("PINECONE_API_KEY", "YOUR API KEY FROM PINECONE CONSOLE")
pinecone.init(api_key=api_key, environment='us-west1-gcp')
index_name = "question-answering"
# NOTE(review): the index is only opened here, never created — presumably it
# already exists in the Pinecone project; confirm before running.
index = pinecone.Index(index_name=index_name)

# --- Load the complaints data ----------------------------------------------
DATA_FILE = "comcast.csv"
pd.set_option("display.max_colwidth", 500)
df = pd.read_csv(
    DATA_FILE, usecols=["Ticket", "CustomerComplaint"], index_col=False
)
# Drop exact duplicate rows first, then shuffle, so that reset_index leaves a
# contiguous 0..n-1 index (de-duplicating afterwards would leave gaps).
df = df.drop_duplicates().sample(frac=1).reset_index(drop=True)
# Ticket values are converted to strings; they are later zipped with the
# embeddings as the vector ids and compared against the ids of query matches.
df['Ticket'] = df['Ticket'].apply(str)

# --- Embed every complaint -------------------------------------------------
model = SentenceTransformer("average_word_embeddings_glove.6B.300d")
# Each complaint is stringified (a missing value becomes the text "nan") and
# encoded on its own; the embedding is stored as a plain Python list.
df["question_vector"] = df.CustomerComplaint.apply(lambda x: model.encode(str(x)).tolist())
def chunks(iterable, batch_size=100):
    """Yield successive tuples of at most ``batch_size`` items from ``iterable``.

    Items are consumed lazily and in order. Every yielded tuple has exactly
    ``batch_size`` items except possibly the last, which holds the remainder.
    An empty iterable yields nothing.

    Args:
        iterable: Any iterable; it is consumed once.
        batch_size: Maximum number of items per yielded tuple; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A batch size of 0 would make the very first slice empty and end the
    # generator immediately, silently discarding all input — fail loudly.
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    it = iter(iterable)
    # Two-argument iter(): keep calling the slicer until it returns the
    # empty-tuple sentinel, which happens once the source is exhausted.
    yield from iter(lambda: tuple(itertools.islice(it, batch_size)), ())
# Pair every ticket id with its embedding and send the pairs to the index
# one batch at a time (chunks() is called with its default batch size).
id_vector_pairs = zip(df.Ticket, df.question_vector)
for vector_batch in chunks(id_vector_pairs):
    index.upsert(vectors=vector_batch)
# Free-text questions to look up in the index.
query_questions = [
    "Wifi Not working",
]
# Embed each question with the same model object used for the complaints.
query_vectors = [model.encode(str(question)).tolist() for question in query_questions]
query_results = index.query(queries=query_vectors, top_k=5)

# Build the ticket -> complaint lookup once instead of scanning the whole
# frame for every match. setdefault keeps the FIRST complaint seen for a
# ticket, matching the previous `.values[0]` behaviour.
complaint_by_ticket = {}
for ticket, complaint in zip(df.Ticket, df.CustomerComplaint):
    complaint_by_ticket.setdefault(ticket, complaint)

for question, res in zip(query_questions, query_results.results):
    print("\n\n\n Original question : " + str(question))
    print("\n Most similar questions based on pinecone vector search: \n")
    ids = [match.id for match in res.matches]
    scores = [match.score for match in res.matches]
    df_result = pd.DataFrame(
        {
            "Ticket#": ids,
            # .get() yields None for an id that is not in the current frame
            # (e.g. a vector left in the index by an earlier run) instead of
            # raising IndexError and aborting the report.
            "Question": [complaint_by_ticket.get(_id) for _id in ids],
            "Score": scores,
        }
    )
    print(df_result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment