Skip to content

Instantly share code, notes, and snippets.

@abhijeet-talaulikar
Last active September 5, 2023 15:46
Show Gist options
  • Save abhijeet-talaulikar/e3064d0bb2373c6b3b5225d8e9e5c1a8 to your computer and use it in GitHub Desktop.
Save abhijeet-talaulikar/e3064d0bb2373c6b3b5225d8e9e5c1a8 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import openai
# Enter your own key in here
openai.api_key = ""

# Read the raw complaints export into a DataFrame
data = pd.read_csv("complaints.csv")

# Keep only rows that have narrative text, then restrict the data to the
# 10 most frequent values of the 'Issue' column
data = data[data['Consumer complaint narrative'].notna()]
issue_counts = data['Issue'].value_counts()
focus_areas = issue_counts.head(10).index.to_list()
data = data[data['Issue'].isin(focus_areas)]

# Draw a 1% random sample of rows from each remaining issue
review_data = data.groupby('Issue').apply(
    lambda issue_rows: issue_rows.sample(frac=0.01)
)
# Fetch OpenAI's text embeddings for our texts
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding for a single piece of text.

    Args:
        text: The string to embed. Newlines are replaced with spaces
            before the request is sent.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``['data'][0]['embedding']`` entry of the API response, i.e.
        the embedding of the one input that was submitted.
    """
    # Flatten newlines into spaces before sending the text.
    text = text.replace("\n", " ")
    # The input is sent as a one-element list, so the result is at index 0.
    response = openai.Embedding.create(input=[text], model=model)
    return response['data'][0]['embedding']
# Embed every sampled narrative (one API call per row) and store the
# result in a new 'ada_embedding' column
narratives = review_data['Consumer complaint narrative']
review_data['ada_embedding'] = narratives.apply(
    lambda narrative: get_embedding(narrative, model='text-embedding-ada-002')
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment