Skip to content

Instantly share code, notes, and snippets.

@gd3kr
Created February 15, 2024 20:35
Show Gist options
  • Star 36 You must be signed in to star a gist
  • Fork 6 You must be signed in to fork a gist
  • Save gd3kr/c4c0687a5f7e91b1a84bcacea6500011 to your computer and use it in GitHub Desktop.
Save gd3kr/c4c0687a5f7e91b1a84bcacea6500011 to your computer and use it in GitHub Desktop.
compute embeddings for tweets in tweets.json
"""
a simple script that reads tweets inside a json file, uses openai to compute embeddings and creates two files, metadata.tsv and output.tsv, which can be used to visualise the tweets and their embeddings in TensorFlow Projector (https://projector.tensorflow.org/)
"""
# obtain tweets.json from https://gist.github.com/gd3kr/948296cf675469f5028911f8eb276dbc
import pandas as pd
import json
from openai import OpenAI
client = OpenAI(api_key="<INSERT OPENAI API KEY HERE>")
import numpy as np
def sanitize_for_json(text):
    """Return *text* serialized as a JSON string literal (quoted and escaped)."""
    encoded = json.dumps(text)
    return encoded
# ---------------------------------------------------------------------------
# Main script: read tweets, embed them in batches via the OpenAI API, and
# write the two TSV files expected by https://projector.tensorflow.org/
# ---------------------------------------------------------------------------

# Read tweets from tweets.json (assumed to be a JSON array of tweet strings
# -- TODO confirm against the gist linked above).
with open('tweets.json', 'r') as file:
    tweets = json.load(file)

# JSON-encode every tweet up front so the text sent to the API is
# escape-safe. (Replaces the original index-mutation loop.)
tweets = [sanitize_for_json(tweet) for tweet in tweets]

embeddings = []  # one embedding vector per successfully embedded tweet
metadata = []    # matching tweet text for each embedding, newline-free for TSV

# Batch the tweets to stay within API request-size limits.
CHUNK_SIZE = 500
for start in range(0, len(tweets), CHUNK_SIZE):
    chunk = tweets[start:start + CHUNK_SIZE]
    try:
        response = client.embeddings.create(
            input=chunk,
            model="text-embedding-3-small"
        )
        chunk_embeddings = np.array([data.embedding for data in response.data])
        embeddings.extend(chunk_embeddings)
        print(chunk_embeddings.shape)  # (n, 1536)
        # Strip newlines so each tweet occupies exactly one TSV row.
        metadata.extend(tweet.replace('\n', ' ') for tweet in chunk)
    except Exception as e:
        # Best-effort: skip the whole failed chunk so embeddings and
        # metadata stay row-aligned with each other.
        print(f"Error occurred while generating embeddings: {e}")

# Convert list of all embeddings into a data frame and save as a TSV file
# without any index or header (format required by TensorFlow Projector).
embedding_df = pd.DataFrame(embeddings)
embedding_df.to_csv('output.tsv', sep='\t', index=None, header=None)

# Same for the per-row tweet text metadata.
metadata_df = pd.DataFrame(metadata)
metadata_df.to_csv('metadata.tsv', sep='\t', index=None, header=None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment