Skip to content

Instantly share code, notes, and snippets.

@gd3kr
Created February 15, 2024 20:35
Show Gist options
  • Star 36 You must be signed in to star a gist
  • Fork 6 You must be signed in to fork a gist
  • Save gd3kr/c4c0687a5f7e91b1a84bcacea6500011 to your computer and use it in GitHub Desktop.
Save gd3kr/c4c0687a5f7e91b1a84bcacea6500011 to your computer and use it in GitHub Desktop.
compute embeddings for tweets in tweets.json
"""
a simple script that reads tweets inside a json file, uses openai to compute embeddings and creates two files, metadata.tsv and output.tsv, which can be used to visualise the tweets and their embeddings in TensorFlow Projector (https://projector.tensorflow.org/)
"""
# obtain tweets.json from https://gist.github.com/gd3kr/948296cf675469f5028911f8eb276dbc
import pandas as pd
import json
from openai import OpenAI
client = OpenAI(api_key="<INSERT OPENAI API KEY HERE>")
import numpy as np
def sanitize_for_json(text):
    """Return *text* serialized as a JSON string literal (quoted and escaped)."""
    encoded = json.dumps(text)
    return encoded
# ---------------------------------------------------------------------------
# Main script: read tweets, embed them in batches via the OpenAI API, and
# write the two TSV files expected by https://projector.tensorflow.org/
# ---------------------------------------------------------------------------

# Read tweets from tweets.json (assumed to be a JSON array of tweet strings
# -- TODO confirm against the gist linked above).
with open('tweets.json', 'r') as file:
    tweets = json.load(file)

# JSON-encode every tweet up front so the text sent to the API is
# escape-safe. (Replaces the original index-mutation loop.)
tweets = [sanitize_for_json(tweet) for tweet in tweets]

embeddings = []  # one embedding vector per successfully embedded tweet
metadata = []    # matching tweet text for each embedding, newline-free for TSV

# Batch the tweets to stay within API request-size limits.
CHUNK_SIZE = 500
for start in range(0, len(tweets), CHUNK_SIZE):
    chunk = tweets[start:start + CHUNK_SIZE]
    try:
        response = client.embeddings.create(
            input=chunk,
            model="text-embedding-3-small"
        )
        chunk_embeddings = np.array([data.embedding for data in response.data])
        embeddings.extend(chunk_embeddings)
        print(chunk_embeddings.shape)  # (n, 1536)
        # Strip newlines so each tweet occupies exactly one TSV row.
        metadata.extend(tweet.replace('\n', ' ') for tweet in chunk)
    except Exception as e:
        # Best-effort: skip the whole failed chunk so embeddings and
        # metadata stay row-aligned with each other.
        print(f"Error occurred while generating embeddings: {e}")

# Convert list of all embeddings into a data frame and save as a TSV file
# without any index or header (format required by TensorFlow Projector).
embedding_df = pd.DataFrame(embeddings)
embedding_df.to_csv('output.tsv', sep='\t', index=None, header=None)

# Same for the per-row tweet text metadata.
metadata_df = pd.DataFrame(metadata)
metadata_df.to_csv('metadata.tsv', sep='\t', index=None, header=None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment