# dbasch/embed-tweets-with-instructor.py Secret

## embed-tweets-with-instructor.py
"""
Download your tweet archive from Twitter.
There will be a file called data/tweets.js. It will contain a single variable
assigned to an array of tweet objects.
Edit it, leave only the array and rename it to tweets.json.
This requires having chromadb and InstructorEmbedding installed via pip.
"""
from chromadb.config import Settings
from chromadb.utils import embedding_functions
import chromadb
import json
import time

# Directory where the chroma client keeps its on-disk index.
dirname = "mytweets"
batch_size = 20  # that's how much my gpu can do at a time

# Remove the device parameter below if you don't have a CUDA-capable GPU.
embeddings = embedding_functions.InstructorEmbeddingFunction(device='cuda')

# Load the archive once. The encoding is given explicitly so that non-ASCII
# tweets do not depend on the platform's default locale, and the context
# manager closes the file handle as soon as parsing is done.
with open("tweets.json", encoding="utf-8") as archive:
    alltweets = json.load(archive)

# Skip every tweet whose text starts with "RT".
tweets = [t['tweet'] for t in alltweets if not t['tweet']['full_text'].startswith("RT")]
total = len(tweets)

client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
                                  persist_directory=dirname))
print(f"we have {total} tweets.")

coll = client.get_or_create_collection("tweets", embedding_function=embeddings)

# Embed only when the stored collection does not already hold exactly one
# entry per tweet.
# NOTE(review): a partially filled collection is re-added from id "0"; how the
# store reacts to ids that already exist is not visible here — confirm.
if coll.count() != total:
    toembed = [t["full_text"] for t in tweets]
    ids = [str(n) for n in range(total)]  # ids are the positions in `tweets`

    before = time.time()
    for start in range(0, total, batch_size):
        # Clamp the upper bound so the progress counter never exceeds `total`
        # on the final, possibly shorter, batch.
        end = min(start + batch_size, total)
        coll.add(documents=toembed[start:end], metadatas=None, ids=ids[start:end])
        print(f"embedded: {end}")
    elapsed = time.time() - before
    print(f"embedding took {elapsed:.1f} seconds")

    # Flush the freshly built index explicitly rather than relying on it
    # being written at interpreter teardown.
    # NOTE(review): persist() is the legacy (pre-0.4) chromadb client call that
    # goes with the Settings used above — confirm against the installed version.
    client.persist()

# Interactive search loop: read a query, print the ten closest tweets.
while True:
    try:
        query = input("query: ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C at the prompt ends the session without a traceback.
        print()
        break

    if not query.strip():
        # Nothing to search for; prompt again.
        continue

    try:
        response = coll.query(query_texts=query, n_results=10)
        # One result list comes back per query text; only one query was sent.
        for rank, text in enumerate(response['documents'][0]):
            print(rank, text)
    except Exception as e:
        # Best effort: report the failure and keep the prompt alive.
        print(e)