@bitbutter
Created July 30, 2023 16:53
Create embeddings from tweets, then semantically search them.
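Two scripts follow: the first builds spaCy embeddings from a downloaded Twitter archive, the second searches them. The first script relies on the archive's data/tweets.js wrapping a JSON array in a JavaScript assignment. A minimal sketch of that parsing step (the record below is a trimmed, hypothetical example; real archives carry many more fields):

import json

# data/tweets.js prefixes the JSON array with a JS assignment; stripping the
# prefix leaves plain JSON. The record here is hypothetical and abbreviated.
raw = 'window.YTD.tweets.part0 = [{"tweet": {"id_str": "1234567890", "full_text": "hello world"}}]'
data = json.loads(raw.replace('window.YTD.tweets.part0 = ', ''))
print(data[0]['tweet']['full_text'])  # -> hello world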
# --- Script 1: create embeddings from a Twitter archive ---
import json
import spacy
import zipfile
import os
import pickle

# File paths
zip_path = r"pathtotwitterarchive.zip"  # Path to zipped Twitter archive
extract_path = r"somepath\twitter_data"  # Path to extract Twitter data
embeddings_path = r"somepath\tweet_embeddings.pkl"  # Path to save embeddings

# Load the large English model in spaCy
print("Loading spaCy model...")
nlp = spacy.load('en_core_web_lg')

# Extract the .zip file
print("Starting extraction of zip file...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("Finished extraction of zip file.")

# Load the Twitter archive
print("Starting loading of Twitter archive...")
with open(os.path.join(extract_path, 'data', 'tweets.js'), 'r', encoding='utf-8') as f:
    data = f.read().replace('window.YTD.tweets.part0 = ', '')
    raw_archive = json.loads(data)
print("Finished loading of Twitter archive.")

# Extract the actual tweets from the raw data
archive = [item['tweet'] for item in raw_archive]

# Generate embeddings
print("Starting generation of embeddings...")
tweets = []
for item in archive:
    # Extract text and URL
    text = item['full_text']
    url = f"https://twitter.com/i/web/status/{item['id_str']}"
    # Generate vector using spaCy
    vector = nlp(text).vector
    # Append to the list of tweets
    tweets.append((text, url, vector))
print("Finished generation of embeddings.")

# Save embeddings to a file
print("Starting saving of embeddings to a file...")
with open(embeddings_path, 'wb') as f:
    pickle.dump(tweets, f)
print("Finished saving of embeddings to a file.")
print("Script completed.")
# --- Script 2: semantic search over the saved embeddings ---
import spacy
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter
from colorama import init, Fore, Style

# Enable ANSI colours on Windows terminals
init()

# Define a minimum length for tweets
min_tweet_length = 30

# Load the large English model in spaCy
nlp = spacy.load('en_core_web_lg')

# Load the embeddings
embeddings_path = r"somepath\tweet_embeddings.pkl"  # Path to embeddings
with open(embeddings_path, 'rb') as f:
    tweets = pickle.load(f)

# Filter out short tweets and tweets that mention other handles
tweets = [tweet for tweet in tweets if len(tweet[0]) >= min_tweet_length and tweet[0].count('@') == 0]

# Get user input
text = input("Enter the text for a new tweet: ")

# Calculate the embedding of the input text
input_vector = nlp(text).vector

# Calculate similarities with the existing tweets (extract the scalar from the 1x1 matrix)
similarities = [(tweet[0], tweet[1], cosine_similarity([input_vector], [tweet[2]])[0][0]) for tweet in tweets]

# Sort by similarity
similarities.sort(key=itemgetter(2), reverse=True)

# Print the 20 most similar tweets (or fewer, if the filtered list is shorter)
top_n = min(20, len(similarities))
print(f"The {top_n} most similar tweets are:")
for i in range(top_n):
    print(f"{i+1}. {Fore.LIGHTBLUE_EX}{similarities[i][0]}{Style.RESET_ALL} (Similarity: {similarities[i][2]})")
    print(f"URL: {similarities[i][1]}\n\n")