@bitbutter
Created July 30, 2023 16:53
Create embeddings from tweets, then semantically search them.
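Two scripts follow: the first builds spaCy embeddings from a downloaded Twitter archive, the second searches them. The first script relies on the archive's data/tweets.js wrapping a JSON array in a JavaScript assignment. A minimal sketch of that parsing step (the record below is a trimmed, hypothetical example; real archives carry many more fields):

import json

# data/tweets.js prefixes the JSON array with a JS assignment; stripping the
# prefix leaves plain JSON. The record here is hypothetical and abbreviated.
raw = 'window.YTD.tweets.part0 = [{"tweet": {"id_str": "1234567890", "full_text": "hello world"}}]'
data = json.loads(raw.replace('window.YTD.tweets.part0 = ', ''))
print(data[0]['tweet']['full_text'])  # -> hello world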
# --- Script 1: create embeddings from a Twitter archive ---
import json
import spacy
import zipfile
import os
import pickle

# File paths
zip_path = r"pathtotwitterarchive.zip"  # Path to zipped Twitter archive
extract_path = r"somepath\twitter_data"  # Path to extract Twitter data
embeddings_path = r"somepath\tweet_embeddings.pkl"  # Path to save embeddings

# Load the large English model in spaCy
print("Loading spaCy model...")
nlp = spacy.load('en_core_web_lg')

# Extract the .zip file
print("Starting extraction of zip file...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("Finished extraction of zip file.")

# Load the Twitter archive
print("Starting loading of Twitter archive...")
with open(os.path.join(extract_path, 'data', 'tweets.js'), 'r', encoding='utf-8') as f:
    data = f.read().replace('window.YTD.tweets.part0 = ', '')
    raw_archive = json.loads(data)
print("Finished loading of Twitter archive.")

# Extract the actual tweets from the raw data
archive = [item['tweet'] for item in raw_archive]

# Generate embeddings
print("Starting generation of embeddings...")
tweets = []
for item in archive:
    # Extract text and URL
    text = item['full_text']
    url = f"https://twitter.com/i/web/status/{item['id_str']}"
    # Generate vector using spaCy
    vector = nlp(text).vector
    # Append to the list of tweets
    tweets.append((text, url, vector))
print("Finished generation of embeddings.")

# Save embeddings to a file
print("Starting saving of embeddings to a file...")
with open(embeddings_path, 'wb') as f:
    pickle.dump(tweets, f)
print("Finished saving of embeddings to a file.")
print("Script completed.")
# --- Script 2: semantic search over the saved embeddings ---
import spacy
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter
from colorama import init, Fore, Style

# Enable ANSI colours on Windows terminals
init()

# Define a minimum length for tweets
min_tweet_length = 30

# Load the large English model in spaCy
nlp = spacy.load('en_core_web_lg')

# Load the embeddings
embeddings_path = r"somepath\tweet_embeddings.pkl"  # Path to embeddings
with open(embeddings_path, 'rb') as f:
    tweets = pickle.load(f)

# Filter out short tweets and tweets that mention other handles
tweets = [tweet for tweet in tweets if len(tweet[0]) >= min_tweet_length and tweet[0].count('@') == 0]

# Get user input
text = input("Enter the text for a new tweet: ")

# Calculate the embedding of the input text
input_vector = nlp(text).vector

# Calculate similarities with the existing tweets (extract the scalar from the 1x1 matrix)
similarities = [(tweet[0], tweet[1], cosine_similarity([input_vector], [tweet[2]])[0][0]) for tweet in tweets]

# Sort by similarity
similarities.sort(key=itemgetter(2), reverse=True)

# Print the 20 most similar tweets (or fewer, if the filtered list is shorter)
top_n = min(20, len(similarities))
print(f"The {top_n} most similar tweets are:")
for i in range(top_n):
    print(f"{i+1}. {Fore.LIGHTBLUE_EX}{similarities[i][0]}{Style.RESET_ALL} (Similarity: {similarities[i][2]})")
    print(f"URL: {similarities[i][1]}\n\n")