Skip to content

Instantly share code, notes, and snippets.

View MaartenGr's full-sized avatar

Maarten Grootendorst MaartenGr

View GitHub Profile
# Plot only the concept clusters returned by a similarity search (the
# transcript below shows where `search_results` comes from).
fig = concept_model.visualize_concepts(concepts=[concept for concept, _ in search_results])
# Plot all learned concept clusters.
fig = concept_model.visualize_concepts()
# REPL transcript: search for concepts related to "beach"; each tuple is
# (concept_id, similarity_score), sorted best-first.
# NOTE(review): not runnable as-is — the `>>>` lines are a console session
# and `concept_model` must already be fitted; keep for illustration only.
>>> search_results = concept_model.find_concepts("beach")
>>> search_results
[(100, 0.277577825349102),
(53, 0.27431058773894657),
(95, 0.25973751319723837),
(77, 0.2560122597417548),
(97, 0.25361988261846297)]
from concept import ConceptModel
# Train Concept using the pre-trained image embeddings
# Multimodal fit: cluster the images via their precomputed embeddings and
# use the candidate words in `docs` to label each resulting cluster.
# NOTE(review): `img_names`, `image_embeddings`, and `selected_nouns` are
# defined in companion snippets on this page — confirm they are in scope.
concept_model = ConceptModel()
concepts = concept_model.fit_transform(img_names,
image_embeddings=image_embeddings,
docs=selected_nouns)
from concept import ConceptModel
# Basic usage: image embeddings are computed internally from the files in
# `img_names`; `selected_nouns` supplies candidate words for naming the
# discovered clusters. Both names come from companion snippets on this page.
concept_model = ConceptModel()
concepts = concept_model.fit_transform(img_names, docs=selected_nouns)
import random
import nltk
nltk.download("wordnet")
from nltk.corpus import wordnet as wn

# Collect candidate nouns from WordNet, skipping multi-word lemmas (their
# names contain "_", e.g. "ice_cream"), then draw a random subset to serve
# as the vocabulary for labelling image clusters.
#
# Fix: the original list kept one entry per (synset, lemma) pair, so common
# nouns appeared many times and `random.sample` could return the same word
# repeatedly. Deduplicate first; `sorted` makes the population order
# deterministic so sampling is reproducible under a fixed random seed.
all_nouns = sorted({word
                    for synset in wn.all_synsets('n')
                    for word in synset.lemma_names()
                    if "_" not in word})
selected_nouns = random.sample(all_nouns, 50_000)
import os
import pickle
from sentence_transformers import util

# Load pre-trained image embeddings for the 25k-photo Unsplash dataset,
# downloading the pickle from the SBERT site on first run.
#
# Fixes: (1) `os` was used below without being imported in this snippet;
# (2) the bodies of the `if` and `with` statements had lost their
# indentation, making the snippet a syntax error.
emb_filename = 'unsplash-25k-photos-embeddings.pkl'
if not os.path.exists(emb_filename):  # Download dataset if it does not exist
    util.http_get('http://sbert.net/datasets/' + emb_filename, emb_filename)

# The pickle holds a (file_names, embeddings) pair.
# NOTE(review): only unpickle this file from the trusted source above —
# `pickle.load` can execute arbitrary code from untrusted data.
with open(emb_filename, 'rb') as fIn:
    img_names, image_embeddings = pickle.load(fIn)
import os
import glob
import zipfile
from tqdm import tqdm
from sentence_transformers import util

# Download 25k images from Unsplash
# Fix: the `os.makedirs` line had lost its indentation under the `if`,
# making the snippet a syntax error.
# NOTE(review): this snippet appears truncated — the code that actually
# fetches and unzips the photo archive (which would use util, zipfile,
# tqdm, and glob) is cut off after the directory is created.
img_folder = 'photos/'
if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
    os.makedirs(img_folder, exist_ok=True)
from bertopic import BERTopic
from sklearn.feature_extraction.text import TfidfVectorizer
# Create TF-IDF sparse matrix
# Variant: feed BERTopic precomputed TF-IDF document vectors instead of
# transformer embeddings. `min_df=5` drops terms seen in fewer than 5 docs.
vectorizer = TfidfVectorizer(min_df=5)
embeddings = vectorizer.fit_transform(docs)
# Model
# NOTE(review): `stop_words` as a BERTopic constructor argument comes from
# an early release; recent versions configure stop words via a custom
# `vectorizer_model` instead — confirm against the installed version.
# `docs` is assumed to be a list of document strings defined elsewhere.
model = BERTopic(stop_words="english")
topics, probabilities = model.fit_transform(docs, embeddings)
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
# Prepare embeddings
# Encode the documents once with a pretrained sentence-transformer so
# BERTopic can skip its internal embedding step.
sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
embeddings = sentence_model.encode(docs, show_progress_bar=False)
# Create topic model
# Passing the precomputed `embeddings` alongside `docs` avoids re-encoding.
# `docs` is assumed to be a list of document strings defined elsewhere.
model = BERTopic()
topics, probabilities = model.fit_transform(docs, embeddings)