This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Visualize only the concept clusters matched by the search: search_results
# is a list of (concept_id, similarity) tuples, so keep just the ids.
fig = concept_model.visualize_concepts(concepts=[concept for concept, _ in search_results])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Visualize all extracted concept clusters (no filter → every concept is shown).
fig = concept_model.visualize_concepts()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> search_results = concept_model.find_concepts("beach")
>>> search_results
[(100, 0.277577825349102),
 (53, 0.27431058773894657),
 (95, 0.25973751319723837),
 (77, 0.2560122597417548),
 (97, 0.25361988261846297)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from concept import ConceptModel

# Train Concept on pre-computed image embeddings so the images do not
# have to be re-embedded; selected_nouns supplies candidate cluster labels.
concept_model = ConceptModel()
concepts = concept_model.fit_transform(img_names,
                                       image_embeddings=image_embeddings,
                                       docs=selected_nouns)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from concept import ConceptModel

# Cluster the images into concepts and label each cluster using the
# candidate nouns; embeddings are computed internally from the image files.
concept_model = ConceptModel()
concepts = concept_model.fit_transform(img_names, docs=selected_nouns)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random

import nltk

# Ensure the WordNet corpus is present before importing it.
nltk.download("wordnet")
from nltk.corpus import wordnet as wn

# Collect every single-word noun lemma from WordNet (multi-word lemmas
# contain "_" and are skipped), then sample 50k as candidate concept labels.
all_nouns = [word for synset in wn.all_synsets('n') for word in synset.lemma_names()
             if "_" not in word]
selected_nouns = random.sample(all_nouns, 50_000)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os  # needed for os.path.exists below; missing in the original snippet
import pickle

from sentence_transformers import util

# Load pre-trained image embeddings for the Unsplash 25k photo dataset,
# downloading the pickle once and reusing the cached file afterwards.
emb_filename = 'unsplash-25k-photos-embeddings.pkl'
if not os.path.exists(emb_filename):  # download dataset if it does not exist
    util.http_get('http://sbert.net/datasets/' + emb_filename, emb_filename)
with open(emb_filename, 'rb') as fIn:
    # The pickle holds a (file_names, embeddings) pair.
    img_names, image_embeddings = pickle.load(fIn)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import glob
import zipfile

from tqdm import tqdm
from sentence_transformers import util

# Download 25k images from Unsplash into img_folder if it is absent or empty.
# NOTE(review): this snippet appears truncated — the actual download/unzip
# steps that should follow makedirs (using util.http_get, zipfile, tqdm)
# are not shown here; confirm against the full gist.
img_folder = 'photos/'
if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
    os.makedirs(img_folder, exist_ok=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bertopic import BERTopic
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF sparse matrix to serve as pre-computed document
# embeddings instead of transformer sentence embeddings.
vectorizer = TfidfVectorizer(min_df=5)
embeddings = vectorizer.fit_transform(docs)

# Fit the topic model on the pre-computed sparse embeddings.
# NOTE(review): `stop_words` was a constructor argument in early BERTopic
# releases; newer versions take a configured CountVectorizer instead — verify
# against the installed version.
model = BERTopic(stop_words="english")
topics, probabilities = model.fit_transform(docs, embeddings)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Pre-compute sentence embeddings once so they can be reused across
# topic-model runs instead of being re-encoded each time.
sentence_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Create the topic model from the pre-computed embeddings.
model = BERTopic()
topics, probabilities = model.fit_transform(docs, embeddings)
NewerOlder