Skip to content

Instantly share code, notes, and snippets.

View MaartenGr's full-sized avatar

Maarten Grootendorst MaartenGr

View GitHub Profile
from bertopic import BERTopic
# Create a BERTopic model with its `language` option set to "Dutch".
model = BERTopic(language="Dutch")
>>>> model.get_topic(49)
[('windows', 0.006152228076250982),
('drive', 0.004982897610645755),
('dos', 0.004845038866360651),
('file', 0.004140142872194834),
('disk', 0.004131678774810884),
('mac', 0.003624848635985097),
('memory', 0.0034840976976789903),
('software', 0.0034415334250699077),
('email', 0.0034239554442333257),
>>>> model.get_topic_freq().head()
Topic Count
-1 7288
49 3992
30 701
27 684
11 568
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
# Fetch the 'all' subset of the 20 newsgroups data with headers, footers and
# quotes removed, and keep only the entry stored under the 'data' key.
docs = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)["data"]

# Fit a default-configured BERTopic model on the documents; the two values
# returned by fit_transform are kept as `topics` and `probabilities`.
model = BERTopic()
topics, probabilities = model.fit_transform(docs)
import numpy as np
import pandas as pd
from rapidfuzz import fuzz
from polyfuzz.models import BaseMatcher
class MyModel(BaseMatcher):
def match(self, from_list, to_list):
# Calculate distances
matches = [[fuzz.ratio(from_string, to_string) / 100 for to_string in to_list]
# Plot the precision/recall visualisation for the current `model`.
# NOTE(review): presumably `model` is a PolyFuzz instance on which `match`
# has already been called -- confirm against the snippet this line belongs to.
model.visualize_precision_recall()
from polyfuzz import PolyFuzz
from polyfuzz.models import Embeddings, TFIDF, RapidFuzz
from flair.embeddings import WordEmbeddings
# Embedding-based matcher built on flair's 'en-crawl' word embeddings.
fasttext_embeddings = WordEmbeddings("en-crawl")
fasttext = Embeddings(
    fasttext_embeddings,
    min_similarity=0,
    model_id="FastText",
)

# TF-IDF matcher; min_similarity=0 so no match is filtered out by score.
tfidf = TFIDF(
    min_similarity=0,
    model_id="TF-IDF",
)

# RapidFuzz matcher with n_jobs=-1 and no score cutoff.
rapidfuzz = RapidFuzz(
    n_jobs=-1,
    score_cutoff=0,
    model_id="RapidFuzz",
)

# Collect the three matchers in one list, in this order.
matchers = [
    tfidf,
    fasttext,
    rapidfuzz,
]
from polyfuzz import PolyFuzz
from polyfuzz.models import Embeddings
from flair.embeddings import TransformerWordEmbeddings, WordEmbeddings, StackedEmbeddings
# Two flair embedding models: multilingual BERT and the 'en-crawl' word
# embeddings.
bert_embeddings = TransformerWordEmbeddings('bert-base-multilingual-cased')
fasttext_embeddings = WordEmbeddings('en-crawl')

# Combine both embedding models into a single stacked flair embedding.
stacked_embeddings = StackedEmbeddings(embeddings=[bert_embeddings, fasttext_embeddings])

# Wrap the stacked flair embedding in a PolyFuzz `Embeddings` matcher and hand
# that matcher to PolyFuzz. PolyFuzz must receive the matcher, not the raw
# flair object; previously `embeddings` was created but never used.
embeddings = Embeddings(stacked_embeddings)
model = PolyFuzz(embeddings)
from polyfuzz import PolyFuzz
from polyfuzz.models import Embeddings
from flair.embeddings import TransformerWordEmbeddings
# Multilingual BERT embeddings from flair.
embeddings = TransformerWordEmbeddings('bert-base-multilingual-cased')

# Wrap the flair embedding in a PolyFuzz `Embeddings` matcher. The flair
# object is named `embeddings`; passing `bert` here referenced the name
# before it was assigned and raised a NameError.
bert = Embeddings(embeddings)
models = PolyFuzz(bert)

# Match with the BERT-backed model that was just built (it is bound to
# `models`, not `model`).
# NOTE(review): `from_list` and `to_list` are not defined in the visible
# source -- they are assumed to be lists of strings supplied by the caller.
models.match(from_list, to_list)
from polyfuzz.models import TFIDF
from polyfuzz import PolyFuzz
# TF-IDF matcher with its n-gram range fixed to (3, 3).
tfidf = TFIDF(n_gram_range=(3, 3))
# Use the TF-IDF matcher as PolyFuzz's matching method.
model = PolyFuzz(tfidf)
# NOTE(review): `from_list` and `to_list` are not defined in the visible
# source -- presumably lists of strings to be matched; confirm.
model.match(from_list, to_list)