This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bertopic import BERTopic

# Create a topic model with its language option set to Dutch.
model = BERTopic(language="Dutch")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> model.get_topic(49)
[('windows', 0.006152228076250982),
('drive', 0.004982897610645755),
('dos', 0.004845038866360651),
('file', 0.004140142872194834),
('disk', 0.004131678774810884),
('mac', 0.003624848635985097),
('memory', 0.0034840976976789903),
('software', 0.0034415334250699077),
('email', 0.0034239554442333257),
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> model.get_topic_freq().head()
Topic Count
-1 7288
49 3992
30 701
27 684
11 568
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

# Fetch every subset of the 20 Newsgroups corpus, asking the loader to remove
# headers, footers and quotes, and keep only the raw document texts.
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']

# Fit a default topic model on the documents; fit_transform returns two values,
# unpacked here as `topics` and `probabilities`.
model = BERTopic()
topics, probabilities = model.fit_transform(docs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from rapidfuzz import fuzz | |
from polyfuzz.models import BaseMatcher | |
class MyModel(BaseMatcher): | |
def match(self, from_list, to_list): | |
# Calculate distances | |
matches = [[fuzz.ratio(from_string, to_string) / 100 for to_string in to_list] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): `model` is not defined in this snippet; presumably a fitted
# PolyFuzz instance from an earlier snippet -- confirm.
model.visualize_precision_recall()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from polyfuzz import PolyFuzz
from polyfuzz.models import Embeddings, TFIDF, RapidFuzz
from flair.embeddings import WordEmbeddings

# Wrap the 'en-crawl' flair word embeddings in a PolyFuzz Embeddings matcher.
fasttext_embeddings = WordEmbeddings('en-crawl')
fasttext = Embeddings(fasttext_embeddings, min_similarity=0, model_id="FastText")

# The other two matchers; each gets its own model_id and a zero
# similarity/score cutoff.
tfidf = TFIDF(min_similarity=0, model_id="TF-IDF")
rapidfuzz = RapidFuzz(n_jobs=-1, score_cutoff=0, model_id="RapidFuzz")

# Collect the three matchers in one list.
matchers = [tfidf, fasttext, rapidfuzz]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from polyfuzz import PolyFuzz
from polyfuzz.models import Embeddings
from flair.embeddings import TransformerWordEmbeddings, WordEmbeddings, StackedEmbeddings

# Combine two flair embedding models into a single stacked embedding.
bert_embeddings = TransformerWordEmbeddings('bert-base-multilingual-cased')
fasttext_embeddings = WordEmbeddings('en-crawl')
stacked_embeddings = StackedEmbeddings(embeddings=[bert_embeddings, fasttext_embeddings])

# Wrap the stacked flair embeddings in a PolyFuzz matcher and hand that wrapper
# (not the raw flair object) to PolyFuzz; previously the wrapper was built but
# never used.
embeddings = Embeddings(stacked_embeddings)
model = PolyFuzz(embeddings)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from polyfuzz import PolyFuzz
from polyfuzz.models import Embeddings
from flair.embeddings import TransformerWordEmbeddings

# Wrap the flair transformer embeddings in a PolyFuzz Embeddings matcher.
# The wrapper must receive `embeddings`; the earlier `Embeddings(bert)` read
# `bert` before it was assigned and raised NameError.
embeddings = TransformerWordEmbeddings('bert-base-multilingual-cased')
bert = Embeddings(embeddings)

# Bind the instance to `model` so the match call below refers to it
# (it was previously bound to `models`).
model = PolyFuzz(bert)

# NOTE(review): from_list / to_list are not defined in this snippet;
# presumably lists of strings prepared by the caller -- confirm.
model.match(from_list, to_list)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from polyfuzz.models import TFIDF
from polyfuzz import PolyFuzz

# Build a TF-IDF matcher whose n-gram range is fixed to (3, 3).
tfidf = TFIDF(n_gram_range=(3, 3))
model = PolyFuzz(tfidf)

# NOTE(review): from_list / to_list are not defined in this snippet;
# presumably lists of strings prepared by the caller -- confirm.
model.match(from_list, to_list)