Skip to content

Instantly share code, notes, and snippets.

@FeryET
Created August 26, 2020 10:06
Show Gist options
  • Save FeryET/a43cae04028a0ea8bd93e3ab9ccd43cc to your computer and use it in GitHub Desktop.
Save FeryET/a43cae04028a0ea8bd93e3ab9ccd43cc to your computer and use it in GitHub Desktop.
Gists for Medium Article: Text Classification using LDA
# Clean the raw newsgroup posts before topic modelling.
# SpacyCleaner is a project helper (lda_classification.preprocess.spacy_cleaner);
# presumably it processes the texts in batches of `chunksize` documents using
# `workers` parallel workers -- TODO confirm against its implementation.
processor = SpacyCleaner(chunksize=1000, workers=workers)
# `raw_docs` is the collection of texts returned by fetch_20newsgroups.
docs = processor.transform(raw_docs)
import argparse
import logging
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tomotopy import HDPModel
from lda_classification.model import TomotopyLDAVectorizer
from lda_classification.preprocess.spacy_cleaner import SpacyCleaner
#############################################
# Global configuration shared by the other snippets of this script.
# Every log record is emitted as "<time> : <level> : <message>" at INFO level or above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
workers = 4 # Number of workers passed to SpacyCleaner, HDPModel.train and TomotopyLDAVectorizer
use_umap = False # Set to True to use UMAP instead of t-SNE for the 2-D visualization
min_df = 5 # Forwarded as `min_df` to both HDPModel and TomotopyLDAVectorizer
rm_top = 5 # Forwarded as `rm_top` to both HDPModel and TomotopyLDAVectorizer
# Project the document vectors to 2-D for plotting.
# The title only names the reducer that is actually used (UMAP or t-SNE);
# no PCA is performed anywhere in this script.
title = "2D Visualization of the Dataset using {}"
if use_umap:
    # Imported lazily so that umap-learn is only required when it is selected.
    from umap import UMAP

    dim_reducer = UMAP(n_components=2)
    title = title.format("UMAP")
else:
    from sklearn.manifold import TSNE

    dim_reducer = TSNE(n_components=2)
    title = title.format("TSNE")

# Train and test vectors are stacked and embedded together, so the reducer is
# fitted once and both subsets end up in the same 2-D coordinate system.
x_transform = np.concatenate((x_train, x_test))
x_transform = StandardScaler().fit_transform(x_transform)
x_transform = dim_reducer.fit_transform(x_transform)
# Split the joint embedding back into its train rows and test rows.
x2d_train = x_transform[:x_train.shape[0], :]
x2d_test = x_transform[x_train.shape[0]:, :]
def plot_topic_clusters(ax, x2d, y, labels):
    """Scatter-plot 2-D points on ``ax`` using one colour per class.

    Args:
        ax: Matplotlib axes to draw on.
        x2d: 2-D array whose columns 0 and 1 hold the x and y coordinates
            of each sample.
        y: Integer class index of each sample (array-like); sample ``k``
            belongs to class ``i`` when ``y[k] == i``.
        labels: Sequence of class names; ``labels[i]`` is the legend entry
            for class index ``i``.

    Returns:
        The same ``ax`` object, to allow chaining.
    """
    # A plain list compared with an int yields a single False instead of an
    # element-wise mask, which would silently plot nothing.
    y = np.asarray(y)
    ax.set_aspect("equal")
    # plt.get_cmap(name, lut) is available on all Matplotlib releases, while
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
    colors = plt.get_cmap("Spectral", len(labels))
    for i, label in enumerate(labels):
        mask = y == i
        # An integer argument selects the i-th of the len(labels) resampled
        # colours directly, with no float rounding involved.
        ax.scatter(x2d[mask, 0], x2d[mask, 1], color=colors(i), label=label, alpha=0.7)
    ax.grid()
    ax.legend(prop={'size': 6})
    ax.autoscale()
    return ax
# Draw the train and test embeddings side by side in a 3000x1500 pixel figure.
dpi = 300
fig, axes = plt.subplots(ncols=2, figsize=(3000 / dpi, 1500 / dpi), dpi=dpi)
# One (points, targets, caption) triple per subplot, left to right.
panels = (
    (x2d_train, y_train, "Train Subset"),
    (x2d_test, y_test, "Test Subset"),
)
for ax, (points, targets, caption) in zip(axes, panels):
    plot_topic_clusters(ax, points, targets, labels)
    ax.set_title(caption)
fig.suptitle(title)
fig.tight_layout()
plt.show()
# The four 20-newsgroups categories used as classes in this experiment.
# NOTE(review): plot_topic_clusters pairs integer target i with labels[i], so
# this list presumably has to be in the same order as the dataset's target
# names -- confirm before reordering or adding categories.
labels = ["rec.autos", "rec.motorcycles", "rec.sport.baseball", "rec.sport.hockey"]
# Load the posts of those categories from the 'all' subset; with
# return_X_y=True the call yields a (documents, targets) pair.
raw_docs, y = fetch_20newsgroups(subset='all', return_X_y=True, categories=labels)
# Fit an HDP topic model on the training documents; the number of topics it
# reports as live afterwards is kept in `num_of_topics`.
hdp_model = HDPModel(min_df=min_df, rm_top=rm_top)
hdp_model.optim_interval = 5
for doc in docs_train:
    hdp_model.add_doc(doc)
hdp_model.burn_in = 100
# Zero-iteration call; presumably initializes the model before the real
# training rounds -- confirm with the tomotopy documentation.
hdp_model.train(0, workers=workers)
# Train in rounds of `step` iterations and report progress after each round.
step = 10
for iteration in range(0, 1000, step):
    hdp_model.train(step, workers=workers)
    print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(iteration, hdp_model.ll_per_word, hdp_model.live_k))
num_of_topics = hdp_model.live_k
# Hold out 10% of the cleaned documents (shuffled) as the test subset.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs -- pass one if reproducible results are needed.
docs_train, docs_test, y_train, y_test = train_test_split(docs, y, test_size=0.1, shuffle=True)
# Build the LDA vectorizer with the topic count taken from the HDP model
# (`num_of_topics`) and the same min_df / rm_top settings used for that model.
vectorizer = TomotopyLDAVectorizer(num_of_topics=num_of_topics, workers=workers, min_df=min_df, rm_top=rm_top)
# Fit on the training documents only, then reuse the fitted vectorizer for
# the test documents so no test data is used during fitting.
x_train = vectorizer.fit_transform(docs_train)
x_test = vectorizer.transform(docs_test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment