Skip to content

Instantly share code, notes, and snippets.

@FeryET
Last active August 26, 2020 11:58
Show Gist options
  • Save FeryET/b317597d5f8459046ce73dd96dc1ea85 to your computer and use it in GitHub Desktop.
Save FeryET/b317597d5f8459046ce73dd96dc1ea85 to your computer and use it in GitHub Desktop.
import logging
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import (RepeatedStratifiedKFold, cross_val_score, )
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from tomotopy import HDPModel
from lda_classification.model import TomotopyLDAVectorizer
from lda_classification.preprocess.spacy_cleaner import SpacyCleaner
#############################################
# Script-wide settings: logging setup and tunable parameters.

# Emit records at INFO level and above, formatted as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Number of workers used throughout the project.
workers = 4

# Set to True to use UMAP for the visualizations.
use_umap = False

# Minimum document frequency in the corpus.
min_df = 5

# Remove the top n most frequent words.
rm_top = 5
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment