Skip to content

Instantly share code, notes, and snippets.

@peakBreaker
Last active July 10, 2019 12:09
Show Gist options
  • Save peakBreaker/3a342d7df04c6b4d36d8e1831160f12a to your computer and use it in GitHub Desktop.
Save peakBreaker/3a342d7df04c6b4d36d8e1831160f12a to your computer and use it in GitHub Desktop.
A quick document analysis
"""
From documents to clusters
This script runs through a list of documents and extracts the groups the documents may belong to,
using cluster analysis and NMF, with TF*IDF for preprocessing. These are basic techniques for
unsupervised NLP which can be very handy.
"""
# Third-party: dataframe handling for the results
import pandas as pd

# For creating the data structure to process
from sklearn.feature_extraction.text import TfidfVectorizer
# For the clustering and topic extraction
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.pipeline import make_pipeline
def to_csr(documents):
    """Vectorize raw documents into a TF-IDF weighted sparse matrix.

    Parameters
    ----------
    documents : list of str
        The raw text documents to vectorize.

    Returns
    -------
    scipy.sparse.csr_matrix
        TF-IDF term-document matrix, one row per document.
    """
    # Fix: original line was missing the `def` keyword (syntax error).
    tfidf = TfidfVectorizer()
    # CSR matrices are high-efficiency data structures for sparse matrices
    csr_mat = tfidf.fit_transform(documents)
    return csr_mat
def clusterify(csr_matrice, titles=None):
    """Cluster TF-IDF document vectors via TruncatedSVD + KMeans.

    Parameters
    ----------
    csr_matrice : scipy.sparse.csr_matrix
        TF-IDF term-document matrix, e.g. from ``to_csr``.
    titles : list of str, optional
        One title/identifier per document row. When omitted, the
        resulting frame contains only the cluster labels.
        (Fix: the original referenced an undefined global ``titles``,
        which raised NameError; it is now an optional parameter so the
        existing one-argument call in ``main`` keeps working.)

    Returns
    -------
    pandas.DataFrame
        One row per document with its cluster 'label' (plus the
        'article' title when ``titles`` is given).
    """
    # Reduce dimensionality before clustering: KMeans copes poorly with
    # very high-dimensional sparse TF-IDF vectors.
    svd = TruncatedSVD(n_components=50)
    kmeans = KMeans(n_clusters=6)
    pipeline = make_pipeline(svd, kmeans)
    pipeline.fit(csr_matrice)
    # Cluster and return the data
    labels = pipeline.predict(csr_matrice)
    if titles is None:
        return pd.DataFrame({'label': labels})
    return pd.DataFrame({'label': labels, 'article': titles})
def nmfify(csr_matrice, titles):
    """Extract per-document topic features from TF-IDF vectors using NMF.

    Parameters
    ----------
    csr_matrice : scipy.sparse.csr_matrix
        TF-IDF term-document matrix, e.g. from ``to_csr``.
    titles : list of str
        One title/identifier per document row; used as the frame index.

    Returns
    -------
    pandas.DataFrame
        NMF feature matrix (documents x 6 components), indexed by title.
    """
    model = NMF(n_components=6)
    # Fix: the original fit on `csv_matrice` (typo for `csr_matrice`),
    # which raised NameError. fit_transform does both steps in one call.
    nmf_features = model.fit_transform(csr_matrice)
    # Construct the processed data
    df = pd.DataFrame(nmf_features, index=titles)
    return df
def main(documents):
    """Vectorize the given documents, cluster them, and print the result
    sorted by cluster label."""
    matrix = to_csr(documents)
    clustered = clusterify(matrix)
    print(clustered.sort_values('label'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment