Last active
July 10, 2019 12:09
-
-
Save peakBreaker/3a342d7df04c6b4d36d8e1831160f12a to your computer and use it in GitHub Desktop.
A quick document analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
From documents to clusters

This script takes a list of documents and works out which groups they may
belong to. It uses TF-IDF for preprocessing, followed by cluster analysis and
NMF. These are basic unsupervised NLP techniques that can be very handy.
""" | |
# For building the result data frames
import pandas as pd
# For creating the data structure to process
from sklearn.feature_extraction.text import TfidfVectorizer
# For the clustering and decomposition
from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
def to_csr(documents):
    """
    Convert raw documents into a TF-IDF weighted sparse matrix.

    documents is a list of strings, one string per document.

    Returns whatever TfidfVectorizer.fit_transform produces for those
    documents. The fitted vectorizer is local to this function and is
    discarded, so the learned vocabulary is not available to the caller.
    """
    tfidf = TfidfVectorizer()
    # CSR matrices are a memory-efficient data structure for sparse matrices
    return tfidf.fit_transform(documents)
def clusterify(csr_matrice, titles=None, *, n_components=50, n_clusters=6):
    """
    Cluster documents by running TruncatedSVD followed by KMeans.

    The csr matrice is of type scipy.sparse.csr.csr_matrix, as returned by
    to_csr.

    titles is an optional sequence with one entry per row of the matrix,
    used to fill the 'article' column. When omitted, the row positions
    (0, 1, 2, ...) are used instead so that the function can be called with
    the matrix alone.

    n_components and n_clusters are forwarded to TruncatedSVD and KMeans
    respectively; the defaults match the previously hard-coded values.

    Returns a pandas DataFrame with the columns 'label' (the predicted
    cluster of each row) and 'article'.
    """
    # Set up the pipeline: reduce dimensionality first, then cluster
    svd = TruncatedSVD(n_components=n_components)
    kmeans = KMeans(n_clusters=n_clusters)
    pipeline = make_pipeline(svd, kmeans)
    pipeline.fit(csr_matrice)

    # Cluster and return the data
    labels = pipeline.predict(csr_matrice)
    if titles is None:
        # No titles supplied: identify each document by its row position
        titles = list(range(len(labels)))
    return pd.DataFrame({'label': labels, 'article': titles})
def nmfify(csr_matrice, titles, *, n_components=6):
    """
    Runs the TFIDF results through an NMF

    csr_matrice is the matrix to factorize (e.g. the output of to_csr) and
    titles is a sequence with one entry per row, used as the index of the
    result.

    n_components is forwarded to NMF; the default matches the previously
    hard-coded value.

    Returns a pandas DataFrame of the NMF features, indexed by titles.
    """
    model = NMF(n_components=n_components)
    # Fit and transform in one step on the same matrix
    nmf_features = model.fit_transform(csr_matrice)

    # Construct the processed data
    return pd.DataFrame(nmf_features, index=titles)
def main(documents):
    """
    Vectorize the documents, cluster them and print the result.

    documents is passed straight to to_csr; the frame returned by
    clusterify is printed sorted by its 'label' column.
    """
    clustered = clusterify(to_csr(documents))
    print(clustered.sort_values('label'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment