# peakBreaker/analyzedocs.py

## analyzedocs.py
"""
From documents to clusters

This script runs through a list of documents and works out which groups the documents may belong to.
TF-IDF is used for preprocessing, followed by cluster analysis (truncated SVD + k-means) or NMF.
These are basic techniques for unsupervised NLP which can be very handy.

"""

# For building the result tables
import pandas as pd

# For creating the data structure to process
from sklearn.feature_extraction.text import TfidfVectorizer

# For the clustering
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# For the NMF decomposition
from sklearn.decomposition import NMF

def to_csr(documents):
  """
  Turn raw documents into a TF-IDF weighted sparse matrix.

  Documents is a list of strings.  A fresh TfidfVectorizer is fitted on
  them and the matrix returned by its fit_transform is handed back
  unchanged (per the scikit-learn docs: a sparse matrix with one row per
  document and one column per learned term).
  """
  tfidf = TfidfVectorizer()
  # CSR matrices are high-efficiency data structures for sparse matrices
  csr_mat = tfidf.fit_transform(documents)
  return csr_mat

def clusterify(csr_matrice, titles=None, n_components=50, n_clusters=6):
  """
  Cluster documents with a TruncatedSVD -> KMeans pipeline.

  The csr matrice is of type scipy.sparse.csr.csr_matrix (as returned by
  to_csr).

  titles       -- optional iterable with one entry per row of the matrix;
                  when omitted, the row numbers 0..n-1 are used instead
  n_components -- number of components passed to TruncatedSVD
  n_clusters   -- number of clusters passed to KMeans

  Returns a pandas DataFrame with the columns 'label' (the cluster
  predicted for each row) and 'article' (the matching title).

  Raises ValueError when the number of titles differs from the number of
  rows that were labelled.
  """
  # Set up the pipeline: reduce the dimensionality first, then cluster
  svd = TruncatedSVD(n_components=n_components)
  kmeans = KMeans(n_clusters=n_clusters)
  pipeline = make_pipeline(svd, kmeans)
  pipeline.fit(csr_matrice)

  # Cluster the same rows the pipeline was fitted on
  labels = pipeline.predict(csr_matrice)

  # The original read an undefined global here; fall back to row numbers
  # so that callers which pass only the matrix keep working
  if titles is None:
    titles = range(len(labels))
  titles = list(titles)
  if len(titles) != len(labels):
    raise ValueError(
      'expected %d titles, got %d' % (len(labels), len(titles)))

  df = pd.DataFrame({'label': labels, 'article': titles})
  return df

def nmfify(csr_matrice, titles, n_components=6):
  """
  Runs the TFIDF results through an NMF

  csr_matrice  -- the TF-IDF matrix, e.g. the value returned by to_csr
  titles       -- labels used as the index of the returned frame, one per
                  row of the matrix
  n_components -- number of components passed to NMF

  Returns a pandas DataFrame holding the transformed NMF features, indexed
  by titles.
  """
  model = NMF(n_components=n_components)
  # Fit and transform the same matrix (the original fitted on a misspelled,
  # undefined name)
  model.fit(csr_matrice)
  nmf_features = model.transform(csr_matrice)

  # Construct the processed data
  df = pd.DataFrame(nmf_features, index=titles)
  return df

def main(documents):
  """
  Cluster the given documents and print the result ordered by label.

  Documents is a list of strings.  They are vectorised with to_csr,
  clustered with clusterify, and the resulting frame is printed sorted on
  its 'label' column.  Nothing is returned.
  """
  csr = to_csr(documents)
  df = clusterify(csr)
  print(df.sort_values('label'))