scikit incremental vectorizer - debug
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Lars Buitinck <L.J.Buitinck@uva.nl>
# License: Simplified BSD
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import FeatureHasher
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np
import os
import urllib
import tarfile
import pickle
import shutil
from os import environ
from os.path import dirname
from os.path import join
from os.path import exists
from os.path import expanduser
from os.path import isdir
from os import listdir
from os import makedirs
files_per_chunk = 10


def iter_documents(top_directory, max_files_per_chunk):
    """Iterate over all .txt documents under top_directory, yielding a dict of
    {file name: document text} with at most max_files_per_chunk entries at a time."""
    doc_chunk = {}
    for root, dirs, files in os.walk(top_directory):
        for file_name in filter(lambda f: f.endswith('.txt'), files):
            # read the entire document, as one big string
            document = open(os.path.join(root, file_name)).read()
            doc_chunk[file_name] = document
            if len(doc_chunk) >= max_files_per_chunk:
                yield doc_chunk
                doc_chunk = {}
    # yield whatever is left over as a final, possibly smaller, chunk
    yield doc_chunk
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
# parse command-line arguments
op = OptionParser()
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions) "
                   "to extract from text.")

print __doc__
op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)
# Uncomment the following to do the analysis on all the categories
#categories = None
#############################
vectorizer = HashingVectorizer(n_features=opts.n_features,
                               stop_words='english',
                               non_negative=False, norm='l2',
                               binary=False)
for doc_dict in iter_documents("/home/vinayb/data/reuters-21578-subset-4315", files_per_chunk):
    # transform the docs in chunks of size 'files_per_chunk'
    X_transform_counts = vectorizer.transform(doc_dict.values())
    # X_fit_transform_counts = vectorizer.fit_transform(doc_dict.values())
    # Question 1: I don't know the class information, because this is an unsupervised
    #             learning (clustering) task, so I can't perform a partial_fit.
    # Question 2: Following on from Question 1, what should I be passing into the
    #             clustering algorithm? I would first have to incrementally
    #             accumulate data in the vectorizer.
    print "## counts: " + str(X_transform_counts.shape)  # <== I won't know the document class in advance for a clustering operation
    print vectorizer
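
# --- Not part of the original gist: a possible answer to Questions 1 and 2, ---
# --- sketched under the assumption that the goal is unsupervised clustering. ---
# HashingVectorizer is stateless, so transform() alone is enough per chunk (there
# is nothing to fit), and MiniBatchKMeans (already imported above) supports
# partial_fit, so each hashed chunk can be streamed straight into the clusterer
# instead of being accumulated in the vectorizer. n_clusters=10 is an arbitrary
# value chosen only for illustration; with k-means++ initialisation the first
# chunk passed to partial_fit must contain at least n_clusters documents.
mbk = MiniBatchKMeans(n_clusters=10, init='k-means++', batch_size=files_per_chunk)
for doc_dict in iter_documents("/home/vinayb/data/reuters-21578-subset-4315", files_per_chunk):
    X_chunk = vectorizer.transform(doc_dict.values())  # stateless: no fit needed
    if X_chunk.shape[0] == 0:
        continue  # the final chunk can be empty when the file count divides evenly
    mbk.partial_fit(X_chunk)  # update the cluster centres with this chunk
print "## cluster centres: " + str(mbk.cluster_centers_.shape)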