ASHUTOSH KUMAR (ashunigion)
@ashunigion
ashunigion / README-Template.md
Created October 20, 2018 10:21 — forked from PurpleBooth/README-Template.md
A template for making a good README.md

Project Title

One Paragraph of project description goes here

Getting Started

These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system.

Prerequisites

@ashunigion
ashunigion / sample_data.py
Last active January 26, 2019 06:06
Uses Boolean indexing to retrieve the data points with a particular label
import numpy as np
from sklearn.cluster import KMeans

# assumes train_data / train_labels from the training set are already loaded
# Boolean indexing: keep only the rows whose label matches
label_0 = train_data[train_labels==0]
label_1 = train_data[train_labels==1]
# it can be repeated for all the intermediate labels
# ...
label_9 = train_data[train_labels==9]

# fit a 10-cluster KMeans per class and keep the cluster centers as prototypes
kmeans_0 = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(label_0)
sample_0 = kmeans_0.cluster_centers_
kmeans_1 = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(label_1)
sample_1 = kmeans_1.cluster_centers_
# the code can be repeated for all the intermediate labels
# ...
kmeans_9 = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(label_9)
sample_9 = kmeans_9.cluster_centers_
proto_sample = np.concatenate((sample_0, sample_1, sample_2, sample_3,
                               sample_4, sample_5, sample_6, sample_7,
                               sample_8, sample_9), axis=0)
# generating the labels for the prototyped training data
sample_0_labels = np.full((10,), 0)
sample_1_labels = np.full((10,), 1)
# similarly the intermediate labels can be generated
# ...
sample_9_labels = np.full((10,), 9)
# stacking the labels into a single array
proto_labels = np.concatenate((sample_0_labels, sample_1_labels,
                               sample_2_labels, sample_3_labels,
                               sample_4_labels, sample_5_labels,
                               sample_6_labels, sample_7_labels,
                               sample_8_labels, sample_9_labels), axis=0)
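
A quick way to check the prototype set is to train a 1-nearest-neighbour classifier on it; the sketch below is not part of the gist and assumes held-out test_data / test_labels arrays loaded like train_data / train_labels.

from sklearn.neighbors import KNeighborsClassifier

# sanity-check sketch (not from the gist): 1-NN trained on the 100 prototypes;
# test_data / test_labels are assumed to exist alongside the training split
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(proto_sample, proto_labels)
print("prototype 1-NN accuracy:", knn.score(test_data, test_labels))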
# the following snippet was provided as starter code by UCSD for the course
# Machine Learning Fundamentals on edX
def rand_prototypes(M):
    """
    Returns M randomly sampled data points and their corresponding labels.
    Parameters:
        M (int): number of data points to be sampled
    Returns:
        (ndarray, ndarray): the sampled data points and their labels
    """
    # body reconstructed from the docstring; train_data / train_labels as above
    indices = np.random.choice(len(train_data), M, replace=False)
    return train_data[indices], train_labels[indices]
# the following code snippet was provided by UCSD for Machine Learning Fundamentals on edX
@interact_manual(M=(100, 2000, 100), rounds=(1, 10))
def comparison(M, rounds):
    """
    Shows the mean error of both prototyping methods. Since the random method
    picks data points by chance, it makes sense to repeat the sampling several
    times and report the mean error.
    Parameters:
        M (int): number of data points to be sampled
        rounds (int): number of random draws used to compute the mean error
    """
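
The body of comparison is cut off in the gist preview. Below is a minimal sketch of the comparison it describes, assuming a hypothetical mean_error helper and the test split from the previous sketch, neither of which is in the gist.

from sklearn.neighbors import KNeighborsClassifier

def mean_error(prototypes, labels):
    # hypothetical helper: 1-NN test error of a given prototype set
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(prototypes, labels)
    return 1.0 - knn.score(test_data, test_labels)

M, rounds = 1000, 5
# random prototyping is repeated `rounds` times and the errors averaged
rand_errors = [mean_error(*rand_prototypes(M)) for _ in range(rounds)]
print("random prototypes, mean error:", np.mean(rand_errors))
print("KMeans prototypes, error:", mean_error(proto_sample, proto_labels))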
import nltk
nltk.download('reuters')
from nltk.corpus import reuters

def read_corpus(category="crude"):
    """ Reads files from the specified Reuters category and adds
        <START> and <END> tokens to the beginning and end of each document.
        Params:
            category (string): category name
        Return:
            list of lists, each document being a list of lowercased tokens
    """
    # body reconstructed to match the docstring and the sample output below
    files = reuters.fileids(category)
    return [['<START>'] + [w.lower() for w in list(reuters.words(f))] + ['<END>']
            for f in files]
import pprint

reuters_corpus = read_corpus()
pprint.pprint(reuters_corpus[:3], compact=True, width=100)
# first three documents from the corpus (output truncated):
# [['<START>', 'japan', 'to', 'revise', 'long', '-', 'term', 'energy', 'demand', 'downwards', 'the',
# 'ministry', 'of', 'international', 'trade', 'and', 'industry', '(', 'miti', ')', 'will', 'revise',
# 'its', 'long', '-', 'term', 'energy', 'supply', '/', 'demand', 'outlook', 'by', 'august', 'to',
# 'meet', 'a', 'forecast', 'downtrend', 'in', 'japanese', 'energy', 'demand', ',', 'ministry',
# 'officials', 'said', '.', 'miti', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for',
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    # the starter code only initialised the outputs; the lines below complete it
    corpus_words = sorted(set(word for document in corpus for word in document))
    num_corpus_words = len(corpus_words)
    return corpus_words, num_corpus_words
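
A short usage sketch, reusing the reuters_corpus built above:

# vocabulary of the Reuters corpus read earlier
words, num_words = distinct_words(reuters_corpus)
print(num_words, words[:10])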