kajal yadav techykajal

## standardize_dataset.py
# Standardize the features with sklearn's StandardScaler via fit_transform().
# NOTE(review): X_scaled is assigned by the scale_dataset.py snippet, which must run first.
X_std = StandardScaler().fit_transform(X_scaled)

## scale_dataset.py
# Scale the data with the scale() function of sklearn's preprocessing module.
X_scaled = preprocessing.scale(Transposed_Dataset)
# Bare expression: presumably shown as the cell output when run in a notebook.
len(X_scaled)

## create_dataframe.py
# Create a DataFrame from the averaged word-vector feature array and
# use the n-grams as its row index.
df = pd.DataFrame(w2v_feature_array)
df.index = n_grams_to_use
# Preview the first rows.
df.head()

## word_vec_avg_feature_array.py
# Create the word-to-vector averaged feature array for the split n-grams.
# NOTE(review): averaged_word_vectorizer is not defined in the visible snippets
# (only average_word_vectors is) -- confirm it is defined elsewhere.
w2v_feature_array = averaged_word_vectorizer(corpus=ngrams_splited, model=new_word_to_vec_map,
                                             num_features=50)
# Inspect the shape of the resulting array.
w2v_feature_array.shape

## domain_embeddings.py
# Load GloVe vectors from the pre-trained domain embeddings file.
# The path is a raw string, so its backslashes are literal separators, not escapes.
glove_path = r"Generating_nGrams\Text Clustering\domain_embeddings.txt"
new_words_to_index, new_index_to_words, new_word_to_vec_map  = read_glove(glove_path)

## read_glove_vectors.py
# Function to read GloVe vectors from a text file.
def read_glove(glove_path):
    """
    Read GloVe data from the text file at ``glove_path``.

    Intended to do the following:
    1. prepare dictionary of words and vectors
    2. prepare dictionary of words and index
    3. prepare dictionary of index and words

    NOTE(review): only the ``open`` call is visible in this snippet; the body
    of the ``with`` block (parsing and the return statement) is missing and
    must be restored from the original source before this function can run.
    """
    # Read word_embedding file stored on glove_path specified.
    with open(glove_path, 'r', encoding='utf-8')as inp_file:

## avg_word_vectors.py
def average_word_vectors(list_words, model, vocabulary, num_features):
    """
    Average the word vectors of one tokenized n-gram.

    Parameters, as described by the author:
    list_words   -- one tokenized sentence (the words of a bigram or trigram)
    model        -- the mapping_of_word_to_vector dictionary
    vocabulary   -- unique set of keys (words) present in model
    num_features -- length of the feature vector (50 in this project)

    Intended to return the average of the feature vectors of the words
    present in list_words.

    NOTE(review): the visible snippet stops after allocating the accumulator;
    the averaging logic and the return statement are not shown.
    """
    # Accumulator: float64 array of zeros with num_features entries.
    feature_vector = np.zeros((num_features,),dtype="float64")

## split_nGrams_list_words.py
# Break every n-gram string into its individual words.
def split_nGrams(n_grams_to_use):
    """Return one list of whitespace-separated words per n-gram, in input order."""
    word_lists = []
    for phrase in n_grams_to_use:
        word_lists.append(phrase.split())
    return word_lists
# Split the n-grams into word lists, then check how many were produced.
ngrams_splited = split_nGrams(n_grams_to_use)
len(ngrams_splited)

## read_nGrams.py
def read_nGrams():
    """
    Read the bigrams from "bigram.txt" and the trigrams from "trigram.txt".

    Intended to return the combined list of bigrams & trigrams.

    NOTE(review): no return statement is visible in this snippet, so as shown
    the function returns None -- the combining step appears to be cut off.
    """
    # read bigrams (readFile returns the file's lines as a list)
    original_bigram = readFile("bigram.txt")
    # read trigrams
    original_trigram = readFile("trigram.txt")

## readfile.py
def readFile(fileName):
    """
    Read a text file and return its lines as a list of strings.

    fileName -- path of the file to read (opened in text read mode).

    Returns a list with one entry per line, line terminators removed.
    An empty file yields an empty list.
    """
    # A context manager guarantees the file is closed even if read() raises.
    with open(fileName, "r") as fileObj:
        return fileObj.read().splitlines()
	# Standardize the features with sklearn's StandardScaler via fit_transform().
	X_std = StandardScaler().fit_transform(X_scaled)
	# Scale the data with the scale() function of sklearn's preprocessing module.
	X_scaled = preprocessing.scale(Transposed_Dataset)
	# Bare expression: presumably shown as the cell output when run in a notebook.
	len(X_scaled)
	# Create a DataFrame from the averaged word-vector feature array and
	# use the n-grams as its row index.
	df = pd.DataFrame(w2v_feature_array)
	df.index = n_grams_to_use
	# Preview the first rows.
	df.head()
	# Create the word-to-vector averaged feature array for the split n-grams.
	# NOTE(review): averaged_word_vectorizer is not defined in the visible snippets
	# (only average_word_vectors is) -- confirm it is defined elsewhere.
	w2v_feature_array = averaged_word_vectorizer(corpus=ngrams_splited, model=new_word_to_vec_map,
	num_features=50)
	# Inspect the shape of the resulting array.
	w2v_feature_array.shape
	# Load GloVe vectors from the pre-trained domain embeddings file.
	# The path is a raw string, so its backslashes are literal separators, not escapes.
	glove_path = r"Generating_nGrams\Text Clustering\domain_embeddings.txt"
	new_words_to_index, new_index_to_words, new_word_to_vec_map = read_glove(glove_path)
	# Function to read GloVe vectors from a text file.
	# NOTE(review): this copy of the definition has lost its indentation and the
	# body of the `with` block is missing; restore both from the original source
	# before use.
	def read_glove(glove_path):
	"""
	This function will read glove data from text file and do the following:
	1. prepare dictionary of words and vectors
	2. prepare dictionary of words and index
	3. prepare dictionary of index and words
	"""
	# Read word_embedding file stored on glove_path specified.
	with open(glove_path, 'r', encoding='utf-8')as inp_file:
	# NOTE(review): this copy of the definition has lost its indentation, and it
	# stops after allocating the accumulator -- the averaging logic and the return
	# statement are not shown.
	def average_word_vectors(list_words, model, vocabulary, num_features):
	"""
	This function will take each tokenized sentence having bigrams or trigrams,
	model = the mapping_of_word_to_vector dictionary, vocabulary = unique set of keys(words) present in model,
	num_features = 50

	This function will return the average of feature vector for each word present in list_words.
	"""
	# Accumulator: float64 array of zeros with num_features entries.
	feature_vector = np.zeros((num_features,),dtype="float64")
	# Split each n-gram into separate words.
	def split_nGrams(n_grams_to_use):
		"""Return one list of whitespace-separated words per n-gram, in input order."""
		# The function body is indented under the def so the definition parses.
		return [each.split() for each in n_grams_to_use]
	# Split the n-grams into word lists, then check how many were produced.
	ngrams_splited = split_nGrams(n_grams_to_use)
	len(ngrams_splited)
	# NOTE(review): this copy of the definition has lost its indentation, and no
	# return statement is visible even though the docstring promises a combined list.
	def read_nGrams():
	"""
	This function will read bigrams & trigrams and
	return combined list of bigrams & trigrams.
	"""
	# read bigrams (readFile returns the file's lines as a list)
	original_bigram = readFile("bigram.txt")
	# read trigrams
	original_trigram = readFile("trigram.txt")
	def readFile(fileName):
		"""
		Read a text file and return its lines as a list of strings.

		fileName -- path of the file to read (opened in text read mode).

		Returns a list with one entry per line, line terminators removed.
		An empty file yields an empty list.
		"""
		# A context manager guarantees the file is closed even if read() raises.
		with open(fileName, "r") as fileObj:
			return fileObj.read().splitlines()