Skip to content

Instantly share code, notes, and snippets.

View techykajal's full-sized avatar
🎯
Focusing

kajal yadav techykajal

🎯
Focusing
View GitHub Profile
# Standardize the features using sklearn's StandardScaler() and its fit_transform() method.
X_std = StandardScaler().fit_transform(X_scaled)
# Used preprocessing module of sklearn library to scale data.
X_scaled = preprocessing.scale(Transposed_Dataset)
len(X_scaled)
# Create a DataFrame from the feature array, indexed by the n-grams
df = pd.DataFrame(w2v_feature_array)
df.index = n_grams_to_use
df.head()
# Create word to vector averaged feature array
w2v_feature_array = averaged_word_vectorizer(corpus=ngrams_splited, model=new_word_to_vec_map,
num_features=50)
w2v_feature_array.shape
# load glove vectors from pre-trained model domain dataset
glove_path = r"Generating_nGrams\Text Clustering\domain_embeddings.txt"
new_words_to_index, new_index_to_words, new_word_to_vec_map = read_glove(glove_path)
# function to read glove vectors from text file
def read_glove(glove_path):
"""
This function will read glove data from text file and do the following:
1. prepare dictionary of words and vectors
2. prepare dictionary of words and index
3. prepare dictionary of index and words
"""
# Read word_embedding file stored on glove_path specified.
with open(glove_path, 'r', encoding='utf-8')as inp_file:
def average_word_vectors(list_words, model, vocabulary, num_features):
"""
This function will take each tokenized sentence having bigrams or trigrams,
model = the mapping_of_word_to_vector dictionary, vocabulary = unique set of keys(words) present in model,
num_features = 50
This function will return the average of feature vector for each word present in list_words.
"""
# Created array of zeros (type float) of size num_features, i.e., 50.
feature_vector = np.zeros((num_features,),dtype="float64")
# split each n-gram into separate words
def split_nGrams(n_grams_to_use):
    """
    This function will split each n-gram into its separate words.
    n_grams_to_use = iterable of n-gram strings (e.g. "new york").
    Each string is split on runs of whitespace (str.split with no
    arguments), so an empty string yields an empty list.
    This function will return a list with one list of words per n-gram,
    in the same order as the input.
    """
    return [each.split() for each in n_grams_to_use]
ngrams_splited = split_nGrams(n_grams_to_use)
len(ngrams_splited)
def read_nGrams():
"""
This function will read bigrams & trigrams and
return combined list of bigrams & trigrams.
"""
# read bigrams
original_bigram = readFile("bigram.txt")
# read trigrams
original_trigram = readFile("trigram.txt")
def readFile(fileName, encoding=None):
    """
    This function will read the text file passed & return its lines as a list.
    fileName = path of the text file to read.
    encoding = text encoding handed to open(); the default (None) keeps
    open()'s platform-dependent default, so existing callers are unaffected.
    Line terminators are removed (str.splitlines), so an empty file
    returns an empty list.
    """
    # The context manager closes the file even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(fileName, "r", encoding=encoding) as fileObj:
        return fileObj.read().splitlines()