Skip to content

Instantly share code, notes, and snippets.

View megha444's full-sized avatar

Megha Agarwal megha444

  • Montreal, Canada
View GitHub Profile
# Shell prerequisite (run in a terminal, not in Python): pip install glove_python
import nltk
# Download the NLTK data packages used by the steps below, in the same order
# as before: the stop-word lists, the Punkt tokenizer models and WordNet.
for resource_name in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource_name)
# Sample sentences used throughout this walkthrough.
lines = [
    "Hello this is a tutorial to convert word to integer",
    "It is a beautiful day",
    "Jack is going to office",
]

from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenize every sentence: word_tokens[k] holds the tokens of lines[k].
# Building the list directly replaces the manual counter + insert(), which
# only ever appended at the end.
word_tokens = [word_tokenize(line) for line in lines]
print(word_tokens)
from nltk.corpus import stopwords

# A set gives O(1) membership tests; the NLTK list is all lower-case.
stop_words = set(stopwords.words('english'))

# Per-sentence token lists with stop words dropped. Tokens are lower-cased
# only for the comparison, so capitalised stop words such as "It" are
# removed too while the surviving tokens keep their original casing.
lines_without_stopwords = [
    [word for word in tokens if word.lower() not in stop_words]
    for tokens in word_tokens
]

# Flat list of every remaining token across all sentences, in order.
stop_removed = [word for kept in lines_without_stopwords for word in kept]
print(stop_removed)
# Both imports bind the same name; nltk.stem is the canonical location.
from nltk import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatized tokens grouped per sentence, mirroring lines_without_stopwords.
lines_with_lemmas = [
    [wordnet_lemmatizer.lemmatize(word) for word in kept]
    for kept in lines_without_stopwords
]

# Flat list of lemmas for every token that survived stop-word removal.
# A single pass is enough: the result does not depend on the sentence count.
lem_line = [wordnet_lemmatizer.lemmatize(word) for word in stop_removed]

# Comma-separated string of all lemmas (lemmatize() already returns str).
new_lines = ','.join(lem_line)
# Import the GloVe library.
from glove import Corpus, Glove

# Corpus.fit expects an iterable of token lists. new_lines is one
# comma-joined string, so passing it directly would make every character a
# "sentence"; split it back into tokens and wrap them as a single sentence.
sentences = [new_lines.split(',')]

# Build the co-occurrence matrix that GloVe is trained on.
corpus = Corpus()
corpus.fit(sentences, window=10)

# Create the model. GloVe is trained by gradient descent, so the learning
# rate and the embedding size (no_components) are configurable.
glove = Glove(no_components=5, learning_rate=0.05)

# Train the embeddings; without this step word_vectors is never populated.
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

# Look up a word that actually occurs in the sample sentences, and guard the
# lookup so an out-of-vocabulary word reports cleanly instead of raising
# KeyError.
query_word = 'tutorial'
if query_word in glove.dictionary:
    print(glove.word_vectors[glove.dictionary[query_word]])
else:
    print('%r is not in the GloVe vocabulary' % query_word)
# Read the review text and the labels; each file is loaded whole into a
# single string. Plain ASCII quotes replace the typographic ones, which are
# not valid Python string delimiters.
with open('data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('data/labels.txt', 'r') as f:
    labels = f.read()

# Print a short prefix of each file as a sanity check.
print(reviews[:50])
print()
print(labels[:26])
# Read the review text and the labels; each file is loaded whole into a
# single string. Plain ASCII quotes replace the typographic ones, which are
# not valid Python string delimiters.
with open('data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('data/labels.txt', 'r') as f:
    labels = f.read()