Last active
November 10, 2022 12:24
-
-
Save Abhayparashar31/137eb981c3e63cd12ed055a05ad7d2ba to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Train Word2Vec embeddings on a Twitter dataset and query them.
## Pipeline: load CSV -> clean each tweet -> tokenize -> train -> inspect.
import re
import nltk
import pandas as pd  # required by pd.read_csv below; was missing from the original
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

## Loading dataset from the github repository raw url
df = pd.read_csv('https://raw.githubusercontent.com/Abhayparashar31/datasets/master/twitter.csv')

## Cleaning the text with the help of an external python file containing a cleaning function.
## 'clean_text' is defined in a separate gist:
## https://gist.github.com/Abhayparashar31/81997c2e2268338809c46a220d08649f
corpus = [clean_text(df['text'][i]) for i in range(len(df))]

## Word2Vec expects each document as a list of tokens, not a single string.
corpus_splitted = [doc.split() for doc in corpus]

## Generating Word Embeddings
from gensim import models
w2v = models.Word2Vec(corpus_splitted)

## Vector representation of the word 'flood'.
## NOTE: direct model indexing (w2v['flood']) was removed in gensim 4.x —
## word vectors live on the .wv KeyedVectors attribute.
print(w2v.wv['flood'])

## 5 most similar words for word 'flood'
## (topn=5 replaces slicing the default top-10 result; the original slice
## `[:5)` was a syntax error with mismatched brackets.)
print(w2v.wv.most_similar('flood', topn=5))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment