Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
word2vec example using tweet data
import pandas as pd
import re
import numpy as np
import nltk
import gensim
# Word2vec example on tweet data.
#
# Reads a tab-separated file with an identifier column and a 'tweet' column,
# normalises the tweet text, tokenizes it with NLTK, trains a gensim Word2Vec
# model and prints a few example similarity queries.

# Import data. DataFrame.from_csv was removed in pandas 1.0; read_csv with the
# same sep/index_col arguments is the supported replacement.
tweets = pd.read_csv('tweets.txt', sep='\t', index_col=False)

# --- Data prep / cleaning ---
# Drop missing tweets first: a NaN would reach the tokenizer as a float.
# Then lower-case everything.
clean = tweets['tweet'].dropna().str.lower()

# Untranslated HTML entities (&amp; / &quot;). Match the entity itself rather
# than the bare letters, so that ordinary words such as "camp" or "quote" are
# not mangled. The trailing ';' is optional in case it was lost upstream.
clean = clean.str.replace(r'&(?:amp|quot);?', ' ', regex=True)

# Keep only word characters, whitespace and apostrophes. regex=True is passed
# explicitly: since pandas 2.0 str.replace treats the pattern literally by
# default, which would silently disable this step.
clean = clean.str.replace(r'[^\w\s\']', '', regex=True)

# Remove numerics.
clean = clean.str.replace(r'[\d]', '', regex=True)

# --- Tokenize and train ---
sentences = clean.tolist()
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

# min_count=10: words seen fewer than 10 times are left out of the vocabulary.
model = gensim.models.Word2Vec(tokenized_sentences, min_count=10)

# --- Example queries ---
# Similarity queries live on the KeyedVectors object (model.wv); the methods
# on the Word2Vec model itself were removed in gensim 4. Results are printed
# so they are visible when this runs as a script and not only in a REPL.
# Note: querying a word that is not in the vocabulary raises KeyError.
word_vectors = model.wv
print(word_vectors.most_similar(positive=['moon'], topn=1))
print(word_vectors.most_similar(positive=['moon'], negative=['poor'], topn=5))
print(word_vectors.most_similar(positive=['moon', 'bench'], topn=5))
print(word_vectors.similarity('john', 'lewis'))
print(word_vectors.similarity('bench', 'moon'))
@rajacsp

This comment has been minimized.

Copy link

commented Sep 23, 2018

It would be great if you could share the location of the 'tweets.txt' file.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.