# kachok/tweets_es_pickleit.py

## tweets_es_pickleit.py
import codecs
import pickle


# Input: one "<frequency> <word>" entry per line, UTF-8 encoded.
VOCAB_PATH = "/Users/dkachaev/repos/hltcoe/tweets-es/data/oov.vocab"
# Output: pickled dict mapping word -> {"frequency": int, "context": [str]}.
PICKLE_PATH = "/Users/dkachaev/repos/hltcoe/tweets-es/data/tweets_es_vocabulary.pickle"


def build_vocab(lines):
    """Build the vocabulary dict from an iterable of "<frequency> <word>" lines.

    Each line is stripped and split on a single space; it must yield exactly
    two fields, the first of which parses as an integer.  Lines that do not
    match are reported with "skipping line" and ignored.  When a word occurs
    more than once, the last occurrence wins.

    Args:
        lines: iterable of text lines (e.g. an open text file).

    Returns:
        dict mapping each word to {"frequency": <int>, "context": [""]}.
    """
    vocab = {}
    for line in lines:
        try:
            # split(" ") (not split()) on purpose: a line with extra fields or
            # doubled spaces is rejected rather than silently re-interpreted.
            freq, word = line.strip().split(" ")
            frequency = int(freq)
        except ValueError:
            # Wrong number of fields, or a non-numeric frequency.
            print("skipping line")
            continue

        # Context - "" <- need text of original tweet where word occurred,
        # or 3 tweets ["tweet1", "tweet2", "tweet3"]
        vocab[word] = {"frequency": frequency, "context": [""]}
    return vocab


def main():
    """Read VOCAB_PATH, build the vocabulary and pickle it to PICKLE_PATH."""
    # Parse everything first so a read/decode failure does not leave a
    # truncated pickle file behind.
    with codecs.open(VOCAB_PATH, "r", "utf-8") as source:
        vocab = build_vocab(source)

    # Pickle streams are bytes: the file must be opened in binary mode.
    with open(PICKLE_PATH, "wb") as sink:
        pickle.dump(vocab, sink)


if __name__ == "__main__":
    main()