thmavri/RNN_word_embeddings_Named_Entity_Classification_sample.py

## RNN_word_embeddings_Named_Entity_Classification_sample.py
#create an index of all the words (words2idx)
...
#create an index of all the labels (labels2idx)
...
# Query to classify; its three words are indexed by hand below.
q = "hotel amsterdam wifi"

# Word indexes for the query and, position by position, the index of the
# label looked up for each word ('proptype', 'dest', 'fac').
words = [words2idx["hotel"], words2idx["amsterdam"], words2idx["wifi"]]
labels = [labels2idx['proptype'], labels2idx['dest'], labels2idx['fac']]

# A sentence is [words, labels, labels]: the same labels list is stored
# twice, presumably to fill a three-slot layout expected by the elided
# code -- TODO confirm.
sentence = [words, labels, labels]

# Training data: a list of sentences, holding only this one so far.
matrix = [sentence]

...
# Settings for the RNN, kept in one dict and read by key below.
s = dict(
    fold=3,                 # 5 folds 0,1,2,3,4
    lr=0.0627142536696559,
    verbose=1,
    decay=False,            # decay on the learning rate
    win=7,                  # number of words in the context window
    bs=9,                   # number of backprop through time steps
    nhidden=100,            # number of hidden units
    seed=345,
    emb_dimension=100,      # dimension of word embedding
    nepochs=50,
)

# Seed both numpy's and the stdlib's random generators with the same value
# before the model is created.
numpy.random.seed(s['seed'])
random.seed(s['seed'])

# Instantiate the model. `model`, `nclasses` and `vocsize` are not defined
# in this excerpt; they must come from the elided code.
rnn = model(nh=s['nhidden'], nc=nclasses, ne=vocsize,
            de=s['emb_dimension'], cs=s['win'])
#separate in train, test, validation
...
#train
# One pass over the training sentences: every sentence is turned into
# context windows, grouped into minibatches, and fed to the model one
# (word batch, label) pair at a time.
# NOTE(review): `e`, `tic`, `s['clr']`, `nsentences`, `train_lex` and
# `train_y` are not defined in this excerpt; presumably they are set by the
# elided epoch loop / data split -- confirm before running.
for i in xrange(nsentences):
    # tolist()[0] keeps only the first row -- assumes train_lex[i] is 2-D
    # with the sentence's word indexes in row 0 (TODO confirm).
    train_lex_list = train_lex[i].tolist()[0]
    cwords = contextwin(train_lex_list, s['win'])
    # Each minibatch becomes an int32 array before being passed to the model.
    words = [numpy.asarray(x).astype('int32')
             for x in minibatch(cwords, s['bs'])]
    labels = train_y[i]
    cnt_w = 0  # debug counter; nothing in this loop reads it
    for word_batch, label_last_word in zip(words, labels):
        # The update and the normalize call belong inside the per-word loop;
        # the original had lost this indentation.
        rnn.train(word_batch, label_last_word, s['clr'])
        rnn.normalize()
    if s['verbose']:
        # Progress line ends in '\r' with no newline, so each update
        # overwrites the previous one on the terminal.
        sys.stdout.write(
            '[learning] epoch %i >> %2.2f%% completed in %.2f (sec) <<\r'
            % (e, (i + 1) * 100. / nsentences, time.time() - tic))
        sys.stdout.flush()
	#create an index of all the words (words2idx)
	...
	#create an index of all the labels (labels2idx)
	...
	# Query to classify; its three words are indexed by hand below.
	q = "hotel amsterdam wifi"

	# Word indexes for the query and, position by position, the index of the
	# label looked up for each word ('proptype', 'dest', 'fac').
	words = [words2idx["hotel"], words2idx["amsterdam"], words2idx["wifi"]]
	labels = [labels2idx['proptype'], labels2idx['dest'], labels2idx['fac']]

	# A sentence is [words, labels, labels]: the same labels list is stored
	# twice, presumably to fill a three-slot layout expected by the elided
	# code -- TODO confirm.
	sentence = [words, labels, labels]

	# Training data: a list of sentences, holding only this one so far.
	matrix = [sentence]

	...
	# Settings for the RNN, kept in one dict and read by key below.
	s = dict(
		fold=3,                 # 5 folds 0,1,2,3,4
		lr=0.0627142536696559,
		verbose=1,
		decay=False,            # decay on the learning rate
		win=7,                  # number of words in the context window
		bs=9,                   # number of backprop through time steps
		nhidden=100,            # number of hidden units
		seed=345,
		emb_dimension=100,      # dimension of word embedding
		nepochs=50,
	)

	# Seed both numpy's and the stdlib's random generators with the same value
	# before the model is created.
	numpy.random.seed(s['seed'])
	random.seed(s['seed'])

	# Instantiate the model. `model`, `nclasses` and `vocsize` are not defined
	# in this excerpt; they must come from the elided code.
	rnn = model(
		nh=s['nhidden'],
		nc=nclasses,
		ne=vocsize,
		de=s['emb_dimension'],
		cs=s['win'],
	)
	#separate in train, test, validation
	...
	#train
	# One pass over the training sentences: every sentence is turned into
	# context windows, grouped into minibatches, and fed to the model one
	# (word batch, label) pair at a time.
	# NOTE(review): `e`, `tic`, `s['clr']`, `nsentences`, `train_lex` and
	# `train_y` are not defined in this excerpt; presumably they are set by the
	# elided epoch loop / data split -- confirm before running.
	for i in xrange(nsentences):
		# tolist()[0] keeps only the first row -- assumes train_lex[i] is 2-D
		# with the sentence's word indexes in row 0 (TODO confirm).
		train_lex_list = train_lex[i].tolist()[0]
		cwords = contextwin(train_lex_list, s['win'])
		# Each minibatch becomes an int32 array before being passed to the model.
		words = [
			numpy.asarray(x).astype('int32')
			for x in minibatch(cwords, s['bs'])
		]
		labels = train_y[i]
		cnt_w = 0  # debug counter; nothing in this loop reads it
		for word_batch, label_last_word in zip(words, labels):
			# The update and the normalize call belong inside the per-word
			# loop; the original had lost all of its nesting.
			rnn.train(word_batch, label_last_word, s['clr'])
			rnn.normalize()
		if s['verbose']:
			# Progress line ends in '\r' with no newline, so each update
			# overwrites the previous one on the terminal.
			sys.stdout.write(
				'[learning] epoch %i >> %2.2f%% completed in %.2f (sec) <<\r'
				% (e, (i + 1) * 100. / nsentences, time.time() - tic))
			sys.stdout.flush()