@rgtjf
Last active July 12, 2017
import random
import numpy as np
import re

def make_batches(size, batch_size):
    """
    :param size: the total number of instances in the dataset
    :param batch_size: the number of instances per batch
    :return: list of (start, end) index spans:
             [(0, batch_size), (batch_size, 2*batch_size), ..., ((nb_batch-1)*batch_size, size)]
    """
    nb_batch = int(np.ceil(size / float(batch_size)))
    return [(i * batch_size, min(size, (i + 1) * batch_size)) for i in range(0, nb_batch)]
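# Example (hypothetical values): make_batches(10, 3) returns
# [(0, 3), (3, 6), (6, 9), (9, 10)] -- the final span is truncated to the dataset size.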

def pad_2d_matrix(batch_words, max_sent_length=None, dtype=np.int32):
    """
    :param batch_words: [batch_size, sent_length], a list of variable-length index lists
    :param max_sent_length: pad/truncate every row to this length; if None, use max(sent_length)
    :param dtype: dtype of the returned array
    :return: padding_words: [batch_size, max_sent_length], zero-padded on the right
    """
    if max_sent_length is None:
        max_sent_length = np.max([len(words) for words in batch_words])
    batch_size = len(batch_words)
    padding_words = np.zeros((batch_size, max_sent_length), dtype=dtype)
    for i in range(batch_size):
        words = batch_words[i]
        kept_length = len(words)
        if kept_length > max_sent_length:
            kept_length = max_sent_length
        padding_words[i, :kept_length] = words[:kept_length]
    return padding_words
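# Example (hypothetical values): pad_2d_matrix([[1, 2], [3, 4, 5]]) returns
# array([[1, 2, 0],
#        [3, 4, 5]], dtype=int32)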

def pad_3d_tensor(batch_chars, max_sent_length=None, max_word_length=None, dtype=np.int32):
    """
    :param batch_chars: [batch_size, sent_length, word_length], a list of lists of
                        variable-length character-index lists
    :param max_sent_length: pad/truncate sentences to this many words; if None, use the batch maximum
    :param max_word_length: pad/truncate words to this many characters; if None, use the batch maximum
    :param dtype: dtype of the returned array
    :return: padding_chars: [batch_size, max_sent_length, max_word_length], zero-padded
    """
    if max_sent_length is None:
        max_sent_length = np.max([len(words) for words in batch_chars])
    if max_word_length is None:
        max_word_length = np.max([np.max([len(chars) for chars in words]) for words in batch_chars])
    batch_size = len(batch_chars)
    padding_chars = np.zeros((batch_size, max_sent_length, max_word_length), dtype=dtype)
    for i in range(batch_size):
        sent_length = max_sent_length
        if len(batch_chars[i]) < max_sent_length:
            sent_length = len(batch_chars[i])
        for j in range(sent_length):
            chars = batch_chars[i][j]
            kept_length = len(chars)
            if kept_length > max_word_length:
                kept_length = max_word_length
            padding_chars[i, j, :kept_length] = chars[:kept_length]
    return padding_chars
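# Example (hypothetical values): one sentence of two words, 'ab' -> [1, 2] and 'c' -> [3]:
# pad_3d_tensor([[[1, 2], [3]]]) returns
# array([[[1, 2],
#         [3, 0]]], dtype=int32)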

class Vocab(object):
    def __init__(self):
        """
        A word-to-index mapping; currently a skeleton that only stores the dict.
        """
        self.vocab = {}

    def __setitem__(self, key, value):
        self.vocab[key] = value

    def __getitem__(self, item):
        if item not in self.vocab:
            raise KeyError(item)
        return self.vocab[item]

    def _save_(self):
        pass

    def _load_(self):
        pass

    def load_word_embedding(self):
        """
        Build an embedding matrix of shape [len(vocab), dim] aligned with self.vocab.
        :return:
        """
        pass
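
# The data streams below call vocab.to_index_sequence() and vocab.to_character_matrix(),
# which the skeleton above does not define. A minimal sketch of that interface, assuming
# whitespace tokenization and index 0 for unknown entries; an illustration only, not part
# of the original gist:
class SketchVocab(Vocab):
    def to_index_sequence(self, sentence):
        # One index per whitespace-separated token; unseen tokens map to 0.
        return [self.vocab.get(w, 0) for w in sentence.split()]

    def to_character_matrix(self, sentence):
        # One list of character indices per token.
        return [[self.vocab.get(c, 0) for c in w] for w in sentence.split()]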

class Data(object):
    def __init__(self):
        pass

    def add_word(self):
        # stub: initializes the word containers; no word is actually added yet
        self.word = []
        self.word_index = []

class Dataset(object):
    def __init__(self, file_list, word_vocab,
                 is_shuffle=False, is_loop=False, is_sort=True,
                 batch_size=32, max_char_per_word=10, max_sent_length=200,
                 label_vocab=None, num_classes=6):
        pass

    def read_data(self, file):
        pass

class SentenceMatchDataStream(object):
    def __init__(self, inpath, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None,
                 label_vocab=None, batch_size=60,
                 isShuffle=False, isLoop=False, isSort=True, max_char_per_word=10,
                 max_sent_length=200, num_classes=6):
        instances = []
        if isinstance(inpath, str):
            inpath = [inpath]
        for file in inpath:
            infile = open(file, 'rt')
            for line in infile:
                line = line.strip()
                if line.startswith('-'):
                    continue
                # each line: sentence1 <tab> sentence2 <tab> label [<tab> POS1 <tab> POS2 [<tab> NER1 <tab> NER2]]
                items = re.split("\t", line)
                label = items[2]
                sentence1 = items[0].lower()
                sentence2 = items[1].lower()
                label = float(label)
                label_id = self.vectorize(label, num_classes)
                word_idx_1 = word_vocab.to_index_sequence(sentence1)
                word_idx_2 = word_vocab.to_index_sequence(sentence2)
                char_matrix_idx_1 = char_vocab.to_character_matrix(sentence1)
                char_matrix_idx_2 = char_vocab.to_character_matrix(sentence2)
                if len(word_idx_1) > max_sent_length:
                    word_idx_1 = word_idx_1[:max_sent_length]
                    char_matrix_idx_1 = char_matrix_idx_1[:max_sent_length]
                if len(word_idx_2) > max_sent_length:
                    word_idx_2 = word_idx_2[:max_sent_length]
                    char_matrix_idx_2 = char_matrix_idx_2[:max_sent_length]
                POS_idx_1 = None
                POS_idx_2 = None
                if POS_vocab is not None:
                    POS_idx_1 = POS_vocab.to_index_sequence(items[3])
                    if len(POS_idx_1) > max_sent_length: POS_idx_1 = POS_idx_1[:max_sent_length]
                    POS_idx_2 = POS_vocab.to_index_sequence(items[4])
                    if len(POS_idx_2) > max_sent_length: POS_idx_2 = POS_idx_2[:max_sent_length]
                NER_idx_1 = None
                NER_idx_2 = None
                if NER_vocab is not None:
                    NER_idx_1 = NER_vocab.to_index_sequence(items[5])
                    if len(NER_idx_1) > max_sent_length: NER_idx_1 = NER_idx_1[:max_sent_length]
                    NER_idx_2 = NER_vocab.to_index_sequence(items[6])
                    if len(NER_idx_2) > max_sent_length: NER_idx_2 = NER_idx_2[:max_sent_length]
                instances.append((label, sentence1, sentence2, label_id, word_idx_1, word_idx_2,
                                  char_matrix_idx_1, char_matrix_idx_2,
                                  POS_idx_1, POS_idx_2, NER_idx_1, NER_idx_2))
            infile.close()
        # sort instances by sentence lengths so each batch contains similar lengths
        if isSort:
            instances = sorted(instances, key=lambda instance: (len(instance[4]), len(instance[5])))
        self.num_instances = len(instances)
        # distribute instances into batches
        batch_spans = make_batches(self.num_instances, batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            label_batch = []
            sent1_batch = []
            sent2_batch = []
            label_id_batch = []
            word_idx_1_batch = []
            word_idx_2_batch = []
            char_matrix_idx_1_batch = []
            char_matrix_idx_2_batch = []
            sent1_length_batch = []
            sent2_length_batch = []
            sent1_char_length_batch = []
            sent2_char_length_batch = []
            POS_idx_1_batch = None
            if POS_vocab is not None: POS_idx_1_batch = []
            POS_idx_2_batch = None
            if POS_vocab is not None: POS_idx_2_batch = []
            NER_idx_1_batch = None
            if NER_vocab is not None: NER_idx_1_batch = []
            NER_idx_2_batch = None
            if NER_vocab is not None: NER_idx_2_batch = []
            for i in range(batch_start, batch_end):
                (label, sentence1, sentence2, label_id, word_idx_1, word_idx_2,
                 char_matrix_idx_1, char_matrix_idx_2,
                 POS_idx_1, POS_idx_2, NER_idx_1, NER_idx_2) = instances[i]
                label_batch.append(label)
                sent1_batch.append(sentence1)
                sent2_batch.append(sentence2)
                label_id_batch.append(label_id)
                word_idx_1_batch.append(word_idx_1)
                word_idx_2_batch.append(word_idx_2)
                char_matrix_idx_1_batch.append(char_matrix_idx_1)
                char_matrix_idx_2_batch.append(char_matrix_idx_2)
                sent1_length_batch.append(len(word_idx_1))
                sent2_length_batch.append(len(word_idx_2))
                sent1_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_1])
                sent2_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_2])
                if POS_vocab is not None:
                    POS_idx_1_batch.append(POS_idx_1)
                    POS_idx_2_batch.append(POS_idx_2)
                if NER_vocab is not None:
                    NER_idx_1_batch.append(NER_idx_1)
                    NER_idx_2_batch.append(NER_idx_2)
            cur_batch_size = len(label_batch)
            if cur_batch_size == 0:
                continue
            # pad every sequence in the batch to the batch maximum
            max_sent1_length = np.max(sent1_length_batch)
            max_sent2_length = np.max(sent2_length_batch)
            max_char_length1 = np.max([np.max(aa) for aa in sent1_char_length_batch])
            if max_char_length1 > max_char_per_word: max_char_length1 = max_char_per_word
            max_char_length2 = np.max([np.max(aa) for aa in sent2_char_length_batch])
            if max_char_length2 > max_char_per_word: max_char_length2 = max_char_per_word
            label_id_batch = np.array(label_id_batch)
            word_idx_1_batch = pad_2d_matrix(word_idx_1_batch, max_sent_length=max_sent1_length)
            word_idx_2_batch = pad_2d_matrix(word_idx_2_batch, max_sent_length=max_sent2_length)
            char_matrix_idx_1_batch = pad_3d_tensor(char_matrix_idx_1_batch, max_sent_length=max_sent1_length, max_word_length=max_char_length1)
            char_matrix_idx_2_batch = pad_3d_tensor(char_matrix_idx_2_batch, max_sent_length=max_sent2_length, max_word_length=max_char_length2)
            sent1_length_batch = np.array(sent1_length_batch)
            sent2_length_batch = np.array(sent2_length_batch)
            sent1_char_length_batch = pad_2d_matrix(sent1_char_length_batch, max_sent_length=max_sent1_length)
            sent2_char_length_batch = pad_2d_matrix(sent2_char_length_batch, max_sent_length=max_sent2_length)
            if POS_vocab is not None:
                POS_idx_1_batch = pad_2d_matrix(POS_idx_1_batch, max_sent_length=max_sent1_length)
                POS_idx_2_batch = pad_2d_matrix(POS_idx_2_batch, max_sent_length=max_sent2_length)
            if NER_vocab is not None:
                NER_idx_1_batch = pad_2d_matrix(NER_idx_1_batch, max_sent_length=max_sent1_length)
                NER_idx_2_batch = pad_2d_matrix(NER_idx_2_batch, max_sent_length=max_sent2_length)
            self.batches.append((label_batch, sent1_batch, sent2_batch, label_id_batch, word_idx_1_batch, word_idx_2_batch,
                                 char_matrix_idx_1_batch, char_matrix_idx_2_batch, sent1_length_batch, sent2_length_batch,
                                 sent1_char_length_batch, sent2_char_length_batch,
                                 POS_idx_1_batch, POS_idx_2_batch, NER_idx_1_batch, NER_idx_2_batch))
        instances = None
        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0

    def nextBatch(self):
        if self.cur_pointer >= self.num_batch:
            if not self.isLoop:
                return None
            self.cur_pointer = 0
            if self.isShuffle: np.random.shuffle(self.index_array)
        cur_batch = self.batches[self.index_array[self.cur_pointer]]
        cur_batch = self.scramble(cur_batch)
        self.cur_pointer += 1
        return cur_batch

    def scramble(self, cur_batch):
        # data augmentation: with probability 0.25, shuffle the word order inside each
        # sentence of the batch (only when the stream itself is in shuffle mode)
        n = np.random.binomial(1, 0.25, 1)[0]
        if n > 0 and self.isShuffle:
            (label_batch, sent1_batch, sent2_batch, label_id_batch, word_idx_1_batch, word_idx_2_batch,
             char_matrix_idx_1_batch, char_matrix_idx_2_batch, sent1_length_batch, sent2_length_batch,
             sent1_char_length_batch, sent2_char_length_batch,
             POS_idx_1_batch, POS_idx_2_batch, NER_idx_1_batch, NER_idx_2_batch) = cur_batch
            for idx in range(len(word_idx_1_batch)):
                word_idx_1 = word_idx_1_batch[idx]
                word_idx_2 = word_idx_2_batch[idx]
                sent1_length = sent1_length_batch[idx]
                sent2_length = sent2_length_batch[idx]
                # shuffle only the unpadded prefix of each sentence
                w_idx_1 = word_idx_1[:sent1_length]
                random.shuffle(w_idx_1)
                word_idx_1[:sent1_length] = w_idx_1
                w_idx_2 = word_idx_2[:sent2_length]
                random.shuffle(w_idx_2)
                word_idx_2[:sent2_length] = w_idx_2
                cur_batch[4][idx] = word_idx_1
                cur_batch[5][idx] = word_idx_2
        return cur_batch

    def reset(self):
        self.cur_pointer = 0

    def get_num_batch(self):
        return self.num_batch

    def get_num_instance(self):
        return self.num_instances

    def get_batch(self, i):
        if i >= self.num_batch:
            return None
        return self.batches[i]

    def vectorize(self, score, num_classes):
        # turn a real-valued score into a soft label over the classes; assumes the
        # score lies in [1, num_classes] so 1-based bins map to indices 0..num_classes-1
        temp = np.zeros(num_classes, dtype=float)
        score = float(score)
        ceil, fl = int(np.ceil(score)), int(np.floor(score))
        if ceil == fl:
            temp[fl - 1] = 1
        else:
            # split the probability mass between the two neighbouring classes
            temp[fl - 1] = ceil - score
            temp[ceil - 1] = score - fl
        temp = temp + 0.00001
        return temp
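    # Example (hypothetical values): vectorize(3.4, 6) puts 0.6 on class 3 and 0.4 on
    # class 4, plus the 1e-5 smoothing:
    # approximately array([1e-05, 1e-05, 0.60001, 0.40001, 1e-05, 1e-05])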

class MultilingualSentenceMatchDataStream(object):
    def __init__(self, inpath, en_word_vocab=None, es_word_vocab=None, en_char_vocab=None, es_char_vocab=None,
                 batch_size=60,
                 isShuffle=False, isLoop=False, isSort=True, max_char_per_word=10, max_sent_length=200,
                 num_classes=6):
        instances = []
        if isinstance(inpath, str):
            inpath = [inpath]
        for file in inpath:
            infile = open(file, 'rt')
            for line in infile:
                line = line.strip()
                if line.startswith('-'):
                    continue
                # each line: sentence1 (English) <tab> sentence2 (Spanish) <tab> label
                items = re.split("\t", line)
                label = items[2]
                sentence1 = items[0].lower()
                sentence2 = items[1].lower()
                label = float(label)
                label_id = self.vectorize(label, num_classes)
                # sentence1 uses the English vocabularies, sentence2 the Spanish ones
                word_idx_1 = en_word_vocab.to_index_sequence(sentence1)
                word_idx_2 = es_word_vocab.to_index_sequence(sentence2)
                char_matrix_idx_1 = en_char_vocab.to_character_matrix(sentence1)
                char_matrix_idx_2 = es_char_vocab.to_character_matrix(sentence2)
                if len(word_idx_1) > max_sent_length:
                    word_idx_1 = word_idx_1[:max_sent_length]
                    char_matrix_idx_1 = char_matrix_idx_1[:max_sent_length]
                if len(word_idx_2) > max_sent_length:
                    word_idx_2 = word_idx_2[:max_sent_length]
                    char_matrix_idx_2 = char_matrix_idx_2[:max_sent_length]
                instances.append(
                    (label, sentence1, sentence2, label_id, word_idx_1, word_idx_2,
                     char_matrix_idx_1, char_matrix_idx_2,
                     None, None, None, None))
            infile.close()
        # sort instances by sentence lengths so each batch contains similar lengths
        if isSort:
            instances = sorted(instances, key=lambda instance: (len(instance[4]), len(instance[5])))
        self.num_instances = len(instances)
        # distribute instances into batches
        batch_spans = make_batches(self.num_instances, batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            label_batch = []
            sent1_batch = []
            sent2_batch = []
            label_id_batch = []
            word_idx_1_batch = []
            word_idx_2_batch = []
            char_matrix_idx_1_batch = []
            char_matrix_idx_2_batch = []
            sent1_length_batch = []
            sent2_length_batch = []
            sent1_char_length_batch = []
            sent2_char_length_batch = []
            for i in range(batch_start, batch_end):
                (label, sentence1, sentence2, label_id, word_idx_1, word_idx_2,
                 char_matrix_idx_1, char_matrix_idx_2,
                 POS_idx_1, POS_idx_2, NER_idx_1, NER_idx_2) = instances[i]
                label_batch.append(label)
                sent1_batch.append(sentence1)
                sent2_batch.append(sentence2)
                label_id_batch.append(label_id)
                word_idx_1_batch.append(word_idx_1)
                word_idx_2_batch.append(word_idx_2)
                char_matrix_idx_1_batch.append(char_matrix_idx_1)
                char_matrix_idx_2_batch.append(char_matrix_idx_2)
                sent1_length_batch.append(len(word_idx_1))
                sent2_length_batch.append(len(word_idx_2))
                sent1_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_1])
                sent2_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_2])
            cur_batch_size = len(label_batch)
            if cur_batch_size == 0:
                continue
            # pad every sequence in the batch to the batch maximum
            max_sent1_length = np.max(sent1_length_batch)
            max_sent2_length = np.max(sent2_length_batch)
            max_char_length1 = np.max([np.max(aa) for aa in sent1_char_length_batch])
            if max_char_length1 > max_char_per_word: max_char_length1 = max_char_per_word
            max_char_length2 = np.max([np.max(aa) for aa in sent2_char_length_batch])
            if max_char_length2 > max_char_per_word: max_char_length2 = max_char_per_word
            label_id_batch = np.array(label_id_batch)
            word_idx_1_batch = pad_2d_matrix(word_idx_1_batch, max_sent_length=max_sent1_length)
            word_idx_2_batch = pad_2d_matrix(word_idx_2_batch, max_sent_length=max_sent2_length)
            char_matrix_idx_1_batch = pad_3d_tensor(char_matrix_idx_1_batch, max_sent_length=max_sent1_length,
                                                    max_word_length=max_char_length1)
            char_matrix_idx_2_batch = pad_3d_tensor(char_matrix_idx_2_batch, max_sent_length=max_sent2_length,
                                                    max_word_length=max_char_length2)
            sent1_length_batch = np.array(sent1_length_batch)
            sent2_length_batch = np.array(sent2_length_batch)
            sent1_char_length_batch = pad_2d_matrix(sent1_char_length_batch, max_sent_length=max_sent1_length)
            sent2_char_length_batch = pad_2d_matrix(sent2_char_length_batch, max_sent_length=max_sent2_length)
            self.batches.append((label_batch, sent1_batch, sent2_batch, label_id_batch, word_idx_1_batch, word_idx_2_batch,
                                 char_matrix_idx_1_batch, char_matrix_idx_2_batch, sent1_length_batch, sent2_length_batch,
                                 sent1_char_length_batch, sent2_char_length_batch,
                                 None, None, None, None))
        instances = None
        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0

    def nextBatch(self):
        if self.cur_pointer >= self.num_batch:
            if not self.isLoop:
                return None
            self.cur_pointer = 0
            if self.isShuffle: np.random.shuffle(self.index_array)
        cur_batch = self.batches[self.index_array[self.cur_pointer]]
        self.cur_pointer += 1
        return cur_batch

    def reset(self):
        self.cur_pointer = 0

    def get_num_batch(self):
        return self.num_batch

    def get_num_instance(self):
        return self.num_instances

    def get_batch(self, i):
        if i >= self.num_batch:
            return None
        return self.batches[i]

    def vectorize(self, score, num_classes):
        # identical to SentenceMatchDataStream.vectorize: turn a real-valued score into
        # a soft label over the classes, assuming the score lies in [1, num_classes]
        temp = np.zeros(num_classes, dtype=float)
        score = float(score)
        ceil, fl = int(np.ceil(score)), int(np.floor(score))
        if ceil == fl:
            temp[fl - 1] = 1
        else:
            temp[fl - 1] = ceil - score
            temp[ceil - 1] = score - fl
        temp = temp + 0.00001
        return temp
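
# A minimal end-to-end sketch (hypothetical data and vocabulary; the real gist is driven
# by external vocab objects and TSV files that are not part of this snippet):
if __name__ == '__main__':
    class _DemoVocab(object):
        # whitespace tokenizer with unknown-id 0; an assumption for this demo only
        def __init__(self, words):
            self.vocab = {w: i + 1 for i, w in enumerate(words)}

        def to_index_sequence(self, sentence):
            return [self.vocab.get(w, 0) for w in sentence.split()]

        def to_character_matrix(self, sentence):
            return [[self.vocab.get(c, 0) for c in w] for w in sentence.split()]

    import tempfile, os
    # two tab-separated instances: sentence1 <tab> sentence2 <tab> score
    with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as f:
        f.write('a man plays guitar\ta man plays a guitar\t4.8\n')
        f.write('a dog runs\ta cat sleeps\t1.5\n')
        path = f.name
    vocab = _DemoVocab('a man plays guitar dog cat runs sleeps'.split()
                       + list('abcdefghijklmnopqrstuvwxyz'))
    stream = SentenceMatchDataStream(path, word_vocab=vocab, char_vocab=vocab,
                                     batch_size=2, num_classes=6)
    batch = stream.nextBatch()
    # batch[4] is the padded word-index matrix of the first sentences: here (2, 4)
    print(stream.get_num_batch(), batch[4].shape)
    os.remove(path)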