import random
import re

import numpy as np


def make_batches(size, batch_size):
    """
    :param size: the size of the dataset
    :param batch_size: the size of each batch
    :return: list of index spans: [(0, batch_size), (batch_size, 2*batch_size), ..., (., min(., .))]
    """
    nb_batch = int(np.ceil(size / float(batch_size)))
    return [(i * batch_size, min(size, (i + 1) * batch_size)) for i in range(0, nb_batch)]
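# Minimal check (illustrative, not in the original gist): 10 instances in
# batches of 3 cover every index exactly once, with a short final batch.
assert make_batches(10, 3) == [(0, 3), (3, 6), (6, 9), (9, 10)]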

def pad_2d_matrix(batch_words, max_sent_length=None, dtype=np.int32):
    """
    Right-pad a batch of variable-length index sequences with zeros.

    :param batch_words: [batch_size, sent_length]
    :param max_sent_length: if None, use max(sent_length) over the batch
    :param dtype: dtype of the returned array
    :return: padding_words: [batch_size, max_sent_length], zero-padded
    """
    if max_sent_length is None:
        max_sent_length = np.max([len(words) for words in batch_words])
    batch_size = len(batch_words)
    padding_words = np.zeros((batch_size, max_sent_length), dtype=dtype)
    for i in range(batch_size):
        words = batch_words[i]
        # truncate sequences longer than max_sent_length
        kept_length = min(len(words), max_sent_length)
        padding_words[i, :kept_length] = words[:kept_length]
    return padding_words
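# Illustrative check (not in the original gist): ragged index lists are
# right-padded with zeros to the longest sequence in the batch.
assert np.array_equal(pad_2d_matrix([[1, 2, 3], [4, 5]]),
                      np.array([[1, 2, 3], [4, 5, 0]], dtype=np.int32))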

def pad_3d_tensor(batch_chars, max_sent_length=None, max_word_length=None, dtype=np.int32):
    """
    Right-pad a batch of [sent_length, word_length] character matrices with zeros.

    :param batch_chars: [batch_size, sent_length, word_length]
    :param max_sent_length: if None, use max(sent_length) over the batch
    :param max_word_length: if None, use max(word_length) over the batch
    :param dtype: dtype of the returned array
    :return: padding_chars: [batch_size, max_sent_length, max_word_length], zero-padded
    """
    if max_sent_length is None:
        max_sent_length = np.max([len(words) for words in batch_chars])
    if max_word_length is None:
        max_word_length = np.max([np.max([len(chars) for chars in words]) for words in batch_chars])
    batch_size = len(batch_chars)
    padding_chars = np.zeros((batch_size, max_sent_length, max_word_length), dtype=dtype)
    for i in range(batch_size):
        # truncate sentences longer than max_sent_length
        sent_length = min(len(batch_chars[i]), max_sent_length)
        for j in range(sent_length):
            chars = batch_chars[i][j]
            # truncate words longer than max_word_length
            kept_length = min(len(chars), max_word_length)
            padding_chars[i, j, :kept_length] = chars[:kept_length]
    return padding_chars
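# Illustrative check (not in the original gist): both the sentence axis and
# the word axis are zero-padded to their per-batch maxima.
assert np.array_equal(
    pad_3d_tensor([[[1, 2], [3]], [[4]]]),
    np.array([[[1, 2], [3, 0]], [[4, 0], [0, 0]]], dtype=np.int32))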

class Vocab(object):
    def __init__(self):
        """
        Vocab: a word-to-index mapping (stub).
        """
        self.vocab = {}

    def __setitem__(self, key, value):
        self.vocab[key] = value

    def __getitem__(self, item):
        if item not in self.vocab:
            raise KeyError(item)
        return self.vocab[item]

    def _save_(self):
        pass

    def _load_(self):
        pass

    def load_word_embedding(self):
        """
        Build an embedding matrix [len(vocab), dim] from self.vocab (stub).
        """
        pass

class Data(object):
    def __init__(self):
        pass

    def add_word(self):
        self.word = []
        self.word_index = []


class Dataset(object):
    def __init__(self, file_list, word_vocab,
                 is_shuffle=False, is_loop=False, is_sort=True,
                 batch_size=32, max_char_per_word=10, max_sent_length=200,
                 label_vocab=None, num_classes=6):
        pass

    def read_data(self, file):
        pass

class SentenceMatchDataStream(object):
    def __init__(self, inpath, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None, label_vocab=None,
                 batch_size=60, isShuffle=False, isLoop=False, isSort=True, max_char_per_word=10,
                 max_sent_length=200, num_classes=6):
        instances = []
        if isinstance(inpath, str):
            inpath = [inpath]
        for file in inpath:
            infile = open(file, 'rt')
            for line in infile:
                line = line.strip()
                if line.startswith('-'):
                    continue
                # each line is tab-separated: sentence1, sentence2, score[, POS1, POS2, NER1, NER2]
                items = re.split("\t", line)
                sentence1 = items[0].lower()
                sentence2 = items[1].lower()
                label = float(items[2])
                label_id = self.vectorize(label, num_classes)
                word_idx_1 = word_vocab.to_index_sequence(sentence1)
                word_idx_2 = word_vocab.to_index_sequence(sentence2)
                char_matrix_idx_1 = char_vocab.to_character_matrix(sentence1)
                char_matrix_idx_2 = char_vocab.to_character_matrix(sentence2)
                if len(word_idx_1) > max_sent_length:
                    word_idx_1 = word_idx_1[:max_sent_length]
                    char_matrix_idx_1 = char_matrix_idx_1[:max_sent_length]
                if len(word_idx_2) > max_sent_length:
                    word_idx_2 = word_idx_2[:max_sent_length]
                    # bug fix: the original truncated char_matrix_idx_1 here
                    char_matrix_idx_2 = char_matrix_idx_2[:max_sent_length]
                POS_idx_1 = None
                POS_idx_2 = None
                if POS_vocab is not None:
                    POS_idx_1 = POS_vocab.to_index_sequence(items[3])
                    if len(POS_idx_1) > max_sent_length: POS_idx_1 = POS_idx_1[:max_sent_length]
                    POS_idx_2 = POS_vocab.to_index_sequence(items[4])
                    if len(POS_idx_2) > max_sent_length: POS_idx_2 = POS_idx_2[:max_sent_length]
                NER_idx_1 = None
                NER_idx_2 = None
                if NER_vocab is not None:
                    NER_idx_1 = NER_vocab.to_index_sequence(items[5])
                    if len(NER_idx_1) > max_sent_length: NER_idx_1 = NER_idx_1[:max_sent_length]
                    NER_idx_2 = NER_vocab.to_index_sequence(items[6])
                    if len(NER_idx_2) > max_sent_length: NER_idx_2 = NER_idx_2[:max_sent_length]
                instances.append((label, sentence1, sentence2, label_id, word_idx_1, word_idx_2,
                                  char_matrix_idx_1, char_matrix_idx_2,
                                  POS_idx_1, POS_idx_2, NER_idx_1, NER_idx_2))
            infile.close()

        # sort instances by sentence length so each batch contains similar lengths
        if isSort:
            instances = sorted(instances, key=lambda instance: (len(instance[4]), len(instance[5])))
        self.num_instances = len(instances)

        # distribute instances into batches
        batch_spans = make_batches(self.num_instances, batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            label_batch = []
            sent1_batch = []
            sent2_batch = []
            label_id_batch = []
            word_idx_1_batch = []
            word_idx_2_batch = []
            char_matrix_idx_1_batch = []
            char_matrix_idx_2_batch = []
            sent1_length_batch = []
            sent2_length_batch = []
            sent1_char_length_batch = []
            sent2_char_length_batch = []
            POS_idx_1_batch = [] if POS_vocab is not None else None
            POS_idx_2_batch = [] if POS_vocab is not None else None
            NER_idx_1_batch = [] if NER_vocab is not None else None
            NER_idx_2_batch = [] if NER_vocab is not None else None
            for i in range(batch_start, batch_end):
                (label, sentence1, sentence2, label_id, word_idx_1, word_idx_2, char_matrix_idx_1,
                 char_matrix_idx_2, POS_idx_1, POS_idx_2, NER_idx_1, NER_idx_2) = instances[i]
                label_batch.append(label)
                sent1_batch.append(sentence1)
                sent2_batch.append(sentence2)
                label_id_batch.append(label_id)
                word_idx_1_batch.append(word_idx_1)
                word_idx_2_batch.append(word_idx_2)
                char_matrix_idx_1_batch.append(char_matrix_idx_1)
                char_matrix_idx_2_batch.append(char_matrix_idx_2)
                sent1_length_batch.append(len(word_idx_1))
                sent2_length_batch.append(len(word_idx_2))
                sent1_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_1])
                sent2_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_2])
                if POS_vocab is not None:
                    POS_idx_1_batch.append(POS_idx_1)
                    POS_idx_2_batch.append(POS_idx_2)
                if NER_vocab is not None:
                    NER_idx_1_batch.append(NER_idx_1)
                    NER_idx_2_batch.append(NER_idx_2)

            cur_batch_size = len(label_batch)
            if cur_batch_size == 0:
                continue

            # pad to the per-batch maximum lengths, capping word length at max_char_per_word
            max_sent1_length = np.max(sent1_length_batch)
            max_sent2_length = np.max(sent2_length_batch)
            max_char_length1 = min(np.max([np.max(aa) for aa in sent1_char_length_batch]), max_char_per_word)
            max_char_length2 = min(np.max([np.max(aa) for aa in sent2_char_length_batch]), max_char_per_word)
            label_id_batch = np.array(label_id_batch)
            word_idx_1_batch = pad_2d_matrix(word_idx_1_batch, max_sent_length=max_sent1_length)
            word_idx_2_batch = pad_2d_matrix(word_idx_2_batch, max_sent_length=max_sent2_length)
            char_matrix_idx_1_batch = pad_3d_tensor(char_matrix_idx_1_batch, max_sent_length=max_sent1_length,
                                                    max_word_length=max_char_length1)
            char_matrix_idx_2_batch = pad_3d_tensor(char_matrix_idx_2_batch, max_sent_length=max_sent2_length,
                                                    max_word_length=max_char_length2)
            sent1_length_batch = np.array(sent1_length_batch)
            sent2_length_batch = np.array(sent2_length_batch)
            sent1_char_length_batch = pad_2d_matrix(sent1_char_length_batch, max_sent_length=max_sent1_length)
            sent2_char_length_batch = pad_2d_matrix(sent2_char_length_batch, max_sent_length=max_sent2_length)
            if POS_vocab is not None:
                POS_idx_1_batch = pad_2d_matrix(POS_idx_1_batch, max_sent_length=max_sent1_length)
                POS_idx_2_batch = pad_2d_matrix(POS_idx_2_batch, max_sent_length=max_sent2_length)
            if NER_vocab is not None:
                NER_idx_1_batch = pad_2d_matrix(NER_idx_1_batch, max_sent_length=max_sent1_length)
                NER_idx_2_batch = pad_2d_matrix(NER_idx_2_batch, max_sent_length=max_sent2_length)
            self.batches.append((label_batch, sent1_batch, sent2_batch, label_id_batch, word_idx_1_batch,
                                 word_idx_2_batch, char_matrix_idx_1_batch, char_matrix_idx_2_batch,
                                 sent1_length_batch, sent2_length_batch,
                                 sent1_char_length_batch, sent2_char_length_batch,
                                 POS_idx_1_batch, POS_idx_2_batch, NER_idx_1_batch, NER_idx_2_batch))

        instances = None
        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle:
            np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0
    def nextBatch(self):
        if self.cur_pointer >= self.num_batch:
            if not self.isLoop:
                return None
            self.cur_pointer = 0
            if self.isShuffle:
                np.random.shuffle(self.index_array)
        cur_batch = self.batches[self.index_array[self.cur_pointer]]
        cur_batch = self.scramble(cur_batch)
        self.cur_pointer += 1
        return cur_batch
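
    # Typical epoch loop (a sketch, not in the original gist):
    #     stream.reset()
    #     batch = stream.nextBatch()
    #     while batch is not None:
    #         ...consume batch...
    #         batch = stream.nextBatch()
    # With isLoop=True, nextBatch() never returns None and instead wraps around.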
    def scramble(self, cur_batch):
        """With probability 0.25, shuffle the word order inside each sentence (data augmentation)."""
        n = np.random.binomial(1, 0.25, 1)[0]
        if n > 0 and self.isShuffle:
            (label_batch, sent1_batch, sent2_batch, label_id_batch, word_idx_1_batch, word_idx_2_batch,
             char_matrix_idx_1_batch, char_matrix_idx_2_batch, sent1_length_batch, sent2_length_batch,
             sent1_char_length_batch, sent2_char_length_batch,
             POS_idx_1_batch, POS_idx_2_batch, NER_idx_1_batch, NER_idx_2_batch) = cur_batch
            for idx in range(len(word_idx_1_batch)):
                word_idx_1 = word_idx_1_batch[idx]
                word_idx_2 = word_idx_2_batch[idx]
                sent1_length = sent1_length_batch[idx]
                sent2_length = sent2_length_batch[idx]
                # shuffle only the un-padded prefix of each sentence
                w_idx_1 = word_idx_1[:sent1_length]
                random.shuffle(w_idx_1)
                word_idx_1[:sent1_length] = w_idx_1
                # bug fix: the original sliced sentence 2 with sent1_length
                w_idx_2 = word_idx_2[:sent2_length]
                random.shuffle(w_idx_2)
                word_idx_2[:sent2_length] = w_idx_2
                cur_batch[4][idx] = word_idx_1
                cur_batch[5][idx] = word_idx_2
        return cur_batch
    def reset(self):
        self.cur_pointer = 0

    def get_num_batch(self):
        return self.num_batch

    def get_num_instance(self):
        return self.num_instances

    def get_batch(self, i):
        if i >= self.num_batch:
            return None
        return self.batches[i]

    def vectorize(self, score, num_classes):
        """Turn a real-valued score in [1, num_classes] into a soft one-hot distribution."""
        temp = np.zeros(num_classes, dtype=float)
        score = float(score)
        ceil, fl = int(np.ceil(score)), int(np.floor(score))
        if ceil == fl:
            temp[fl - 1] = 1
        else:
            # distribute the mass between the two neighbouring integer classes
            temp[fl - 1] = ceil - score
            temp[ceil - 1] = score - fl
        temp = temp + 0.00001  # smoothing to avoid zero probabilities
        return temp
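
# Worked example for vectorize() (not in the original gist): this is the
# soft-label scheme commonly used for real-valued similarity scores (e.g. on
# SICK/STS). A score of 2.4 with num_classes=6 yields mass 0.6 on class 2 and
# 0.4 on class 3 (plus the 1e-5 smoothing). Scores are assumed to lie in
# [1, num_classes]; a score of 0 would wrap around via the temp[-1] index.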

class MultilingualSentenceMatchDataStream(object):
    def __init__(self, inpath, en_word_vocab=None, es_word_vocab=None, en_char_vocab=None, es_char_vocab=None,
                 batch_size=60, isShuffle=False, isLoop=False, isSort=True, max_char_per_word=10,
                 max_sent_length=200, num_classes=6):
        instances = []
        if isinstance(inpath, str):
            inpath = [inpath]
        for file in inpath:
            infile = open(file, 'rt')
            for line in infile:
                line = line.strip()
                if line.startswith('-'):
                    continue
                # each line is tab-separated: English sentence, Spanish sentence, score
                items = re.split("\t", line)
                sentence1 = items[0].lower()
                sentence2 = items[1].lower()
                label = float(items[2])
                label_id = self.vectorize(label, num_classes)
                word_idx_1 = en_word_vocab.to_index_sequence(sentence1)
                word_idx_2 = es_word_vocab.to_index_sequence(sentence2)
                char_matrix_idx_1 = en_char_vocab.to_character_matrix(sentence1)
                char_matrix_idx_2 = es_char_vocab.to_character_matrix(sentence2)
                if len(word_idx_1) > max_sent_length:
                    word_idx_1 = word_idx_1[:max_sent_length]
                    char_matrix_idx_1 = char_matrix_idx_1[:max_sent_length]
                if len(word_idx_2) > max_sent_length:
                    word_idx_2 = word_idx_2[:max_sent_length]
                    # bug fix: the original truncated char_matrix_idx_1 here
                    char_matrix_idx_2 = char_matrix_idx_2[:max_sent_length]
                instances.append((label, sentence1, sentence2, label_id, word_idx_1, word_idx_2,
                                  char_matrix_idx_1, char_matrix_idx_2,
                                  None, None, None, None))
            infile.close()

        # sort instances by sentence length so each batch contains similar lengths
        if isSort:
            instances = sorted(instances, key=lambda instance: (len(instance[4]), len(instance[5])))
        self.num_instances = len(instances)

        # distribute instances into batches
        batch_spans = make_batches(self.num_instances, batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            label_batch = []
            sent1_batch = []
            sent2_batch = []
            label_id_batch = []
            word_idx_1_batch = []
            word_idx_2_batch = []
            char_matrix_idx_1_batch = []
            char_matrix_idx_2_batch = []
            sent1_length_batch = []
            sent2_length_batch = []
            sent1_char_length_batch = []
            sent2_char_length_batch = []
            for i in range(batch_start, batch_end):
                (label, sentence1, sentence2, label_id, word_idx_1, word_idx_2, char_matrix_idx_1,
                 char_matrix_idx_2, POS_idx_1, POS_idx_2, NER_idx_1, NER_idx_2) = instances[i]
                label_batch.append(label)
                sent1_batch.append(sentence1)
                sent2_batch.append(sentence2)
                label_id_batch.append(label_id)
                word_idx_1_batch.append(word_idx_1)
                word_idx_2_batch.append(word_idx_2)
                char_matrix_idx_1_batch.append(char_matrix_idx_1)
                char_matrix_idx_2_batch.append(char_matrix_idx_2)
                sent1_length_batch.append(len(word_idx_1))
                sent2_length_batch.append(len(word_idx_2))
                sent1_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_1])
                sent2_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_2])

            cur_batch_size = len(label_batch)
            if cur_batch_size == 0:
                continue

            # pad to the per-batch maximum lengths, capping word length at max_char_per_word
            max_sent1_length = np.max(sent1_length_batch)
            max_sent2_length = np.max(sent2_length_batch)
            max_char_length1 = min(np.max([np.max(aa) for aa in sent1_char_length_batch]), max_char_per_word)
            max_char_length2 = min(np.max([np.max(aa) for aa in sent2_char_length_batch]), max_char_per_word)
            label_id_batch = np.array(label_id_batch)
            word_idx_1_batch = pad_2d_matrix(word_idx_1_batch, max_sent_length=max_sent1_length)
            word_idx_2_batch = pad_2d_matrix(word_idx_2_batch, max_sent_length=max_sent2_length)
            char_matrix_idx_1_batch = pad_3d_tensor(char_matrix_idx_1_batch, max_sent_length=max_sent1_length,
                                                    max_word_length=max_char_length1)
            char_matrix_idx_2_batch = pad_3d_tensor(char_matrix_idx_2_batch, max_sent_length=max_sent2_length,
                                                    max_word_length=max_char_length2)
            sent1_length_batch = np.array(sent1_length_batch)
            sent2_length_batch = np.array(sent2_length_batch)
            sent1_char_length_batch = pad_2d_matrix(sent1_char_length_batch, max_sent_length=max_sent1_length)
            sent2_char_length_batch = pad_2d_matrix(sent2_char_length_batch, max_sent_length=max_sent2_length)
            self.batches.append((label_batch, sent1_batch, sent2_batch, label_id_batch, word_idx_1_batch,
                                 word_idx_2_batch, char_matrix_idx_1_batch, char_matrix_idx_2_batch,
                                 sent1_length_batch, sent2_length_batch,
                                 sent1_char_length_batch, sent2_char_length_batch,
                                 None, None, None, None))

        instances = None
        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle:
            np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0
    def nextBatch(self):
        if self.cur_pointer >= self.num_batch:
            if not self.isLoop:
                return None
            self.cur_pointer = 0
            if self.isShuffle:
                np.random.shuffle(self.index_array)
        cur_batch = self.batches[self.index_array[self.cur_pointer]]
        self.cur_pointer += 1
        return cur_batch

    def reset(self):
        self.cur_pointer = 0

    def get_num_batch(self):
        return self.num_batch

    def get_num_instance(self):
        return self.num_instances

    def get_batch(self, i):
        if i >= self.num_batch:
            return None
        return self.batches[i]

    def vectorize(self, score, num_classes):
        """Turn a real-valued score in [1, num_classes] into a soft one-hot distribution."""
        temp = np.zeros(num_classes, dtype=float)
        score = float(score)
        ceil, fl = int(np.ceil(score)), int(np.floor(score))
        if ceil == fl:
            temp[fl - 1] = 1
        else:
            temp[fl - 1] = ceil - score
            temp[ceil - 1] = score - fl
        temp = temp + 0.00001  # smoothing to avoid zero probabilities
        return temp
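

# Usage sketch (hypothetical, not in the original gist). `_ToyVocab` stands in
# for the real vocab objects, which only need to provide to_index_sequence()
# and to_character_matrix(); the input is assumed to be a tab-separated
# "sentence1<TAB>sentence2<TAB>score" file.
if __name__ == '__main__':
    import os
    import tempfile

    class _ToyVocab(object):
        def to_index_sequence(self, sentence):
            # hypothetical: map each token to a small positive id
            return [hash(w) % 100 + 1 for w in sentence.split()]

        def to_character_matrix(self, sentence):
            # hypothetical: map each character to a small positive id
            return [[ord(c) % 100 + 1 for c in w] for w in sentence.split()]

    tmp = tempfile.NamedTemporaryFile('wt', suffix='.tsv', delete=False)
    tmp.write('a man plays a guitar\ta man plays guitar\t4.6\n')
    tmp.write('a dog runs\ta cat sleeps\t1.2\n')
    tmp.close()
    try:
        stream = SentenceMatchDataStream(tmp.name, word_vocab=_ToyVocab(),
                                         char_vocab=_ToyVocab(), batch_size=2)
        print('batches: {}, instances: {}'.format(stream.get_num_batch(),
                                                  stream.get_num_instance()))
        batch = stream.nextBatch()
        print('padded word ids, sentence 1: {}'.format(batch[4].shape))
    finally:
        os.unlink(tmp.name)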