@rgtjf
Last active July 12, 2017
import random
import numpy as np
import re

def make_batches(size, batch_size):
    """
    :param size: the total number of instances in the dataset
    :param batch_size: the number of instances per batch
    :return: list of (start, end) index spans:
             [(0, batch_size), (batch_size, 2*batch_size), ..., ((nb_batch-1)*batch_size, size)]
    """
    nb_batch = int(np.ceil(size / float(batch_size)))
    return [(i * batch_size, min(size, (i + 1) * batch_size)) for i in range(0, nb_batch)]
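# Example (hypothetical values): make_batches(10, 3) returns
# [(0, 3), (3, 6), (6, 9), (9, 10)] -- the final span is truncated to the dataset size.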

def pad_2d_matrix(batch_words, max_sent_length=None, dtype=np.int32):
    """
    :param batch_words: [batch_size, sent_length], a list of variable-length index lists
    :param max_sent_length: pad/truncate every row to this length; if None, use max(sent_length)
    :param dtype: dtype of the returned array
    :return: padding_words: [batch_size, max_sent_length], zero-padded on the right
    """
    if max_sent_length is None:
        max_sent_length = np.max([len(words) for words in batch_words])
    batch_size = len(batch_words)
    padding_words = np.zeros((batch_size, max_sent_length), dtype=dtype)
    for i in range(batch_size):
        words = batch_words[i]
        kept_length = len(words)
        if kept_length > max_sent_length:
            kept_length = max_sent_length
        padding_words[i, :kept_length] = words[:kept_length]
    return padding_words
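# Example (hypothetical values): pad_2d_matrix([[1, 2], [3, 4, 5]]) returns
# array([[1, 2, 0],
#        [3, 4, 5]], dtype=int32)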

def pad_3d_tensor(batch_chars, max_sent_length=None, max_word_length=None, dtype=np.int32):
    """
    :param batch_chars: [batch_size, sent_length, word_length], a list of lists of
                        variable-length character-index lists
    :param max_sent_length: pad/truncate sentences to this many words; if None, use the batch maximum
    :param max_word_length: pad/truncate words to this many characters; if None, use the batch maximum
    :param dtype: dtype of the returned array
    :return: padding_chars: [batch_size, max_sent_length, max_word_length], zero-padded
    """
    if max_sent_length is None:
        max_sent_length = np.max([len(words) for words in batch_chars])
    if max_word_length is None:
        max_word_length = np.max([np.max([len(chars) for chars in words]) for words in batch_chars])
    batch_size = len(batch_chars)
    padding_chars = np.zeros((batch_size, max_sent_length, max_word_length), dtype=dtype)
    for i in range(batch_size):
        sent_length = max_sent_length
        if len(batch_chars[i]) < max_sent_length:
            sent_length = len(batch_chars[i])
        for j in range(sent_length):
            chars = batch_chars[i][j]
            kept_length = len(chars)
            if kept_length > max_word_length:
                kept_length = max_word_length
            padding_chars[i, j, :kept_length] = chars[:kept_length]
    return padding_chars
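# Example (hypothetical values): one sentence of two words, 'ab' -> [1, 2] and 'c' -> [3]:
# pad_3d_tensor([[[1, 2], [3]]]) returns
# array([[[1, 2],
#         [3, 0]]], dtype=int32)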

class Vocab(object):
    def __init__(self):
        """
        A word-to-index mapping; currently a skeleton that only stores the dict.
        """
        self.vocab = {}

    def __setitem__(self, key, value):
        self.vocab[key] = value

    def __getitem__(self, item):
        if item not in self.vocab:
            raise KeyError(item)
        return self.vocab[item]

    def _save_(self):
        pass

    def _load_(self):
        pass

    def load_word_embedding(self):
        """
        Build an embedding matrix of shape [len(vocab), dim] aligned with self.vocab.
        :return:
        """
        pass
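
# The data streams below call vocab.to_index_sequence() and vocab.to_character_matrix(),
# which the skeleton above does not define. A minimal sketch of that interface, assuming
# whitespace tokenization and index 0 for unknown entries; an illustration only, not part
# of the original gist:
class SketchVocab(Vocab):
    def to_index_sequence(self, sentence):
        # One index per whitespace-separated token; unseen tokens map to 0.
        return [self.vocab.get(w, 0) for w in sentence.split()]

    def to_character_matrix(self, sentence):
        # One list of character indices per token.
        return [[self.vocab.get(c, 0) for c in w] for w in sentence.split()]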

class Data(object):
    def __init__(self):
        pass

    def add_word(self):
        # stub: initializes the word containers; no word is actually added yet
        self.word = []
        self.word_index = []

class Dataset(object):
    def __init__(self, file_list, word_vocab,
                 is_shuffle=False, is_loop=False, is_sort=True,
                 batch_size=32, max_char_per_word=10, max_sent_length=200,
                 label_vocab=None, num_classes=6):
        pass

    def read_data(self, file):
        pass

class SentenceMatchDataStream(object):
    def __init__(self, inpath, word_vocab=None, char_vocab=None, POS_vocab=None, NER_vocab=None,
                 label_vocab=None, batch_size=60,
                 isShuffle=False, isLoop=False, isSort=True, max_char_per_word=10,
                 max_sent_length=200, num_classes=6):
        instances = []
        if isinstance(inpath, str):
            inpath = [inpath]
        for file in inpath:
            infile = open(file, 'rt')
            for line in infile:
                line = line.strip()
                if line.startswith('-'):
                    continue
                # each line: sentence1 <tab> sentence2 <tab> label [<tab> POS1 <tab> POS2 [<tab> NER1 <tab> NER2]]
                items = re.split("\t", line)
                label = items[2]
                sentence1 = items[0].lower()
                sentence2 = items[1].lower()
                label = float(label)
                label_id = self.vectorize(label, num_classes)
                word_idx_1 = word_vocab.to_index_sequence(sentence1)
                word_idx_2 = word_vocab.to_index_sequence(sentence2)
                char_matrix_idx_1 = char_vocab.to_character_matrix(sentence1)
                char_matrix_idx_2 = char_vocab.to_character_matrix(sentence2)
                if len(word_idx_1) > max_sent_length:
                    word_idx_1 = word_idx_1[:max_sent_length]
                    char_matrix_idx_1 = char_matrix_idx_1[:max_sent_length]
                if len(word_idx_2) > max_sent_length:
                    word_idx_2 = word_idx_2[:max_sent_length]
                    char_matrix_idx_2 = char_matrix_idx_2[:max_sent_length]
                POS_idx_1 = None
                POS_idx_2 = None
                if POS_vocab is not None:
                    POS_idx_1 = POS_vocab.to_index_sequence(items[3])
                    if len(POS_idx_1) > max_sent_length: POS_idx_1 = POS_idx_1[:max_sent_length]
                    POS_idx_2 = POS_vocab.to_index_sequence(items[4])
                    if len(POS_idx_2) > max_sent_length: POS_idx_2 = POS_idx_2[:max_sent_length]
                NER_idx_1 = None
                NER_idx_2 = None
                if NER_vocab is not None:
                    NER_idx_1 = NER_vocab.to_index_sequence(items[5])
                    if len(NER_idx_1) > max_sent_length: NER_idx_1 = NER_idx_1[:max_sent_length]
                    NER_idx_2 = NER_vocab.to_index_sequence(items[6])
                    if len(NER_idx_2) > max_sent_length: NER_idx_2 = NER_idx_2[:max_sent_length]
                instances.append((label, sentence1, sentence2, label_id, word_idx_1, word_idx_2,
                                  char_matrix_idx_1, char_matrix_idx_2,
                                  POS_idx_1, POS_idx_2, NER_idx_1, NER_idx_2))
            infile.close()
        # sort instances by sentence lengths so each batch contains similar lengths
        if isSort:
            instances = sorted(instances, key=lambda instance: (len(instance[4]), len(instance[5])))
        self.num_instances = len(instances)
        # distribute instances into batches
        batch_spans = make_batches(self.num_instances, batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            label_batch = []
            sent1_batch = []
            sent2_batch = []
            label_id_batch = []
            word_idx_1_batch = []
            word_idx_2_batch = []
            char_matrix_idx_1_batch = []
            char_matrix_idx_2_batch = []
            sent1_length_batch = []
            sent2_length_batch = []
            sent1_char_length_batch = []
            sent2_char_length_batch = []
            POS_idx_1_batch = None
            if POS_vocab is not None: POS_idx_1_batch = []
            POS_idx_2_batch = None
            if POS_vocab is not None: POS_idx_2_batch = []
            NER_idx_1_batch = None
            if NER_vocab is not None: NER_idx_1_batch = []
            NER_idx_2_batch = None
            if NER_vocab is not None: NER_idx_2_batch = []
            for i in range(batch_start, batch_end):
                (label, sentence1, sentence2, label_id, word_idx_1, word_idx_2,
                 char_matrix_idx_1, char_matrix_idx_2,
                 POS_idx_1, POS_idx_2, NER_idx_1, NER_idx_2) = instances[i]
                label_batch.append(label)
                sent1_batch.append(sentence1)
                sent2_batch.append(sentence2)
                label_id_batch.append(label_id)
                word_idx_1_batch.append(word_idx_1)
                word_idx_2_batch.append(word_idx_2)
                char_matrix_idx_1_batch.append(char_matrix_idx_1)
                char_matrix_idx_2_batch.append(char_matrix_idx_2)
                sent1_length_batch.append(len(word_idx_1))
                sent2_length_batch.append(len(word_idx_2))
                sent1_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_1])
                sent2_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_2])
                if POS_vocab is not None:
                    POS_idx_1_batch.append(POS_idx_1)
                    POS_idx_2_batch.append(POS_idx_2)
                if NER_vocab is not None:
                    NER_idx_1_batch.append(NER_idx_1)
                    NER_idx_2_batch.append(NER_idx_2)
            cur_batch_size = len(label_batch)
            if cur_batch_size == 0:
                continue
            # pad every sequence in the batch to the batch maximum
            max_sent1_length = np.max(sent1_length_batch)
            max_sent2_length = np.max(sent2_length_batch)
            max_char_length1 = np.max([np.max(aa) for aa in sent1_char_length_batch])
            if max_char_length1 > max_char_per_word: max_char_length1 = max_char_per_word
            max_char_length2 = np.max([np.max(aa) for aa in sent2_char_length_batch])
            if max_char_length2 > max_char_per_word: max_char_length2 = max_char_per_word
            label_id_batch = np.array(label_id_batch)
            word_idx_1_batch = pad_2d_matrix(word_idx_1_batch, max_sent_length=max_sent1_length)
            word_idx_2_batch = pad_2d_matrix(word_idx_2_batch, max_sent_length=max_sent2_length)
            char_matrix_idx_1_batch = pad_3d_tensor(char_matrix_idx_1_batch, max_sent_length=max_sent1_length, max_word_length=max_char_length1)
            char_matrix_idx_2_batch = pad_3d_tensor(char_matrix_idx_2_batch, max_sent_length=max_sent2_length, max_word_length=max_char_length2)
            sent1_length_batch = np.array(sent1_length_batch)
            sent2_length_batch = np.array(sent2_length_batch)
            sent1_char_length_batch = pad_2d_matrix(sent1_char_length_batch, max_sent_length=max_sent1_length)
            sent2_char_length_batch = pad_2d_matrix(sent2_char_length_batch, max_sent_length=max_sent2_length)
            if POS_vocab is not None:
                POS_idx_1_batch = pad_2d_matrix(POS_idx_1_batch, max_sent_length=max_sent1_length)
                POS_idx_2_batch = pad_2d_matrix(POS_idx_2_batch, max_sent_length=max_sent2_length)
            if NER_vocab is not None:
                NER_idx_1_batch = pad_2d_matrix(NER_idx_1_batch, max_sent_length=max_sent1_length)
                NER_idx_2_batch = pad_2d_matrix(NER_idx_2_batch, max_sent_length=max_sent2_length)
            self.batches.append((label_batch, sent1_batch, sent2_batch, label_id_batch, word_idx_1_batch, word_idx_2_batch,
                                 char_matrix_idx_1_batch, char_matrix_idx_2_batch, sent1_length_batch, sent2_length_batch,
                                 sent1_char_length_batch, sent2_char_length_batch,
                                 POS_idx_1_batch, POS_idx_2_batch, NER_idx_1_batch, NER_idx_2_batch))
        instances = None
        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0

    def nextBatch(self):
        if self.cur_pointer >= self.num_batch:
            if not self.isLoop:
                return None
            self.cur_pointer = 0
            if self.isShuffle: np.random.shuffle(self.index_array)
        cur_batch = self.batches[self.index_array[self.cur_pointer]]
        cur_batch = self.scramble(cur_batch)
        self.cur_pointer += 1
        return cur_batch

    def scramble(self, cur_batch):
        # data augmentation: with probability 0.25, shuffle the word order inside each
        # sentence of the batch (only when the stream itself is in shuffle mode)
        n = np.random.binomial(1, 0.25, 1)[0]
        if n > 0 and self.isShuffle:
            (label_batch, sent1_batch, sent2_batch, label_id_batch, word_idx_1_batch, word_idx_2_batch,
             char_matrix_idx_1_batch, char_matrix_idx_2_batch, sent1_length_batch, sent2_length_batch,
             sent1_char_length_batch, sent2_char_length_batch,
             POS_idx_1_batch, POS_idx_2_batch, NER_idx_1_batch, NER_idx_2_batch) = cur_batch
            for idx in range(len(word_idx_1_batch)):
                word_idx_1 = word_idx_1_batch[idx]
                word_idx_2 = word_idx_2_batch[idx]
                sent1_length = sent1_length_batch[idx]
                sent2_length = sent2_length_batch[idx]
                # shuffle only the unpadded prefix of each sentence
                w_idx_1 = word_idx_1[:sent1_length]
                random.shuffle(w_idx_1)
                word_idx_1[:sent1_length] = w_idx_1
                w_idx_2 = word_idx_2[:sent2_length]
                random.shuffle(w_idx_2)
                word_idx_2[:sent2_length] = w_idx_2
                cur_batch[4][idx] = word_idx_1
                cur_batch[5][idx] = word_idx_2
        return cur_batch

    def reset(self):
        self.cur_pointer = 0

    def get_num_batch(self):
        return self.num_batch

    def get_num_instance(self):
        return self.num_instances

    def get_batch(self, i):
        if i >= self.num_batch:
            return None
        return self.batches[i]

    def vectorize(self, score, num_classes):
        # turn a real-valued score into a soft label over the classes; assumes the
        # score lies in [1, num_classes] so 1-based bins map to indices 0..num_classes-1
        temp = np.zeros(num_classes, dtype=float)
        score = float(score)
        ceil, fl = int(np.ceil(score)), int(np.floor(score))
        if ceil == fl:
            temp[fl - 1] = 1
        else:
            # split the probability mass between the two neighbouring classes
            temp[fl - 1] = ceil - score
            temp[ceil - 1] = score - fl
        temp = temp + 0.00001
        return temp
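    # Example (hypothetical values): vectorize(3.4, 6) puts 0.6 on class 3 and 0.4 on
    # class 4, plus the 1e-5 smoothing:
    # approximately array([1e-05, 1e-05, 0.60001, 0.40001, 1e-05, 1e-05])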

class MultilingualSentenceMatchDataStream(object):
    def __init__(self, inpath, en_word_vocab=None, es_word_vocab=None, en_char_vocab=None, es_char_vocab=None,
                 batch_size=60,
                 isShuffle=False, isLoop=False, isSort=True, max_char_per_word=10, max_sent_length=200,
                 num_classes=6):
        instances = []
        if isinstance(inpath, str):
            inpath = [inpath]
        for file in inpath:
            infile = open(file, 'rt')
            for line in infile:
                line = line.strip()
                if line.startswith('-'):
                    continue
                # each line: sentence1 (English) <tab> sentence2 (Spanish) <tab> label
                items = re.split("\t", line)
                label = items[2]
                sentence1 = items[0].lower()
                sentence2 = items[1].lower()
                label = float(label)
                label_id = self.vectorize(label, num_classes)
                # sentence1 uses the English vocabularies, sentence2 the Spanish ones
                word_idx_1 = en_word_vocab.to_index_sequence(sentence1)
                word_idx_2 = es_word_vocab.to_index_sequence(sentence2)
                char_matrix_idx_1 = en_char_vocab.to_character_matrix(sentence1)
                char_matrix_idx_2 = es_char_vocab.to_character_matrix(sentence2)
                if len(word_idx_1) > max_sent_length:
                    word_idx_1 = word_idx_1[:max_sent_length]
                    char_matrix_idx_1 = char_matrix_idx_1[:max_sent_length]
                if len(word_idx_2) > max_sent_length:
                    word_idx_2 = word_idx_2[:max_sent_length]
                    char_matrix_idx_2 = char_matrix_idx_2[:max_sent_length]
                instances.append(
                    (label, sentence1, sentence2, label_id, word_idx_1, word_idx_2,
                     char_matrix_idx_1, char_matrix_idx_2,
                     None, None, None, None))
            infile.close()
        # sort instances by sentence lengths so each batch contains similar lengths
        if isSort:
            instances = sorted(instances, key=lambda instance: (len(instance[4]), len(instance[5])))
        self.num_instances = len(instances)
        # distribute instances into batches
        batch_spans = make_batches(self.num_instances, batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            label_batch = []
            sent1_batch = []
            sent2_batch = []
            label_id_batch = []
            word_idx_1_batch = []
            word_idx_2_batch = []
            char_matrix_idx_1_batch = []
            char_matrix_idx_2_batch = []
            sent1_length_batch = []
            sent2_length_batch = []
            sent1_char_length_batch = []
            sent2_char_length_batch = []
            for i in range(batch_start, batch_end):
                (label, sentence1, sentence2, label_id, word_idx_1, word_idx_2,
                 char_matrix_idx_1, char_matrix_idx_2,
                 POS_idx_1, POS_idx_2, NER_idx_1, NER_idx_2) = instances[i]
                label_batch.append(label)
                sent1_batch.append(sentence1)
                sent2_batch.append(sentence2)
                label_id_batch.append(label_id)
                word_idx_1_batch.append(word_idx_1)
                word_idx_2_batch.append(word_idx_2)
                char_matrix_idx_1_batch.append(char_matrix_idx_1)
                char_matrix_idx_2_batch.append(char_matrix_idx_2)
                sent1_length_batch.append(len(word_idx_1))
                sent2_length_batch.append(len(word_idx_2))
                sent1_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_1])
                sent2_char_length_batch.append([len(cur_char_idx) for cur_char_idx in char_matrix_idx_2])
            cur_batch_size = len(label_batch)
            if cur_batch_size == 0:
                continue
            # pad every sequence in the batch to the batch maximum
            max_sent1_length = np.max(sent1_length_batch)
            max_sent2_length = np.max(sent2_length_batch)
            max_char_length1 = np.max([np.max(aa) for aa in sent1_char_length_batch])
            if max_char_length1 > max_char_per_word: max_char_length1 = max_char_per_word
            max_char_length2 = np.max([np.max(aa) for aa in sent2_char_length_batch])
            if max_char_length2 > max_char_per_word: max_char_length2 = max_char_per_word
            label_id_batch = np.array(label_id_batch)
            word_idx_1_batch = pad_2d_matrix(word_idx_1_batch, max_sent_length=max_sent1_length)
            word_idx_2_batch = pad_2d_matrix(word_idx_2_batch, max_sent_length=max_sent2_length)
            char_matrix_idx_1_batch = pad_3d_tensor(char_matrix_idx_1_batch, max_sent_length=max_sent1_length,
                                                    max_word_length=max_char_length1)
            char_matrix_idx_2_batch = pad_3d_tensor(char_matrix_idx_2_batch, max_sent_length=max_sent2_length,
                                                    max_word_length=max_char_length2)
            sent1_length_batch = np.array(sent1_length_batch)
            sent2_length_batch = np.array(sent2_length_batch)
            sent1_char_length_batch = pad_2d_matrix(sent1_char_length_batch, max_sent_length=max_sent1_length)
            sent2_char_length_batch = pad_2d_matrix(sent2_char_length_batch, max_sent_length=max_sent2_length)
            self.batches.append((label_batch, sent1_batch, sent2_batch, label_id_batch, word_idx_1_batch, word_idx_2_batch,
                                 char_matrix_idx_1_batch, char_matrix_idx_2_batch, sent1_length_batch, sent2_length_batch,
                                 sent1_char_length_batch, sent2_char_length_batch,
                                 None, None, None, None))
        instances = None
        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0

    def nextBatch(self):
        if self.cur_pointer >= self.num_batch:
            if not self.isLoop:
                return None
            self.cur_pointer = 0
            if self.isShuffle: np.random.shuffle(self.index_array)
        cur_batch = self.batches[self.index_array[self.cur_pointer]]
        self.cur_pointer += 1
        return cur_batch

    def reset(self):
        self.cur_pointer = 0

    def get_num_batch(self):
        return self.num_batch

    def get_num_instance(self):
        return self.num_instances

    def get_batch(self, i):
        if i >= self.num_batch:
            return None
        return self.batches[i]

    def vectorize(self, score, num_classes):
        # identical to SentenceMatchDataStream.vectorize: turn a real-valued score into
        # a soft label over the classes, assuming the score lies in [1, num_classes]
        temp = np.zeros(num_classes, dtype=float)
        score = float(score)
        ceil, fl = int(np.ceil(score)), int(np.floor(score))
        if ceil == fl:
            temp[fl - 1] = 1
        else:
            temp[fl - 1] = ceil - score
            temp[ceil - 1] = score - fl
        temp = temp + 0.00001
        return temp
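
# A minimal end-to-end sketch (hypothetical data and vocabulary; the real gist is driven
# by external vocab objects and TSV files that are not part of this snippet):
if __name__ == '__main__':
    class _DemoVocab(object):
        # whitespace tokenizer with unknown-id 0; an assumption for this demo only
        def __init__(self, words):
            self.vocab = {w: i + 1 for i, w in enumerate(words)}

        def to_index_sequence(self, sentence):
            return [self.vocab.get(w, 0) for w in sentence.split()]

        def to_character_matrix(self, sentence):
            return [[self.vocab.get(c, 0) for c in w] for w in sentence.split()]

    import tempfile, os
    # two tab-separated instances: sentence1 <tab> sentence2 <tab> score
    with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as f:
        f.write('a man plays guitar\ta man plays a guitar\t4.8\n')
        f.write('a dog runs\ta cat sleeps\t1.5\n')
        path = f.name
    vocab = _DemoVocab('a man plays guitar dog cat runs sleeps'.split()
                       + list('abcdefghijklmnopqrstuvwxyz'))
    stream = SentenceMatchDataStream(path, word_vocab=vocab, char_vocab=vocab,
                                     batch_size=2, num_classes=6)
    batch = stream.nextBatch()
    # batch[4] is the padded word-index matrix of the first sentences: here (2, 4)
    print(stream.get_num_batch(), batch[4].shape)
    os.remove(path)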