Skip to content

Instantly share code, notes, and snippets.

@iamukasa
Created February 27, 2018 14:05
Show Gist options
  • Save iamukasa/cc553a5a4082f77d9e0bee1c74449d12 to your computer and use it in GitHub Desktop.
Save iamukasa/cc553a5a4082f77d9e0bee1c74449d12 to your computer and use it in GitHub Desktop.
Preprocessing from the Ubuntu Dialog Corpus to our dataset
import os
import csv
import itertools
import functools
import tensorflow as tf
import numpy as np
import array
# Command-line flags for vocabulary construction and for the directories the
# script reads from and writes to.
tf.flags.DEFINE_integer(
"min_word_frequency", 5, "Minimum frequency of words in the vocabulary")
tf.flags.DEFINE_integer("max_sentence_len", 160, "Maximum Sentence Length")
tf.flags.DEFINE_string(
"input_dir", os.path.abspath("./data"),
"Input directory containing original CSV data files (default = './data')")
tf.flags.DEFINE_string(
"output_dir", os.path.abspath("./data"),
"Output directory for TFrEcord files (default = './data')")
FLAGS = tf.flags.FLAGS
# Source CSV files, all located under the input_dir flag.
# NOTE(review): FLAGS.input_dir is read here at import time, before any
# explicit flag parsing -- confirm the installed TensorFlow version parses
# flags implicitly on first attribute access.
TRAIN_PATH = os.path.join(FLAGS.input_dir, "train.csv")
VALIDATION_PATH = os.path.join(FLAGS.input_dir, "valid.csv")
TEST_PATH = os.path.join(FLAGS.input_dir, "test.csv")
def tokenizer_fn(iterator):
    """
    Lazily tokenizes every sentence from the given iterable.

    Each sentence is split on single space characters only, so consecutive
    spaces produce empty-string tokens. Returns a generator of token lists.
    """
    token_lists = (sentence.split(" ") for sentence in iterator)
    return token_lists
def create_csv_iter(filename):
    """
    Returns an iterator over the rows of a CSV file. Skips the header.

    Args:
      filename: Path of the CSV file to read.

    Yields:
      Every row after the first one, as the list of strings produced by
      csv.reader. A completely empty file yields nothing.
    """
    with open(filename) as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header. The None default keeps an empty file from raising
        # StopIteration inside this generator, which Python 3.7+ converts
        # into a RuntimeError (PEP 479).
        next(reader, None)
        for row in reader:
            yield row
def create_vocab(input_iter, min_frequency):
    """
    Builds a VocabularyProcessor and fits it on the given sentences.

    The processor is constructed with FLAGS.max_sentence_len, the supplied
    min_frequency and this module's tokenizer_fn, then fitted on input_iter.
    Returns the fitted processor.
    """
    processor_cls = tf.contrib.learn.preprocessing.VocabularyProcessor
    fitted_processor = processor_cls(
        FLAGS.max_sentence_len,
        min_frequency=min_frequency,
        tokenizer_fn=tokenizer_fn)
    fitted_processor.fit(input_iter)
    return fitted_processor
def transform_sentence(sequence, vocab_processor):
    """
    Converts one sentence into its integer vocabulary ids.

    The sentence is passed to vocab_processor.transform as a one-element
    batch; the first transformed row is returned as a plain Python list.
    """
    transformed_batch = vocab_processor.transform([sequence])
    first_row = next(transformed_batch)
    return first_row.tolist()
def create_text_sequence_feature(fl, sentence, sentence_len, vocab):
    """
    Appends a sentence to a FeatureList protocol buffer, one feature per word.

    The sentence is mapped to vocabulary ids and every id is stored in its
    own newly added int64 feature. sentence_len is accepted but not used.
    Returns the same fl object that was passed in.
    """
    word_ids = transform_sentence(sentence, vocab)
    for current_id in word_ids:
        new_feature = fl.feature.add()
        new_feature.int64_list.value.append(current_id)
    return fl
def create_example_train(row, vocab):
    """
    Creates a training example for the Ubuntu Dialog Corpus dataset.

    row must unpack into (context, utterance, label). Returns a
    tensorflow.Example protocol buffer holding the id sequences of context
    and utterance, their token counts, and the label as an integer.
    """
    context, utterance, label = row

    context_ids = transform_sentence(context, vocab)
    utterance_ids = transform_sentence(utterance, vocab)
    # Lengths are token counts taken straight from the tokenizer.
    # NOTE(review): they are not derived from the transformed id sequences,
    # so the two may disagree -- confirm downstream readers expect that.
    context_tokens = next(vocab._tokenizer([context]))
    utterance_tokens = next(vocab._tokenizer([utterance]))
    # Labels may be written as "1.0"/"0.0", hence the float round-trip.
    label_value = int(float(label))

    int64_features = (
        ("context", context_ids),
        ("utterance", utterance_ids),
        ("context_len", [len(context_tokens)]),
        ("utterance_len", [len(utterance_tokens)]),
        ("label", [label_value]),
    )

    # New Example
    example = tf.train.Example()
    for feature_name, values in int64_features:
        example.features.feature[feature_name].int64_list.value.extend(values)
    return example
def create_example_test(row, vocab):
    """
    Creates a test/validation example for the Ubuntu Dialog Corpus dataset.

    The first two columns of row are the context and the utterance; every
    remaining column is a distractor. Returns a tensorflow.Example protocol
    buffer with id sequences and token counts for the context, the utterance
    and each distractor (keys "distractor_<i>" and "distractor_<i>_len").
    """
    def _int64_values(target, feature_name):
        # Shortcut to the repeated int64 value field of one named feature.
        return target.features.feature[feature_name].int64_list.value

    def _token_count(text):
        # Number of tokens the vocabulary's tokenizer produces for the text.
        return len(next(vocab._tokenizer([text])))

    context, utterance = row[:2]
    distractors = row[2:]

    context_len = _token_count(context)
    utterance_len = _token_count(utterance)
    context_ids = transform_sentence(context, vocab)
    utterance_ids = transform_sentence(utterance, vocab)

    # New Example
    example = tf.train.Example()
    _int64_values(example, "context").extend(context_ids)
    _int64_values(example, "utterance").extend(utterance_ids)
    _int64_values(example, "context_len").extend([context_len])
    _int64_values(example, "utterance_len").extend([utterance_len])

    # Distractor sequences: the length feature first, then the text feature.
    for index, distractor in enumerate(distractors):
        text_key = "distractor_{}".format(index)
        length_key = "distractor_{}_len".format(index)
        _int64_values(example, length_key).extend([_token_count(distractor)])
        _int64_values(example, text_key).extend(
            transform_sentence(distractor, vocab))
    return example
def create_tfrecords_file(input_filename, output_filename, example_fn):
    """
    Creates a TFRecords file for the given input data and example
    transformation function.

    Args:
      input_filename: CSV file whose rows (header skipped) are converted.
      output_filename: Path of the TFRecords file to write.
      example_fn: Callable that takes one CSV row and returns an object
        exposing SerializeToString().
    """
    writer = tf.python_io.TFRecordWriter(output_filename)
    print("Creating TFRecords file at {}...".format(output_filename))
    try:
        for row in create_csv_iter(input_filename):
            example = example_fn(row)
            writer.write(example.SerializeToString())
    finally:
        # Close the writer even when a row fails to convert, so the output
        # file handle is never leaked.
        writer.close()
    print("Wrote to {}".format(output_filename))
def write_vocabulary(vocab_processor, outfile):
    """
    Dumps the vocabulary to a text file, one word per line.

    Words are written in ascending id order, looked up through the
    vocabulary's reverse mapping. Prints a confirmation message when done.
    """
    vocabulary = vocab_processor.vocabulary_
    word_count = len(vocabulary)
    with open(outfile, "w") as vocabfile:
        vocabfile.writelines(
            vocabulary._reverse_mapping[word_id] + "\n"
            for word_id in range(word_count))
    print("Saved vocabulary to {}".format(outfile))
if __name__ == "__main__":
    output_dir = FLAGS.output_dir

    # The vocabulary is fitted on the first two columns of every training
    # row, joined by a single space.
    print("Creating vocabulary...")
    training_text = (
        row[0] + " " + row[1] for row in create_csv_iter(TRAIN_PATH))
    vocab = create_vocab(
        training_text, min_frequency=FLAGS.min_word_frequency)
    print("Total vocabulary size: {}".format(len(vocab.vocabulary_)))

    # Create vocabulary.txt file
    write_vocabulary(vocab, os.path.join(output_dir, "vocabulary.txt"))

    # Save vocab processor
    vocab.save(os.path.join(output_dir, "vocab_processor.bin"))

    # Convert each CSV split in turn: validation, test, then train.
    conversions = (
        (VALIDATION_PATH, "validation.tfrecords", create_example_test),
        (TEST_PATH, "test.tfrecords", create_example_test),
        (TRAIN_PATH, "train.tfrecords", create_example_train),
    )
    for csv_path, record_name, example_builder in conversions:
        create_tfrecords_file(
            input_filename=csv_path,
            output_filename=os.path.join(output_dir, record_name),
            example_fn=functools.partial(example_builder, vocab=vocab))
WNDEF SOPHIE
I am a smart chat robot who knows about reproductive health. Ask me any questions about sexual and reproductive health. The Devs have fed me with a lot of knowledge about sexual and reproductive health. I also talk back but you can shut me up if you so wish.
WNDEF YOUR NAME
My name is Sophie. I am a smart chat robot who knows about reproductive health. Ask me any questions about sexual and reproductive health. The Devs have fed me with a lot of knowledge about sexual and reproductive health. I also talk back but you can shut me up if you so wish.
WNDEF Unaitwa nani
My name is Sophie. I am a smart chat robot who knows about reproductive health. Ask me any questions about sexual and reproductive health. The Devs have fed me with a lot of knowledge about sexual and reproductive health. I also talk back but you can shut me up if you so wish.
WNDEF SEX
The state of being male or female. Men or male animals as a group or women or female animals as a group. Physical activity in which people touch each other's bodies, kiss each other, etc. Physical activity that is related to and often includes sexual intercourse.
WNDEF WHAT IS SEX
The state of being male or female. Men or male animals as a group or women or female animals as a group. Physical activity in which people touch each other's bodies, kiss each other, etc. Physical activity that is related to and often includes sexual intercourse.
WNDEF DESCRIBE SEX
The state of being male or female. Men or male animals as a group or women or female animals as a group. Physical activity in which people touch each other's bodies, kiss each other, etc. Physical activity that is related to and often includes sexual intercourse.
WNDEF EXPLAIN SEX
The state of being male or female. Men or male animals as a group or women or female animals as a group. Physical activity in which people touch each other's bodies, kiss each other, etc. Physical activity that is related to and often includes sexual intercourse.
WNDEF DEFINE SEX
The state of being male or female. Men or male animals as a group or women or female animals as a group. Physical activity in which people touch each other's bodies, kiss each other, etc. Physical activity that is related to and often includes sexual intercourse.
WNDEF HEALTH
The state of physical, mental and psychological well-being.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment