Harshdeep Singh HarshSingh16

## feature.py
# Creating feature columns from our categorical data.
# Every categorical input gets a hash-bucket column; the second argument is the
# number of hash buckets reserved for that feature.
_hashed_column = tf.feature_column.categorical_column_with_hash_bucket

education1 = _hashed_column("education", hash_bucket_size=16)
workclass1 = _hashed_column("workclass", hash_bucket_size=10)
martial1 = _hashed_column("marital_status", hash_bucket_size=7)
occupation1 = _hashed_column("occupation", hash_bucket_size=14)
relationship1 = _hashed_column("relationship", hash_bucket_size=6)
race1 = _hashed_column("race", hash_bucket_size=5)
gender1 = _hashed_column("gender", hash_bucket_size=2)
native_country1 = _hashed_column("native_country", hash_bucket_size=60)

## CleaningData.py
# Doing a first cleaning of the texts
def clean_text(text):
    """Lower-case *text* and expand a few English contractions.

    The substitutions run in a fixed order: "i'm", "he's", "she's",
    "that's", "what's", "where's", and finally the "'ll" suffix.

    Returns the cleaned string.
    """
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
    )
    text = text.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Without this return the function yields None and the cleaning is lost.
    return text

## Conversations and QuestionsAnswers.py
# Creating a list of all of the conversations.
# For every entry except the last one: take the final ' +++$+++ '-separated
# field, drop its first and last character, remove quotes and spaces, and
# split what remains on commas.
conversations_ids = [
    conversation.split(' +++$+++ ')[-1][1:-1]
    .replace("'", "")
    .replace(" ", "")
    .split(',')
    for conversation in conversations[:-1]
]

# Getting separately the questions and the answers
# Both lists start empty here.
questions = []
answers = []
# NOTE(review): the body of this loop is not present in the visible text, so
# the statement is incomplete as shown.
for conversation in conversations_ids:

## SortingQuestionAnswers.py
# Sorting clean questions and answers by question length.
# Lengths 1 through 24 are visited in increasing order; for each length every
# matching question is appended together with the answer at the same index.
# Questions that are empty or longer than 24 items are never appended.
sorted_clean_questions = []
sorted_clean_answers = []
for length in range(1, 25):
    for index, encoded_question in enumerate(questions_int_sequence):
        if len(encoded_question) != length:
            continue
        sorted_clean_questions.append(encoded_question)
        sorted_clean_answers.append(answers_int_sequence[index])

## Seq_integers.py
# Converting questions into sequences of integers.
# A word missing from dict_word2integer is replaced by the id registered for
# the "<OUT>" token, so the sequence stays numeric. If no such id is
# registered, the literal "<OUT>" string is kept as before.
out_of_vocabulary = dict_word2integer.get("<OUT>", "<OUT>")
questions_int_sequence = []
for question in Clean_questions:
    # The per-question list is not called `int`, which would shadow the builtin.
    encoded_question = [
        dict_word2integer.get(word, out_of_vocabulary)
        for word in question.split()
    ]
    questions_int_sequence.append(encoded_question)

## AddingEOS.py
# Adding EOS at the end of every answer.
# A comprehension keeps the loop variable local, so the module-level `answers`
# list is not overwritten with the last answer string.
new_clean_answers = [answer + " <EOS>" for answer in Clean_answers]

## InverseDict.py
# Inverse mapped dictionary: integer id -> word.
dict_integer2word = {
    number: word for word, number in dict_word2integer.items()
}

## InverseDict.py
# Inverse mapped dictionary: integer id -> word.
# This repeats the identical assignment made a few lines earlier in this file.
dict_integer2word = {
    number: word for word, number in dict_word2integer.items()
}

## word2integermapping.py
# SETTING A THRESHOLD AND MAPPING EACH WORD TO A UNIQUE INTEGER
# Only words whose count is strictly greater than `threshold` receive an id.
# Ids are handed out consecutively from 0 in word2count's iteration order.

threshold = 20
word_number = 0
dict_word2integer = {}
for word, frequency in word2count.items():
    # Compare against `threshold` so changing the constant takes effect.
    if frequency > threshold:
        dict_word2integer[word] = word_number
        word_number += 1

## Tokens.py
# Adding tokens to our dictionary.
# Each token is stored with the dictionary's current size plus one as its id.
# NOTE(review): `dict_integer` is not defined in the visible code; confirm
# whether `dict_word2integer` was intended.
Tokens = ["<PAD>", "<SOS>", "<EOS>", "<OUT>"]
for token in Tokens:
    next_id = len(dict_integer) + 1
    dict_integer[token] = next_id
	# Creating feature columns from our categorical data.
	# Every categorical input gets a hash-bucket column; the second argument is
	# the number of hash buckets reserved for that feature.
	_hashed_column = tf.feature_column.categorical_column_with_hash_bucket

	education1 = _hashed_column("education", hash_bucket_size=16)
	workclass1 = _hashed_column("workclass", hash_bucket_size=10)
	martial1 = _hashed_column("marital_status", hash_bucket_size=7)
	occupation1 = _hashed_column("occupation", hash_bucket_size=14)
	relationship1 = _hashed_column("relationship", hash_bucket_size=6)
	race1 = _hashed_column("race", hash_bucket_size=5)
	gender1 = _hashed_column("gender", hash_bucket_size=2)
	native_country1 = _hashed_column("native_country", hash_bucket_size=60)
	# Doing a first cleaning of the texts
	def clean_text(text):
	    """Lower-case *text* and expand a few English contractions.

	    The substitutions run in a fixed order: "i'm", "he's", "she's",
	    "that's", "what's", "where's", and finally the "'ll" suffix.

	    Returns the cleaned string.
	    """
	    substitutions = (
	        (r"i'm", "i am"),
	        (r"he's", "he is"),
	        (r"she's", "she is"),
	        (r"that's", "that is"),
	        (r"what's", "what is"),
	        (r"where's", "where is"),
	        (r"\'ll", " will"),
	    )
	    text = text.lower()
	    for pattern, replacement in substitutions:
	        text = re.sub(pattern, replacement, text)
	    # Without this return the function yields None and the cleaning is lost.
	    return text
	# Creating a list of all of the conversations.
	# For every entry except the last one: take the final ' +++$+++ '-separated
	# field, drop its first and last character, remove quotes and spaces, and
	# split what remains on commas.
	conversations_ids = []
	for conversation in conversations[:-1]:
	    # The loop body is indented again; it had been flattened to loop level.
	    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
	    conversations_ids.append(_conversation.split(','))

	# Getting separately the questions and the answers
	# Both lists start empty here.
	questions = []
	answers = []
	# NOTE(review): the body of this loop is not present in the visible text, so
	# the statement is incomplete as shown.
	for conversation in conversations_ids:
	# Sorting clean questions and answers by question length.
	# Lengths 1 through 24 are visited in increasing order; for each length every
	# matching question is appended together with the answer at the same index.
	# The nested loop bodies are indented again; they had been flattened.
	sorted_clean_questions = []
	sorted_clean_answers = []
	for i in range(1, 25):
	    for index, encoded_question in enumerate(questions_int_sequence):
	        if len(encoded_question) == i:
	            sorted_clean_questions.append(encoded_question)
	            sorted_clean_answers.append(answers_int_sequence[index])
	# Converting questions into sequences of integers.
	# A word missing from dict_word2integer is replaced by the id registered for
	# the "<OUT>" token, so the sequence stays numeric. If no such id is
	# registered, the literal "<OUT>" string is kept as before.
	out_of_vocabulary = dict_word2integer.get("<OUT>", "<OUT>")
	questions_int_sequence = []
	for question in Clean_questions:
	    # The per-question list is not called `int`, which would shadow the
	    # builtin.
	    encoded_question = [
	        dict_word2integer.get(word, out_of_vocabulary)
	        for word in question.split()
	    ]
	    questions_int_sequence.append(encoded_question)
	# Adding EOS at the end of every answer.
	# A comprehension keeps the loop variable local, so the `answers` list is
	# not overwritten with the last answer string.
	new_clean_answers = [answer + " <EOS>" for answer in Clean_answers]
	# Inverse mapped dictionary: integer id -> word.
	dict_integer2word = {
	    number: word for word, number in dict_word2integer.items()
	}
	# SETTING A THRESHOLD AND MAPPING EACH WORD TO A UNIQUE INTEGER
	# Only words whose count is strictly greater than `threshold` receive an id.
	# Ids are handed out consecutively from 0 in word2count's iteration order.

	threshold = 20
	word_number = 0
	dict_word2integer = {}
	for word, frequency in word2count.items():
	    # Compare against `threshold` so changing the constant takes effect.
	    if frequency > threshold:
	        dict_word2integer[word] = word_number
	        word_number += 1
	# Adding tokens to our dictionary.
	# Each token is stored with the dictionary's current size plus one as its id.
	# NOTE(review): `dict_integer` is not defined in the visible code; confirm
	# whether `dict_word2integer` was intended.
	Tokens = ["<PAD>", "<SOS>", "<EOS>", "<OUT>"]
	for token in Tokens:
	    # The loop body is indented again; it had been flattened to loop level.
	    dict_integer[token] = len(dict_integer) + 1