Skip to content

Instantly share code, notes, and snippets.

View HarshSingh16's full-sized avatar

Harshdeep Singh HarshSingh16

View GitHub Profile
#Creating feature columns from our categorical data
# Each categorical input is hashed into a fixed number of buckets; the
# (key, bucket count) pairs are listed in the same order as the names
# they are unpacked into.
(
    education1,
    workclass1,
    martial1,
    occupation1,
    relationship1,
    race1,
    gender1,
    native_country1,
) = (
    tf.feature_column.categorical_column_with_hash_bucket(key, hash_bucket_size=buckets)
    for key, buckets in (
        ("education", 16),
        ("workclass", 10),
        ("marital_status", 7),
        ("occupation", 14),
        ("relationship", 6),
        ("race", 5),
        ("gender", 2),
        ("native_country", 60),
    )
)
# Doing a first cleaning of the texts
def clean_text(text):
    """Lower-case *text* and expand a handful of English contractions.

    Args:
        text: The raw sentence to normalise.

    Returns:
        The lower-cased sentence with the contractions listed below expanded.
    """
    # NOTE(review): only the rules visible in this excerpt are applied; the
    # full source may define further substitutions -- confirm.
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
    )
    text = text.lower()
    # Rules run in order, each on the output of the previous one.
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
# Creating a list of all of the conversations
conversations_ids = []
for conversation in conversations[:-1]:  # the final entry is skipped
    # Only the last ' +++$+++ '-separated field is used.
    # Presumably it is a bracketed, quoted list of line ids -- TODO confirm.
    id_field = conversation.split(' +++$+++ ')[-1]
    # Drop the first and last characters, then remove quotes and spaces so
    # that a plain comma-separated string remains.
    _conversation = id_field[1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(','))
# Getting separately the questions and the answers
# Both lists start out empty.
questions = []
answers = []
# NOTE(review): the body of this loop is not visible in this excerpt;
# presumably it fills `questions` and `answers` -- confirm against the full source.
for conversation in conversations_ids:
#Sorting clean questions and answers by questions:
# Keep only the pairs whose question has between 1 and 24 items, ordered by
# question length. sort() is stable, so pairs of equal length keep their
# original relative order -- the same result as scanning the whole list once
# per length, but in a single pass plus one sort.
_kept_indices = [
    index
    for index, question in enumerate(questions_int_sequence)
    if 1 <= len(question) < 25
]
_kept_indices.sort(key=lambda index: len(questions_int_sequence[index]))
sorted_clean_questions = [questions_int_sequence[index] for index in _kept_indices]
# Answers are picked by the same indices so each one stays paired with its question.
sorted_clean_answers = [answers_int_sequence[index] for index in _kept_indices]
#Converting questions into sequences of integers
# Words missing from the vocabulary are replaced by the id registered for the
# "<OUT>" token. If no such id has been registered, the literal "<OUT>" is
# kept as a placeholder instead.
_out_token = dict_word2integer.get("<OUT>", "<OUT>")
# One inner list per question, one entry per whitespace-separated word.
questions_int_sequence = [
    [dict_word2integer.get(word, _out_token) for word in question.split()]
    for question in Clean_questions
]
#Adding EOS at end of every answer
# Built with a comprehension so the loop variable stays local and cannot
# overwrite a module-level name such as `answers`.
new_clean_answers = [answer + " <EOS>" for answer in Clean_answers]
#Inverse Mapped Dictionary
# Swaps keys and values of dict_word2integer so an integer id can be turned
# back into its word. Built once; repeating the statement adds nothing.
dict_integer2word = {index: word for word, index in dict_word2integer.items()}
#SETTING A THRESHOLD AND MAPPING EACH WORD TO A UNIQUE INTEGER
threshold = 20  # a word must occur strictly more often than this to be kept
word_number = 0
dict_word2integer = {}
for word, frequency in word2count.items():
    if frequency > threshold:
        # Ids are handed out consecutively, starting at 0, to kept words only.
        dict_word2integer[word] = word_number
        word_number += 1
#Adding tokens to our dictionary
Tokens = ["<PAD>", "<SOS>", "<EOS>", "<OUT>"]
for token in Tokens:
    # The id is derived from the current size of the mapping, which grows by
    # one for each token that was not already a key.
    # NOTE(review): this targets dict_word2integer, the word-to-integer mapping
    # used by the other snippets here -- confirm that is the intended dictionary.
    dict_word2integer[token] = len(dict_word2integer) + 1