Mohamed Gaber MoGaber

# get the average score for each professional
prof_score = pd.merge(answers[["professional_id", "answers_id"]],
                      answers_scores.rename(columns={"id": "answers_id"}),
                      how='left', on="answers_id").drop(columns=["answers_id"])
score_mean = prof_score.groupby("professional_id").mean().reset_index()
professionals_dataset = pd.merge(professionals_dataset, score_mean,
                                 how='left', on="professional_id").rename(columns={"score": "avg_ansrs_score"})
professionals_dataset["avg_ansrs_score"] = professionals_dataset["avg_ansrs_score"].fillna(0)
# get all the tags of the questions that they answered before
prev_tags = (full_data[["professional_id", "q_tags"]][full_data["q_tags"].notnull()]
             .groupby("professional_id")["q_tags"].agg(sum)
             .apply(lambda x: list(set(x)))
             .reset_index())  # reset_index so the result has a professional_id column to merge on
professionals_dataset = pd.merge(professionals_dataset, prev_tags, how='left', on="professional_id")
professionals_dataset = professionals_dataset.rename(columns = {"q_tags": "prev_q_tags"})
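# The agg(sum) step works because summing Python lists concatenates them, and
# set() then drops duplicate tags. A small illustration on hypothetical tags
# (sum(tags, []) is the explicit form of that list concatenation):
import pandas as pd
toy = pd.DataFrame({"professional_id": ["p1", "p1", "p2"],
                    "q_tags": [["college", "math"], ["math"], ["nursing"]]})
flat = (toy.groupby("professional_id")["q_tags"]
           .apply(lambda tags: sorted(set(sum(tags, [])))))  # concat per group, then dedupe
print(flat)  # p1 -> ['college', 'math'], p2 -> ['nursing']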
# get all the tags that each professional is following
all_tags = pd.merge(tag_users.rename(columns={"tag_users_tag_id": "tag_id"}),
                    tags.rename(columns={"tags_tag_id": "tag_id"}),
                    how='left', on="tag_id")
foll_tags = (all_tags.groupby('tag_users_user_id')['tags_tag_name']
             .apply(lambda x: list(set(x)))
             .reset_index(name='following_tags')
             .rename(columns={"tag_users_user_id": "professional_id"}))
professionals_dataset = pd.merge(professionals_dataset, foll_tags, how='left', on="professional_id")
# count the total number of questions that each professional answered (including those answered after email)
answers_count = answers["professional_id"].value_counts().reset_index().rename(
    columns={"index": "professional_id", "professional_id": "number_q_answered"})
professionals_dataset = pd.merge(professionals_dataset, answers_count, how='left', on="professional_id")
professionals_dataset["number_q_answered"]= professionals_dataset["number_q_answered"].fillna(0)
# getting the average time they took to answer the question if they did
grouped = full_data.groupby('professional_id')["time_taken"]
time_mean = grouped.apply(lambda x: np.mean(x)).reset_index()  # reset_index so professional_id is a column for the merge
professionals_dataset = pd.merge(professionals_dataset, time_mean, how='left', on="professional_id")
professionals_dataset= professionals_dataset.rename(columns ={"time_taken": "avg_time_taken"})
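# The mean of a timedelta column is itself a Timedelta; if a plain number is
# needed for modelling later, it can be converted, e.g. to hours. A small
# sketch on hypothetical values, mirroring the column names used above:
import pandas as pd
toy = pd.DataFrame({"professional_id": ["p1", "p1"],
                    "time_taken": pd.to_timedelta(["2 days", "1 days"])})
avg = toy.groupby("professional_id")["time_taken"].mean().reset_index()
avg["avg_hours"] = avg["time_taken"].dt.total_seconds() / 3600  # 36.0 hours for p1
print(avg)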
# getting the response rate: the mean of the 0/1 "q_answered?" flag over the emails each professional received
response = full_data.groupby("professional_id").mean().drop(columns=["email_id"]).reset_index()
professionals_dataset = pd.merge(professionals_dataset, response, how='left', on="professional_id")
professionals_dataset = professionals_dataset.rename(columns = {"q_answered?": "response_rate"})
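# "q_answered?" is assumed to be a 0/1 flag per email, so its per-professional
# mean is the fraction of emailed questions that were actually answered.
# A tiny hypothetical check of that equivalence:
import pandas as pd
toy = pd.DataFrame({"professional_id": ["p1", "p1", "p1", "p2"],
                    "q_answered?": [1, 0, 1, 0]})
rate = toy.groupby("professional_id")["q_answered?"].mean().reset_index(name="response_rate")
print(rate)  # p1 -> 0.666..., p2 -> 0.0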
# get all the tags of each question
tag_questions_names = pd.merge(tag_questions.rename(columns={"tag_questions_tag_id": "tag_id"}),
                               tags.rename(columns={"tags_tag_id": "tag_id"}),
                               how='left', on="tag_id")
tag_questions_names = tag_questions_names.rename(columns={"tag_questions_question_id": "question_id",
                                                          "tags_tag_name": "tag_name"})
questions_tags = tag_questions_names.groupby("question_id")["tag_name"].apply(list).reset_index(name="q_tags")
full_data = pd.merge(full_data, questions_tags, how='left', on="question_id")
# some professionals were so active that they answered a question before the email about it was even sent,
# which yields a negative time_taken; clamp those cases to zero (treated as an immediate answer)
full_data.loc[full_data["time_taken"] < datetime.timedelta(0), "time_taken"] = datetime.timedelta(0)
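# An equivalent, slightly more compact way to floor negative response times at
# zero is Series.clip with a Timedelta lower bound (shown here on toy data,
# without touching full_data):
import pandas as pd
toy = pd.Series(pd.to_timedelta(["-3 hours", "5 hours"]))
print(toy.clip(lower=pd.Timedelta(0)))  # -3 hours becomes 0 days 00:00:00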
# adding the time each professional took to answer a question (answer date minus email date)
full_data["time_taken"] = pd.to_datetime(full_data["answers_date_added"]) - pd.to_datetime(full_data["emails_date_sent"])
# turn the answer added date into a datetime value instead of a string (while ignoring the NAs)
indices = full_data["answers_date_added"][full_data["answers_date_added"].notnull()].index.values
full_data.loc[indices, "answers_date_added"] = full_data["answers_date_added"][full_data["answers_date_added"].notnull()].apply(date_vectorizer)
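# date_vectorizer is a helper defined elsewhere in the notebook; as an
# alternative (purely an assumption about what it does), the same
# string-to-datetime conversion can usually be done with pd.to_datetime,
# which leaves NaT for missing values instead of needing the notnull() indexing:
import pandas as pd
toy = pd.Series(["2018-05-01 12:30:00", None])
print(pd.to_datetime(toy, errors="coerce"))  # second entry becomes NaT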