Last active
January 4, 2021 12:12
-
-
Save annecool37/4062fd966026be332b33a47a522b824e to your computer and use it in GitHub Desktop.
Museum_recommendation_system
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This section demonstrates:
# 1. How I got the address of each museum
# 2. How I converted each address into latitude and longitude via the Google API
# 3. The main code to scrape the data and save it into .csv
from bs4 import BeautifulSoup | |
import pandas as pd | |
import googlemaps | |
def get_address(museum_soup):
    '''Return the address text of every museum page in museum_soup.

    For each element, the <span property="address"> tag is looked up, its
    text is stripped of surrounding newlines and trailing whitespace, and the
    first nine characters are dropped.
    '''
    addresses = []
    for page in museum_soup:
        span = page.find('span', {'property': 'address'})
        raw_text = span.getText()
        cleaned = raw_text.strip('\n').rstrip()
        # skip the fixed nine-character prefix in front of the address
        addresses.append(cleaned[9:])
    return addresses
# enable the API on console | |
# https://console.developers.google.com/apis/dashboard?project=my-trip-142904&duration=PT1H | |
# get access to API | |
def get_lat_lng(add_lst):
    '''Geocode every address and return two parallel lists.

    add_lst: iterable of address strings passed to the Google Maps client.
    Returns (lat_lst, lng_lst). Both lists have one entry per address, in
    input order. When the geocoding result holds no usable location, the
    string 'NA' is stored in BOTH lists for that address.
    '''
    import os  # local import so this block does not depend on file-level imports

    # NOTE(review): an API key is committed in source. It is kept only as a
    # backward-compatible fallback; set GOOGLE_MAPS_API_KEY instead, and
    # consider rotating the committed key.
    api_key = os.environ.get('GOOGLE_MAPS_API_KEY',
                             'AIzaSyAp4nKWDK7gL4hMqm-uPy0S49UMcU3Mqr4')
    gmaps = googlemaps.Client(key=api_key)
    lat_lst = []
    lng_lst = []
    for address in add_lst:
        geocode_result = gmaps.geocode(address)
        try:
            # read both coordinates before appending either, so a failure
            # on one can never leave the two lists misaligned
            location = geocode_result[0]['geometry']['location']
            lat, lng = location['lat'], location['lng']
        except (IndexError, KeyError, TypeError):
            # empty result list or unexpected response shape
            lat, lng = 'NA', 'NA'
        lat_lst.append(lat)
        lng_lst.append(lng)
    return lat_lst, lng_lst
def get_soup(url_head):
    '''Build the url list for url_head with create_url_lst and return what
    create_master_soup produces for that list.'''
    return create_master_soup(create_url_lst(url_head))
def get_data_and_save_stepwise(search_soup_lst, which):
    '''Scrape every search page and save the accumulated result page by page.

    search_soup_lst: iterable of search-page soups; each is expanded into
        museum soups by get_museum_soup.
    which: string inserted into the output file name
        ('tripadvisor_museum' + which + '.csv').

    After every page the accumulated data is written to the CSV file again,
    so that a failure on a later page still leaves the earlier pages on disk.
    '''
    # accumulators, one entry per museum across all pages
    museum_name_lst = []
    review_count_lst = []
    address_lst = []
    lat_lst = []
    lng_lst = []
    rating_lst = []
    rank_lst = []
    total_things_to_do_lst = []
    categories_nested_lst = []
    featured_in_guide_count_lst = []
    phone_num_lst = []
    fee_lst = []
    description_lst = []
    length_of_visit_lst = []
    quote_lst = []
    review_content_lst = []
    tagcloud_lst = []
    traverler_rating_lst = []
    traveler_type_lst = []
    img_link_lst = []
    # save result for each page ran to prevent no result scenario caused by timeoutexception
    for idx, search_soup in enumerate(search_soup_lst):
        # single-argument parenthesised form prints the same on Python 2 and 3
        print("running page " + str(idx + 1))
        museum_soup = get_museum_soup(search_soup)
        museum_name_lst += get_museum_name(museum_soup)
        review_count_lst += get_review_count(museum_soup)
        add_lst = get_address(museum_soup)
        address_lst += add_lst
        lat_lst_, lng_lst_ = get_lat_lng(add_lst)
        lat_lst += lat_lst_
        lng_lst += lng_lst_
        rating_lst += get_rating(museum_soup)
        rank_lst_, total_things_to_do_lst_ = get_rank_total(museum_soup)
        rank_lst += rank_lst_
        total_things_to_do_lst += total_things_to_do_lst_
        categories = get_heading_details(museum_soup)
        categories_nested_lst += get_category(categories)
        featured_in_guide_count_lst += if_featured_count(categories)
        phone_num_lst += get_phone_num(museum_soup)
        fee_lst += check_fee(museum_soup)
        description_lst += get_description(museum_soup)
        length_of_visit_lst += get_length_of_visit(museum_soup)
        # NOTE(review): the lists below (and categories_nested_lst) are
        # collected but never written to the CSV in this function -- confirm
        # whether they are meant to be saved elsewhere.
        quote_lst += get_review_quotes(museum_soup)
        review_content_lst += get_partial_review(museum_soup)
        tagcloud_lst += get_review_tag_cloud(museum_soup)
        traverler_rating_lst += get_rating_details(museum_soup)
        traveler_type_lst += get_traveler_type(museum_soup)
        img_link_lst += get_img_link(museum_soup)
        # save result into dictionary
        museum_dict = {
            'MuseumName': museum_name_lst,
            'ReviewCount': review_count_lst,
            'Address': address_lst,
            'Latitude': lat_lst,
            'Longtitude': lng_lst,
            'Rating': rating_lst,
            'Rank': rank_lst,
            'TotalThingsToDo': total_things_to_do_lst,
            'FeatureCount': featured_in_guide_count_lst,
            'PhoneNum': phone_num_lst,
            'Fee': fee_lst,
            'Description': description_lst,
            'LengthOfVisit': length_of_visit_lst,
        }
        # convert unicode to ascii
        for text_key in ('MuseumName', 'Address', 'Description', 'PhoneNum'):
            museum_dict[text_key] = unicode_to_ascii(museum_dict[text_key])
        # convert dictionary to dataframe
        museum_df = pd.DataFrame(museum_dict)
        # save file as .csv (rewritten after every page)
        museum_df.to_csv('tripadvisor_museum' + which + '.csv')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This section demonstrates how I added museum types as new features
def assign_0_or_1(df, target, dic):
    '''Return a copy of df with one 0/1 indicator column per name in target.

    dic maps a museum name to a collection of values. For each museum the
    indicator column is set to 1 when the column name is contained in that
    museum's values; every other cell stays 0. The input frame is not
    modified. A museum name in dic that is missing from df['MuseumName']
    raises IndexError.
    '''
    result = df.copy(deep=True)
    # start every indicator column at 0
    for column in target:
        result[column] = 0
    for museum_name, value in dic.items():
        # index label of the first row carrying this museum name
        matches = result[result['MuseumName'] == museum_name]
        row_label = matches.index[0]
        for column in target:
            if column not in value:
                continue
            # the museum meets the criterion
            result.loc[row_label, column] = 1
    return result
def add_category_feature(df, category):
    '''Add one 0/1 column per distinct museum type found in category.

    category maps a museum name to a list of type names. The distinct type
    names over all museums are sorted and handed to assign_0_or_1, whose
    result (a new dataframe) is returned.
    '''
    # local import: works on Python 2 and 3, unlike the bare `reduce` builtin
    from itertools import chain

    # flatten all per-museum lists in one pass, then de-duplicate and sort
    lst = sorted(set(chain.from_iterable(category.values())))
    return assign_0_or_1(df, lst, category)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import nltk | |
from nltk import * | |
from textblob import TextBlob | |
def get_nested_sentiment(dic):
    '''Score every text of every museum with TextBlob.

    dic maps a museum name to a list of texts. Returns two dictionaries with
    the same keys: the first holds the list of polarity scores, the second
    the list of subjectivity scores, in the order of the input texts.
    '''
    polarity_dic, subjectivity_dic = {}, {}
    for museum_name, lst in dic.items():
        sentiments = [TextBlob(sentence).sentiment for sentence in lst]
        polarity_dic[museum_name] = [s.polarity for s in sentiments]
        subjectivity_dic[museum_name] = [s.subjectivity for s in sentiments]
    return polarity_dic, subjectivity_dic
def add_multiple_score_feature(df, header, dic):
    '''Return a copy of df with summary statistics of each museum's scores.

    dic maps a museum name to a list of scores. Four columns named
    header + 'var' / 'mean' / 'max' / 'min' are filled in for the row whose
    'MuseumName' equals the key. The input frame is not modified.
    '''
    result = df.copy(deep=True)
    # (column name, aggregation) pairs, applied in this order
    aggregations = (
        (header + 'var', np.var),
        (header + 'mean', np.mean),
        (header + 'max', max),
        (header + 'min', min),
    )
    for museum_name, score_lst in dic.items():
        # index label of the first row carrying this museum name
        row_idx = result[result['MuseumName'] == museum_name].index.tolist()[0]
        for column, aggregate in aggregations:
            result.loc[row_idx, column] = aggregate(score_lst)
    return result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run this file and get sorted recommended museums | |
# import modules | |
import pandas as pd | |
import numpy as np | |
import sys | |
from sklearn.metrics.pairwise import cosine_similarity | |
import json | |
# import data | |
# load the imputed museum table (still carrying the museum names)
museum_df = pd.read_csv("./data/imputed_df_with_name.csv")
# the first column is the unnamed one; remove it
unnamed_col = museum_df.columns[0]
museum_df = museum_df.drop(unnamed_col, axis=1)
# feature matrix: museum_df without "MuseumName", "ReviewCount", "TotalThingsToDo"
# (selected by position 0, 4 and 5)
non_feature_cols = museum_df.columns[[0, 4, 5]]
imputed_df = museum_df.drop(non_feature_cols, axis=1)
# number of museums (rows) in the feature matrix
nrow = imputed_df.shape[0]
def get_museum_lst(target_museum_input):
    '''Split the ';'-separated input and return everything after the first
    field (the text before the first ';' is discarded).'''
    _first, separator, remainder = target_museum_input.partition(';')
    if not separator:
        # no ';' at all: there is nothing after the first field
        return []
    return remainder.split(';')
def get_master_srt_lst(museum_lst):
    '''Concatenate, in input order, the lists returned by
    get_top_five_for_one for every museum in museum_lst.'''
    return [
        entry
        for museum in museum_lst
        for entry in get_top_five_for_one(museum)
    ]
def sort_list(lst):
    '''Return a new list with the items of lst ordered by their second
    element, largest first. The input is left untouched.'''
    ordered = list(lst)
    ordered.sort(key=lambda entry: entry[1], reverse=True)
    return ordered
def get_top_five_for_one(target_museum):
    '''Return the five museums most similar to target_museum.

    Uses the module-level museum_df (names), imputed_df (feature rows) and
    nrow. Returns a list of up to five [museum name, cosine similarity]
    pairs, most similar first. Raises IndexError when target_museum is not
    present in museum_df['MuseumName'].
    '''
    target_idx = museum_df[museum_df['MuseumName'] == target_museum].index.tolist()[0]
    input_vec = np.array(imputed_df.iloc[target_idx]).reshape(1, -1)
    cos_sim = []
    for i in range(nrow):
        # leave the target out explicitly: relying on it sorting first breaks
        # when another museum ties with it (e.g. an identical feature row)
        if i == target_idx:
            continue
        # reshape the row into a 1 x n_features vector
        vec = np.array(imputed_df.iloc[i]).reshape(1, -1)
        # store the cosine similarity along with the museum name
        cos_sim.append([museum_df['MuseumName'][i], cosine_similarity(input_vec, vec)[0][0]])
    # highest similarity first; keep the best five
    return sorted(cos_sim, key=lambda x: x[1], reverse=True)[:5]
def lst_to_dic(lst):
    '''Map the first element of every item in lst to its second element.
    When a first element repeats, the last occurrence wins.'''
    return {entry[0]: entry[1] for entry in lst}
def to_json(name, dic):
    '''Serialise dic as JSON into the file name + '.json' (overwriting it).'''
    with open(name + '.json', 'w') as out_file:
        json.dump(dic, out_file)
def exclude_selected(museum_lst, srt_lst):
    '''Return the items of srt_lst, in order, whose first element is not one
    of the museums in museum_lst.'''
    remaining = []
    for entry in srt_lst:
        if entry[0] in museum_lst:
            # already selected by the user; not a recommendation
            continue
        remaining.append(entry)
    return remaining
def get_sorted_dic(lst):
    '''Number the items of lst starting at 1 and return a dictionary mapping
    each rank to [first element, second element] of the item.'''
    return {
        rank: [entry[0], entry[1]]
        for rank, entry in enumerate(lst, 1)
    }
def get_unique_recom(master_srt_lst):
    '''Remove duplicate recommendations.

    master_srt_lst: list of items whose first element is a museum name and
        whose second element is its score.
    Returns a new list of [name, score] pairs holding only the first
    occurrence of every name, in the original order.
    '''
    seen_names = set()
    uni_lst = []
    for item in master_srt_lst:
        name = item[0]
        if name in seen_names:
            # a pair for this museum was already kept
            continue
        # set membership keeps the whole pass linear in the list length
        seen_names.add(name)
        uni_lst.append([name, item[1]])
    return uni_lst
if __name__ == "__main__":
    # main code: the first command-line argument holds the selected museums
    museum_lst = get_museum_lst(sys.argv[1])
    # gather the recommendations of every selected museum, without duplicates
    uni_lst = get_unique_recom(get_master_srt_lst(museum_lst))
    # order by score and leave out the museums the user already selected
    top_lst = exclude_selected(museum_lst, sort_list(uni_lst))
    # write the ranked suggestions to sorted_suggestion.json
    to_json('sorted_suggestion', get_sorted_dic(top_lst))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment