@annecool37
Last active January 4, 2021 12:12
Museum_recommendation_system
# This section demonstrates:
# 1. How I got the address of each museum
# 2. How I converted the address into latitude and longitude via the Google Maps API
# 3. The main code to scrape the data and save it as a .csv
from bs4 import BeautifulSoup
import pandas as pd
import googlemaps
def get_address(museum_soup):
    '''get address'''
    return [soup.find('span', {'property': 'address'}).getText().strip('\n').rstrip()[9:]
            for soup in museum_soup]
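
# A minimal illustration (invented markup, not actual TripAdvisor HTML) of what
# get_address() expects: the span text starts with the 9-character prefix
# "Address: ", which the [9:] slice strips off.
example_soup = BeautifulSoup('<span property="address">Address: 1000 Fifth Ave, New York, NY</span>',
                             'html.parser')
example_address = get_address([example_soup])  # -> ['1000 Fifth Ave, New York, NY']
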
# enable the API on console
# https://console.developers.google.com/apis/dashboard?project=my-trip-142904&duration=PT1H
# get access to API
def get_lat_lng(add_lst):
    '''get latitude and longitude of each address'''
    gmaps = googlemaps.Client(key='AIzaSyAp4nKWDK7gL4hMqm-uPy0S49UMcU3Mqr4')
    lat_lst = []
    lng_lst = []
    for address in add_lst:
        geocode_result = gmaps.geocode(address)
        try:
            lat_lst.append(geocode_result[0]['geometry']['location']['lat'])
            lng_lst.append(geocode_result[0]['geometry']['location']['lng'])
        except (IndexError, KeyError):
            # fall back to 'NA' when geocoding returns no usable result
            lat_lst.append('NA')
            lng_lst.append('NA')
    # print lat_lst
    return lat_lst, lng_lst
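
# Hedged usage sketch (not from the original gist): geocoding needs a valid API key
# and network access, so the call is left commented out. The addresses are invented.
sample_addresses = ['1000 Fifth Ave, New York, NY 10028',
                    '11 W 53rd St, New York, NY 10019']
# lat_lst, lng_lst = get_lat_lng(sample_addresses)
# Failed lookups come back as the string 'NA', so downstream code should expect a
# mix of floats and 'NA' placeholders in the latitude/longitude columns.
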
def get_soup(url_head):
    '''convert URLs into BeautifulSoup objects'''
    url_lst = create_url_lst(url_head)
    search_soup_lst = create_master_soup(url_lst)
    return search_soup_lst
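
# create_url_lst() and create_master_soup() are called above but not defined in this
# gist. The sketch below is only a guess at their shape: the offset-based pagination
# pattern, the page count, and the use of urllib2 are assumptions (the TimeoutException
# note further down suggests the author may actually have used Selenium).
import urllib2
def create_url_lst_sketch(url_head, num_pages=10):
    '''build one search-result URL per page, assuming url_head has an offset placeholder'''
    return [url_head.format(30 * i) for i in range(num_pages)]
def create_master_soup_sketch(url_lst):
    '''download each page and parse it into a BeautifulSoup object'''
    return [BeautifulSoup(urllib2.urlopen(url).read(), 'html.parser') for url in url_lst]
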
def get_data_and_save_stepwise(search_soup_lst, which):
    '''main code to get the data'''
    # initialize the lists
    museum_name_lst = []
    review_count_lst = []
    address_lst = []
    lat_lst = []
    lng_lst = []
    rating_lst = []
    rank_lst = []
    total_things_to_do_lst = []
    categories_nested_lst = []
    featured_in_guide_count_lst = []
    phone_num_lst = []
    fee_lst = []
    description_lst = []
    length_of_visit_lst = []
    quote_lst = []
    review_content_lst = []
    tagcloud_lst = []
    traveler_rating_lst = []
    traveler_type_lst = []
    img_link_lst = []
    # save the result after each page so a TimeoutException does not wipe out everything
    for idx, search_soup in enumerate(search_soup_lst):
        print "running page " + str(idx + 1)
        museum_soup = get_museum_soup(search_soup)
        museum_name_lst += get_museum_name(museum_soup)
        review_count_lst += get_review_count(museum_soup)
        add_lst = get_address(museum_soup)
        address_lst += add_lst
        lat_lst_, lng_lst_ = get_lat_lng(add_lst)
        lat_lst += lat_lst_
        lng_lst += lng_lst_
        rating_lst += get_rating(museum_soup)
        rank_lst_, total_things_to_do_lst_ = get_rank_total(museum_soup)
        rank_lst += rank_lst_
        total_things_to_do_lst += total_things_to_do_lst_
        categories = get_heading_details(museum_soup)
        categories_nested_lst += get_category(categories)
        featured_in_guide_count_lst += if_featured_count(categories)
        phone_num_lst += get_phone_num(museum_soup)
        fee_lst += check_fee(museum_soup)
        description_lst += get_description(museum_soup)
        length_of_visit_lst += get_length_of_visit(museum_soup)
        quote_lst += get_review_quotes(museum_soup)
        review_content_lst += get_partial_review(museum_soup)
        tagcloud_lst += get_review_tag_cloud(museum_soup)
        traveler_rating_lst += get_rating_details(museum_soup)
        traveler_type_lst += get_traveler_type(museum_soup)
        img_link_lst += get_img_link(museum_soup)
        # save the results collected so far into a dictionary
        museum_dict = {'MuseumName': museum_name_lst, 'ReviewCount': review_count_lst,
                       'Address': address_lst, 'Latitude': lat_lst, 'Longitude': lng_lst,
                       'Rating': rating_lst, 'Rank': rank_lst, 'TotalThingsToDo': total_things_to_do_lst,
                       'FeatureCount': featured_in_guide_count_lst, 'PhoneNum': phone_num_lst, 'Fee': fee_lst,
                       'Description': description_lst, 'LengthOfVisit': length_of_visit_lst}
        # convert unicode to ascii
        museum_dict['MuseumName'] = unicode_to_ascii(museum_dict['MuseumName'])
        museum_dict['Address'] = unicode_to_ascii(museum_dict['Address'])
        museum_dict['Description'] = unicode_to_ascii(museum_dict['Description'])
        museum_dict['PhoneNum'] = unicode_to_ascii(museum_dict['PhoneNum'])
        # convert the dictionary to a dataframe
        museum_df = pd.DataFrame(museum_dict)
        # overwrite the .csv with the cumulative results
        museum_df.to_csv('tripadvisor_museum' + which + '.csv')
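
# unicode_to_ascii() is referenced above but not defined in this gist. A minimal
# sketch, assuming it simply drops characters that cannot be encoded as ASCII
# (the author's actual handling may differ):
def unicode_to_ascii_sketch(lst):
    '''encode each unicode string as ASCII, ignoring unmappable characters'''
    return [s.encode('ascii', 'ignore') if isinstance(s, unicode) else s for s in lst]
# Hypothetical top-level call; the URL template and the 'nyc' suffix are illustrative only.
# search_soup_lst = get_soup('https://www.tripadvisor.com/...&oa={}')
# get_data_and_save_stepwise(search_soup_lst, 'nyc')
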
# This section demonstrates how I added museum types as new features
def assign_0_or_1(df, target, dic):
    '''set a target column to 1 if the museum's category values contain that item'''
    df = df.copy(deep=True)
    # initialize each column with 0
    for sub_item in target:
        df[sub_item] = 0
    for museum_name, value in dic.items():
        # get the index of the specific museum name
        idx = df[df['MuseumName'] == museum_name].index.tolist()[0]
        for sub_item in target:
            if sub_item in value:
                # assign 1 if the museum meets the criterion
                df.loc[idx, sub_item] = 1
    return df
def add_category_feature(df, category):
    '''add several museum types as new features'''
    category_lst = reduce(lambda x, y: x + y, category.values(), [])
    lst = list(set(category_lst))
    lst.sort()
    df = assign_0_or_1(df, lst, category)
    return df
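
# Toy usage sketch (invented data, not the scraped dataset): each key of the category
# dict is a museum name and each value is its list of category labels.
toy_df = pd.DataFrame({'MuseumName': ['Museum A', 'Museum B']})
toy_category = {'Museum A': ['Art Museums', 'History Museums'],
                'Museum B': ['Science Museums']}
toy_df = add_category_feature(toy_df, toy_category)
# toy_df now has 0/1 indicator columns 'Art Museums', 'History Museums', 'Science Museums'
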
import numpy as np
import nltk
from textblob import TextBlob
def get_nested_sentiment(dic):
    '''get polarity and subjectivity scores for each text in the nested list'''
    polarity_dic = {}
    subjectivity_dic = {}
    for museum_name, lst in dic.items():
        polarity_lst = []
        subjectivity_lst = []
        for sentence in lst:
            blob = TextBlob(sentence)
            polarity_lst.append(blob.sentiment.polarity)
            subjectivity_lst.append(blob.sentiment.subjectivity)
        polarity_dic[museum_name] = polarity_lst
        subjectivity_dic[museum_name] = subjectivity_lst
    return polarity_dic, subjectivity_dic
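
# Toy usage sketch with invented review snippets; the real input is the nested review
# text collected by the scraper.
toy_reviews = {'Museum A': ['Wonderful collection and friendly staff.',
                            'A bit crowded on weekends.']}
polarity_dic, subjectivity_dic = get_nested_sentiment(toy_reviews)
# polarity_dic['Museum A'] holds one polarity score per snippet (in [-1, 1]);
# subjectivity scores fall in [0, 1].
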
def add_multiple_score_feature(df, header, dic):
    '''add the var, mean, max, and min of the sentiment scores as new features'''
    df = df.copy(deep=True)
    colnames = [header + 'var', header + 'mean', header + 'max', header + 'min']
    for museum_name, score_lst in dic.items():
        # get the row index of the museum
        row_idx = df[df['MuseumName'] == museum_name].index.tolist()[0]
        # create new score features: var, mean, max, and min of score_lst
        df.loc[row_idx, colnames[0]] = np.var(score_lst)
        df.loc[row_idx, colnames[1]] = np.mean(score_lst)
        df.loc[row_idx, colnames[2]] = max(score_lst)
        df.loc[row_idx, colnames[3]] = min(score_lst)
    return df
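
# Chaining sketch (invented scores): the 'Polarity' prefix is an assumption, giving
# columns 'Polarityvar', 'Polaritymean', 'Polaritymax', and 'Polaritymin'.
toy_sent_df = pd.DataFrame({'MuseumName': ['Museum A']})
toy_scores = {'Museum A': [0.8, -0.1, 0.25]}
toy_sent_df = add_multiple_score_feature(toy_sent_df, 'Polarity', toy_scores)
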
# Run this file to get the sorted recommended museums
# import modules
import pandas as pd
import numpy as np
import sys
from sklearn.metrics.pairwise import cosine_similarity
import json
# import data
museum_df = pd.read_csv("./data/imputed_df_with_name.csv")
# drop unnamed column
museum_df = museum_df.drop(museum_df.columns[0], axis=1)
# drop the columns "MuseumName", "ReviewCount", and "TotalThingsToDo"
imputed_df = museum_df.drop(museum_df.columns[[0,4,5]], axis=1)
# get number of rows of the dataframe
nrow = imputed_df.shape[0]
def get_museum_lst(target_museum_input):
    '''get the museum lst from input'''
    return target_museum_input.split(';')[1:]
def get_master_srt_lst(museum_lst):
    '''concatenate all top five lists for museums in museum_lst'''
    master_srt_lst = []
    for m in museum_lst:
        master_srt_lst += get_top_five_for_one(m)
    return master_srt_lst
def sort_list(lst):
    '''sort the nested list based on the second item in list'''
    sorted_lst = sorted(lst, key=lambda x: x[1], reverse=True)
    return sorted_lst
def get_top_five_for_one(target_museum):
    '''get the top five museums and their cosine similarity for one museum'''
    target_idx = museum_df[museum_df['MuseumName'] == target_museum].index.tolist()[0]
    input_vec = np.array(imputed_df.iloc[target_idx]).reshape(1, -1)
    cos_sim = []
    for i in range(nrow):
        # reshape the row into a vector
        vec = np.array(imputed_df.iloc[i]).reshape(1, -1)
        # compute and store the cosine similarity along with the museum name
        cos_sim.append([museum_df['MuseumName'][i], cosine_similarity(input_vec, vec)[0][0]])
    top_five = sort_list(cos_sim)
    return top_five[1:6]  # ignore the top one since it's the target museum itself
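
# An equivalent, vectorized alternative (not the author's code): one cosine_similarity
# call over the whole matrix replaces the per-row Python loop above.
def get_top_five_for_one_vectorized(target_museum):
    '''same result as get_top_five_for_one, computed with a single matrix operation'''
    target_idx = museum_df[museum_df['MuseumName'] == target_museum].index.tolist()[0]
    input_vec = np.array(imputed_df.iloc[target_idx]).reshape(1, -1)
    sims = cosine_similarity(input_vec, imputed_df.values)[0]
    cos_sim = [[museum_df['MuseumName'][i], sims[i]] for i in range(nrow)]
    return sort_list(cos_sim)[1:6]
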
def lst_to_dic(lst):
    '''convert lst into dictionary'''
    top_five_dic = {}
    for i in lst:
        top_five_dic[i[0]] = i[1]
    return top_five_dic
def to_json(name, dic):
    '''write dictionary to json file'''
    filename = name + '.json'
    with open(filename, 'w') as f:
        json.dump(dic, f)
def exclude_selected(museum_lst, srt_lst):
    '''exclude museums selected from the sorted list'''
    return [x for x in srt_lst if x[0] not in museum_lst]
def get_sorted_dic(lst):
    '''convert sorted list into sorted dictionary'''
    dic = {}
    for idx, item in enumerate(lst):
        dic[idx + 1] = [item[0], item[1]]
    return dic
def get_unique_recom(master_srt_lst):
    '''remove duplicate recommendations'''
    unique_name = list(set([i[0] for i in master_srt_lst]))
    uni_lst = []
    for i in master_srt_lst:
        if i[0] in unique_name:
            uni_lst.append([i[0], i[1]])
            unique_name.pop(unique_name.index(i[0]))
    return uni_lst
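
# Small invented example of the de-duplication behavior: duplicates keep whichever
# entry appears first in master_srt_lst (sorting by similarity happens afterwards).
_example_unique = get_unique_recom([['Museum A', 0.91],
                                    ['Museum B', 0.88],
                                    ['Museum A', 0.95]])
# _example_unique == [['Museum A', 0.91], ['Museum B', 0.88]]
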
if __name__ == "__main__":
    # main code
    museum_lst = get_museum_lst(sys.argv[1])
    master_srt_lst = get_master_srt_lst(museum_lst)
    uni_lst = get_unique_recom(master_srt_lst)
    sorted_lst = sort_list(uni_lst)
    top_lst = exclude_selected(museum_lst, sorted_lst)
    sorted_dic = get_sorted_dic(top_lst)
    to_json('sorted_suggestion', sorted_dic)
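
# Hedged invocation sketch: the script filename and museum names are placeholders.
# get_museum_lst() drops the first ';'-separated field, so the selected museums
# follow a leading ';':
#   python recommender.py ";Metropolitan Museum of Art;American Museum of Natural History"
# The ranked, de-duplicated recommendations are written to sorted_suggestion.json,
# keyed 1..N with [museum name, cosine similarity] values.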