@annecool37
Last active January 4, 2021 12:12
Museum_recommendation_system
# This section demonstrates:
# 1. How I got the address of each museum
# 2. How I converted the address into latitude and longitude via the Google Maps API
# 3. The main code to scrape the data and save it as a .csv
from bs4 import BeautifulSoup
import pandas as pd
import googlemaps
def get_address(museum_soup):
    '''get address'''
    return [soup.find('span', {'property': 'address'}).getText().strip('\n').rstrip()[9:]
            for soup in museum_soup]
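
# A minimal illustration (invented markup, not actual TripAdvisor HTML) of what
# get_address() expects: the span text starts with the 9-character prefix
# "Address: ", which the [9:] slice strips off.
example_soup = BeautifulSoup('<span property="address">Address: 1000 Fifth Ave, New York, NY</span>',
                             'html.parser')
example_address = get_address([example_soup])  # -> ['1000 Fifth Ave, New York, NY']
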
# enable the API on console
# https://console.developers.google.com/apis/dashboard?project=my-trip-142904&duration=PT1H
# get access to API
def get_lat_lng(add_lst):
    '''get latitude and longitude of each address'''
    gmaps = googlemaps.Client(key='AIzaSyAp4nKWDK7gL4hMqm-uPy0S49UMcU3Mqr4')
    lat_lst = []
    lng_lst = []
    for address in add_lst:
        geocode_result = gmaps.geocode(address)
        try:
            lat_lst.append(geocode_result[0]['geometry']['location']['lat'])
            lng_lst.append(geocode_result[0]['geometry']['location']['lng'])
        except (IndexError, KeyError):
            # fall back to 'NA' when geocoding returns no usable result
            lat_lst.append('NA')
            lng_lst.append('NA')
    # print lat_lst
    return lat_lst, lng_lst
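
# Hedged usage sketch (not from the original gist): geocoding needs a valid API key
# and network access, so the call is left commented out. The addresses are invented.
sample_addresses = ['1000 Fifth Ave, New York, NY 10028',
                    '11 W 53rd St, New York, NY 10019']
# lat_lst, lng_lst = get_lat_lng(sample_addresses)
# Failed lookups come back as the string 'NA', so downstream code should expect a
# mix of floats and 'NA' placeholders in the latitude/longitude columns.
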
def get_soup(url_head):
    '''convert URLs into BeautifulSoup objects'''
    url_lst = create_url_lst(url_head)
    search_soup_lst = create_master_soup(url_lst)
    return search_soup_lst
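
# create_url_lst() and create_master_soup() are called above but not defined in this
# gist. The sketch below is only a guess at their shape: the offset-based pagination
# pattern, the page count, and the use of urllib2 are assumptions (the TimeoutException
# note further down suggests the author may actually have used Selenium).
import urllib2
def create_url_lst_sketch(url_head, num_pages=10):
    '''build one search-result URL per page, assuming url_head has an offset placeholder'''
    return [url_head.format(30 * i) for i in range(num_pages)]
def create_master_soup_sketch(url_lst):
    '''download each page and parse it into a BeautifulSoup object'''
    return [BeautifulSoup(urllib2.urlopen(url).read(), 'html.parser') for url in url_lst]
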
def get_data_and_save_stepwise(search_soup_lst, which):
    '''main code to get the data'''
    # initialize the lists
    museum_name_lst = []
    review_count_lst = []
    address_lst = []
    lat_lst = []
    lng_lst = []
    rating_lst = []
    rank_lst = []
    total_things_to_do_lst = []
    categories_nested_lst = []
    featured_in_guide_count_lst = []
    phone_num_lst = []
    fee_lst = []
    description_lst = []
    length_of_visit_lst = []
    quote_lst = []
    review_content_lst = []
    tagcloud_lst = []
    traveler_rating_lst = []
    traveler_type_lst = []
    img_link_lst = []
    # save the result after each page so a TimeoutException does not wipe out everything
    for idx, search_soup in enumerate(search_soup_lst):
        print "running page " + str(idx + 1)
        museum_soup = get_museum_soup(search_soup)
        museum_name_lst += get_museum_name(museum_soup)
        review_count_lst += get_review_count(museum_soup)
        add_lst = get_address(museum_soup)
        address_lst += add_lst
        lat_lst_, lng_lst_ = get_lat_lng(add_lst)
        lat_lst += lat_lst_
        lng_lst += lng_lst_
        rating_lst += get_rating(museum_soup)
        rank_lst_, total_things_to_do_lst_ = get_rank_total(museum_soup)
        rank_lst += rank_lst_
        total_things_to_do_lst += total_things_to_do_lst_
        categories = get_heading_details(museum_soup)
        categories_nested_lst += get_category(categories)
        featured_in_guide_count_lst += if_featured_count(categories)
        phone_num_lst += get_phone_num(museum_soup)
        fee_lst += check_fee(museum_soup)
        description_lst += get_description(museum_soup)
        length_of_visit_lst += get_length_of_visit(museum_soup)
        quote_lst += get_review_quotes(museum_soup)
        review_content_lst += get_partial_review(museum_soup)
        tagcloud_lst += get_review_tag_cloud(museum_soup)
        traveler_rating_lst += get_rating_details(museum_soup)
        traveler_type_lst += get_traveler_type(museum_soup)
        img_link_lst += get_img_link(museum_soup)
        # save the results collected so far into a dictionary
        museum_dict = {'MuseumName': museum_name_lst, 'ReviewCount': review_count_lst,
                       'Address': address_lst, 'Latitude': lat_lst, 'Longitude': lng_lst,
                       'Rating': rating_lst, 'Rank': rank_lst, 'TotalThingsToDo': total_things_to_do_lst,
                       'FeatureCount': featured_in_guide_count_lst, 'PhoneNum': phone_num_lst, 'Fee': fee_lst,
                       'Description': description_lst, 'LengthOfVisit': length_of_visit_lst}
        # convert unicode to ascii
        museum_dict['MuseumName'] = unicode_to_ascii(museum_dict['MuseumName'])
        museum_dict['Address'] = unicode_to_ascii(museum_dict['Address'])
        museum_dict['Description'] = unicode_to_ascii(museum_dict['Description'])
        museum_dict['PhoneNum'] = unicode_to_ascii(museum_dict['PhoneNum'])
        # convert the dictionary to a dataframe
        museum_df = pd.DataFrame(museum_dict)
        # overwrite the .csv with the cumulative results
        museum_df.to_csv('tripadvisor_museum' + which + '.csv')
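
# unicode_to_ascii() is referenced above but not defined in this gist. A minimal
# sketch, assuming it simply drops characters that cannot be encoded as ASCII
# (the author's actual handling may differ):
def unicode_to_ascii_sketch(lst):
    '''encode each unicode string as ASCII, ignoring unmappable characters'''
    return [s.encode('ascii', 'ignore') if isinstance(s, unicode) else s for s in lst]
# Hypothetical top-level call; the URL template and the 'nyc' suffix are illustrative only.
# search_soup_lst = get_soup('https://www.tripadvisor.com/...&oa={}')
# get_data_and_save_stepwise(search_soup_lst, 'nyc')
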
# This section demonstrates how I added museum types as new features
def assign_0_or_1(df, target, dic):
    '''set a target column to 1 if the museum's category values contain that item'''
    df = df.copy(deep=True)
    # initialize each column with 0
    for sub_item in target:
        df[sub_item] = 0
    for museum_name, value in dic.items():
        # get the index of the specific museum name
        idx = df[df['MuseumName'] == museum_name].index.tolist()[0]
        for sub_item in target:
            if sub_item in value:
                # assign 1 if the museum meets the criterion
                df.loc[idx, sub_item] = 1
    return df
def add_category_feature(df, category):
    '''add several museum types as new features'''
    category_lst = reduce(lambda x, y: x + y, category.values(), [])
    lst = list(set(category_lst))
    lst.sort()
    df = assign_0_or_1(df, lst, category)
    return df
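
# Toy usage sketch (invented data, not the scraped dataset): each key of the category
# dict is a museum name and each value is its list of category labels.
toy_df = pd.DataFrame({'MuseumName': ['Museum A', 'Museum B']})
toy_category = {'Museum A': ['Art Museums', 'History Museums'],
                'Museum B': ['Science Museums']}
toy_df = add_category_feature(toy_df, toy_category)
# toy_df now has 0/1 indicator columns 'Art Museums', 'History Museums', 'Science Museums'
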
import numpy as np
import nltk
from textblob import TextBlob
def get_nested_sentiment(dic):
    '''get polarity and subjectivity scores for each text in the nested list'''
    polarity_dic = {}
    subjectivity_dic = {}
    for museum_name, lst in dic.items():
        polarity_lst = []
        subjectivity_lst = []
        for sentence in lst:
            blob = TextBlob(sentence)
            polarity_lst.append(blob.sentiment.polarity)
            subjectivity_lst.append(blob.sentiment.subjectivity)
        polarity_dic[museum_name] = polarity_lst
        subjectivity_dic[museum_name] = subjectivity_lst
    return polarity_dic, subjectivity_dic
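
# Toy usage sketch with invented review snippets; the real input is the nested review
# text collected by the scraper.
toy_reviews = {'Museum A': ['Wonderful collection and friendly staff.',
                            'A bit crowded on weekends.']}
polarity_dic, subjectivity_dic = get_nested_sentiment(toy_reviews)
# polarity_dic['Museum A'] holds one polarity score per snippet (in [-1, 1]);
# subjectivity scores fall in [0, 1].
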
def add_multiple_score_feature(df, header, dic):
    '''add the var, mean, max, and min of the sentiment scores as new features'''
    df = df.copy(deep=True)
    colnames = [header + 'var', header + 'mean', header + 'max', header + 'min']
    for museum_name, score_lst in dic.items():
        # get the row index of the museum
        row_idx = df[df['MuseumName'] == museum_name].index.tolist()[0]
        # create new score features: var, mean, max, and min of score_lst
        df.loc[row_idx, colnames[0]] = np.var(score_lst)
        df.loc[row_idx, colnames[1]] = np.mean(score_lst)
        df.loc[row_idx, colnames[2]] = max(score_lst)
        df.loc[row_idx, colnames[3]] = min(score_lst)
    return df
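
# Chaining sketch (invented scores): the 'Polarity' prefix is an assumption, giving
# columns 'Polarityvar', 'Polaritymean', 'Polaritymax', and 'Polaritymin'.
toy_sent_df = pd.DataFrame({'MuseumName': ['Museum A']})
toy_scores = {'Museum A': [0.8, -0.1, 0.25]}
toy_sent_df = add_multiple_score_feature(toy_sent_df, 'Polarity', toy_scores)
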
# Run this file to get the sorted recommended museums
# import modules
import pandas as pd
import numpy as np
import sys
from sklearn.metrics.pairwise import cosine_similarity
import json
# import data
museum_df = pd.read_csv("./data/imputed_df_with_name.csv")
# drop unnamed column
museum_df = museum_df.drop(museum_df.columns[0], axis=1)
# drop the columns "MuseumName", "ReviewCount", and "TotalThingsToDo"
imputed_df = museum_df.drop(museum_df.columns[[0,4,5]], axis=1)
# get number of rows of the dataframe
nrow = imputed_df.shape[0]
def get_museum_lst(target_museum_input):
    '''get the museum lst from input'''
    return target_museum_input.split(';')[1:]
def get_master_srt_lst(museum_lst):
    '''concatenate all top five lists for museums in museum_lst'''
    master_srt_lst = []
    for m in museum_lst:
        master_srt_lst += get_top_five_for_one(m)
    return master_srt_lst
def sort_list(lst):
    '''sort the nested list based on the second item in list'''
    sorted_lst = sorted(lst, key=lambda x: x[1], reverse=True)
    return sorted_lst
def get_top_five_for_one(target_museum):
    '''get the top five museums and their cosine similarity for one museum'''
    target_idx = museum_df[museum_df['MuseumName'] == target_museum].index.tolist()[0]
    input_vec = np.array(imputed_df.iloc[target_idx]).reshape(1, -1)
    cos_sim = []
    for i in range(nrow):
        # reshape the row into a vector
        vec = np.array(imputed_df.iloc[i]).reshape(1, -1)
        # compute and store the cosine similarity along with the museum name
        cos_sim.append([museum_df['MuseumName'][i], cosine_similarity(input_vec, vec)[0][0]])
    top_five = sort_list(cos_sim)
    return top_five[1:6]  # ignore the top one since it's the target museum itself
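
# An equivalent, vectorized alternative (not the author's code): one cosine_similarity
# call over the whole matrix replaces the per-row Python loop above.
def get_top_five_for_one_vectorized(target_museum):
    '''same result as get_top_five_for_one, computed with a single matrix operation'''
    target_idx = museum_df[museum_df['MuseumName'] == target_museum].index.tolist()[0]
    input_vec = np.array(imputed_df.iloc[target_idx]).reshape(1, -1)
    sims = cosine_similarity(input_vec, imputed_df.values)[0]
    cos_sim = [[museum_df['MuseumName'][i], sims[i]] for i in range(nrow)]
    return sort_list(cos_sim)[1:6]
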
def lst_to_dic(lst):
    '''convert lst into dictionary'''
    top_five_dic = {}
    for i in lst:
        top_five_dic[i[0]] = i[1]
    return top_five_dic
def to_json(name, dic):
    '''write dictionary to json file'''
    filename = name + '.json'
    with open(filename, 'w') as f:
        json.dump(dic, f)
def exclude_selected(museum_lst, srt_lst):
    '''exclude museums selected from the sorted list'''
    return [x for x in srt_lst if x[0] not in museum_lst]
def get_sorted_dic(lst):
    '''convert sorted list into sorted dictionary'''
    dic = {}
    for idx, item in enumerate(lst):
        dic[idx + 1] = [item[0], item[1]]
    return dic
def get_unique_recom(master_srt_lst):
    '''remove duplicate recommendations'''
    unique_name = list(set([i[0] for i in master_srt_lst]))
    uni_lst = []
    for i in master_srt_lst:
        if i[0] in unique_name:
            uni_lst.append([i[0], i[1]])
            unique_name.pop(unique_name.index(i[0]))
    return uni_lst
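
# Small invented example of the de-duplication behavior: duplicates keep whichever
# entry appears first in master_srt_lst (sorting by similarity happens afterwards).
_example_unique = get_unique_recom([['Museum A', 0.91],
                                    ['Museum B', 0.88],
                                    ['Museum A', 0.95]])
# _example_unique == [['Museum A', 0.91], ['Museum B', 0.88]]
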
if __name__ == "__main__":
    # main code
    museum_lst = get_museum_lst(sys.argv[1])
    master_srt_lst = get_master_srt_lst(museum_lst)
    uni_lst = get_unique_recom(master_srt_lst)
    sorted_lst = sort_list(uni_lst)
    top_lst = exclude_selected(museum_lst, sorted_lst)
    sorted_dic = get_sorted_dic(top_lst)
    to_json('sorted_suggestion', sorted_dic)
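
# Hedged invocation sketch: the script filename and museum names are placeholders.
# get_museum_lst() drops the first ';'-separated field, so the selected museums
# follow a leading ';':
#   python recommender.py ";Metropolitan Museum of Art;American Museum of Natural History"
# The ranked, de-duplicated recommendations are written to sorted_suggestion.json,
# keyed 1..N with [museum name, cosine similarity] values.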