from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# df with a 'Review Text' column is assumed loaded upstream.
# TF-IDF vectorize the reviews, then reduce to 6 latent topics with LSA.
reindexed_data = df['Review Text'].values
tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)
n_topics = 6
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)

def get_keys(topic_matrix):
    '''
    Return an integer list of the highest-scoring topic for each document.
    '''
    return topic_matrix.argmax(axis=1).tolist()
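A quick usage sketch (hedged; Counter is from the standard library): count how many reviews land in each LSA topic.

from collections import Counter

lsa_keys = get_keys(lsa_topic_matrix)
print(Counter(lsa_keys))  # number of documents per topic index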
# Imports for LDA topic modeling and interactive pyLDAvis visualization
import random
import re, nltk, spacy, gensim
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
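A minimal sketch of the workflow these imports support, assuming the same df['Review Text'] column as above and an arbitrary six topics:

vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(df['Review Text'])
lda = LatentDirichletAllocation(n_components=6, random_state=0).fit(dtm)
# Interactive topic browser (pyLDAvis < 3.4 exposes the sklearn module)
panel = pyLDAvis.sklearn.prepare(lda, dtm, vectorizer)
pyLDAvis.save_html(panel, 'lda.html')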
from math import radians, cos, sin, asin, sqrt
import numpy as np
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).
    All args must be of equal length.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    return 6371 * 2 * np.arcsin(np.sqrt(a))  # mean earth radius in km
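Usage sketch (coordinates are illustrative decimal degrees):

# Toronto -> New York City, roughly 550 km
print(haversine_np(-79.38, 43.65, -74.01, 40.71))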
def HitRate(topNPredicted, leftOutPredictions):
    hits = 0
    total = 0
    # For each left-out rating
    for leftOut in leftOutPredictions:
        userID = leftOut[0]
        leftOutMovieID = leftOut[1]
        # Is it in the predicted top 10 for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if int(leftOutMovieID) == int(movieID):
                hit = True
                break
        if hit:
            hits += 1
        total += 1
    return hits / total
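Hedged usage note: this assumes topNPredicted is a dict keyed by user ID holding (movieID, predictedRating) pairs, and that each leftOutPredictions entry starts with (userID, movieID, ...), as in leave-one-out evaluation.

# One held-out rating per user vs. that user's predicted top-N list
print("Hit rate:", HitRate(topNPredicted, leftOutPredictions))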
susanli2016 / Baseball_data.ipynb
Created January 12, 2017 06:55
Analyzing Baseball Data with Numpy and Pandas
from sklearn.preprocessing import MinMaxScaler

def recommend(customer_id, sparse_customer_item, customer_vecs, item_vecs, num_items=10):
    # 1 for items the customer has NOT bought, 0 for items already bought
    customer_interactions = sparse_customer_item[customer_id, :].toarray().reshape(-1) + 1
    customer_interactions[customer_interactions > 1] = 0
    # Score every item for this customer and scale the scores to [0, 1]
    rec_vector = customer_vecs[customer_id, :].dot(item_vecs.T).toarray()
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
    # Zero out already-bought items, then take the top-scoring item indices
    recommend_vector = customer_interactions * rec_vector_scaled
    item_idx = np.argsort(recommend_vector)[::-1][:num_items]
    return item_idx
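Usage sketch (hedged; customer_vecs and item_vecs are assumed to be sparse factor matrices from a model such as implicit's ALS, and the ID is illustrative):

top_items = recommend(42, sparse_customer_item, customer_vecs, item_vecs)
print(top_items)  # indices of the 10 highest-scoring unbought items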
from pyod.models.cblof import CBLOF

outliers_fraction = 0.01
# Grid over the (scaled) feature space, used to plot the decision boundary
xx, yy = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
# Fit a Cluster-Based Local Outlier Factor model
clf = CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=0)
clf.fit(X)
# pyod's decision_function: higher = more abnormal; negated so outliers score lowest
scores_pred = clf.decision_function(X) * -1
y_pred = clf.predict(X)  # 0 = inlier, 1 = outlier
n_inliers = len(y_pred) - np.count_nonzero(y_pred)
n_outliers = np.count_nonzero(y_pred == 1)
plt.figure(figsize=(8, 8))
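The meshgrid above is presumably for shading the decision surface; a minimal sketch, assuming X is a 2-column min-max-scaled array:

Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=10)
plt.show()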
import pandas as pd
import plotly.express as px

# df_g7 is assumed to hold FRED series-search results from an earlier step;
# it is reassigned below with the fetched data.
g7_list = df_g7['series_id'].tolist()
start_date = '1970-01-01'
end_date = '2021-10-01'
df_g7 = get_fred_data(series_list=g7_list,
                      start_date=start_date,
                      end_date=end_date)
# Reshape from one column per series to long format for plotting
df_g7_melt = pd.melt(df_g7, id_vars=['DATE'], value_vars=g7_list, var_name='SERIES_ID', value_name='VALUE')

# Map each series ID to a country name by matching the embedded country code
countries = {'US': 'United States', 'GB': 'United Kingdom', 'FR': 'France', 'CA': 'Canada', 'DE': 'Germany', 'JP': 'Japan', 'IT': 'Italy'}

def check_country(x):
    for country in countries:
        if country.lower() in x.lower():
            return countries[country]
    return ''

df_g7_melt['COUNTRY'] = df_g7_melt['SERIES_ID'].map(check_country)

fig = px.line(df_g7_melt, x="DATE", y="VALUE", color='COUNTRY', title='Real Residential Property Prices - All G7 Countries')
fig.update_layout(title=dict(x=0.5, y=0.95))
fig.show()
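get_fred_data is not a standard library call; a hypothetical stand-in built on pandas_datareader's FRED reader (the function name and DATE column are assumed from the snippet above):

from pandas_datareader import data as pdr

def get_fred_data(series_list, start_date, end_date):
    # One column per series, indexed by a DatetimeIndex named DATE
    df = pdr.DataReader(series_list, 'fred', start_date, end_date)
    return df.reset_index()  # promote the index to a 'DATE' column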