from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# df with a 'Review Text' column is assumed loaded upstream.
# TF-IDF vectorize the reviews, then reduce to 6 latent topics with LSA.
reindexed_data = df['Review Text'].values
tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)
n_topics = 6
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)

def get_keys(topic_matrix):
    '''
    Return an integer list of the highest-scoring topic for each document.
    '''
    return topic_matrix.argmax(axis=1).tolist()
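A quick usage sketch (hedged; Counter is from the standard library): count how many reviews land in each LSA topic.

from collections import Counter

lsa_keys = get_keys(lsa_topic_matrix)
print(Counter(lsa_keys))  # number of documents per topic index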
# Imports for LDA topic modeling and interactive pyLDAvis visualization
import random
import re, nltk, spacy, gensim
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
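A minimal sketch of the workflow these imports support, assuming the same df['Review Text'] column as above and an arbitrary six topics:

vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(df['Review Text'])
lda = LatentDirichletAllocation(n_components=6, random_state=0).fit(dtm)
# Interactive topic browser (pyLDAvis < 3.4 exposes the sklearn module)
panel = pyLDAvis.sklearn.prepare(lda, dtm, vectorizer)
pyLDAvis.save_html(panel, 'lda.html')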
from math import radians, cos, sin, asin, sqrt
import numpy as np
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).
    All args must be of equal length.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    return 6371 * 2 * np.arcsin(np.sqrt(a))  # mean earth radius in km
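Usage sketch (coordinates are illustrative decimal degrees):

# Toronto -> New York City, roughly 550 km
print(haversine_np(-79.38, 43.65, -74.01, 40.71))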
def HitRate(topNPredicted, leftOutPredictions):
    hits = 0
    total = 0
    # For each left-out rating
    for leftOut in leftOutPredictions:
        userID = leftOut[0]
        leftOutMovieID = leftOut[1]
        # Is it in the predicted top 10 for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if int(leftOutMovieID) == int(movieID):
                hit = True
                break
        if hit:
            hits += 1
        total += 1
    return hits / total
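Hedged usage note: this assumes topNPredicted is a dict keyed by user ID holding (movieID, predictedRating) pairs, and that each leftOutPredictions entry starts with (userID, movieID, ...), as in leave-one-out evaluation.

# One held-out rating per user vs. that user's predicted top-N list
print("Hit rate:", HitRate(topNPredicted, leftOutPredictions))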
susanli2016 / Baseball_data.ipynb
Created January 12, 2017 06:55
Analyzing Baseball Data with Numpy and Pandas
from sklearn.preprocessing import MinMaxScaler

def recommend(customer_id, sparse_customer_item, customer_vecs, item_vecs, num_items=10):
    # 1 for items the customer has NOT bought, 0 for items already bought
    customer_interactions = sparse_customer_item[customer_id, :].toarray().reshape(-1) + 1
    customer_interactions[customer_interactions > 1] = 0
    # Score every item for this customer and scale the scores to [0, 1]
    rec_vector = customer_vecs[customer_id, :].dot(item_vecs.T).toarray()
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
    # Zero out already-bought items, then take the top-scoring item indices
    recommend_vector = customer_interactions * rec_vector_scaled
    item_idx = np.argsort(recommend_vector)[::-1][:num_items]
    return item_idx
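Usage sketch (hedged; customer_vecs and item_vecs are assumed to be sparse factor matrices from a model such as implicit's ALS, and the ID is illustrative):

top_items = recommend(42, sparse_customer_item, customer_vecs, item_vecs)
print(top_items)  # indices of the 10 highest-scoring unbought items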
from pyod.models.cblof import CBLOF

outliers_fraction = 0.01
# Grid over the (scaled) feature space, used to plot the decision boundary
xx, yy = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
# Fit a Cluster-Based Local Outlier Factor model
clf = CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=0)
clf.fit(X)
# pyod's decision_function: higher = more abnormal; negated so outliers score lowest
scores_pred = clf.decision_function(X) * -1
y_pred = clf.predict(X)  # 0 = inlier, 1 = outlier
n_inliers = len(y_pred) - np.count_nonzero(y_pred)
n_outliers = np.count_nonzero(y_pred == 1)
plt.figure(figsize=(8, 8))
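The meshgrid above is presumably for shading the decision surface; a minimal sketch, assuming X is a 2-column min-max-scaled array:

Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=10)
plt.show()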
import pandas as pd
import plotly.express as px

# df_g7 is assumed to hold FRED series-search results from an earlier step;
# it is reassigned below with the fetched data.
g7_list = df_g7['series_id'].tolist()
start_date = '1970-01-01'
end_date = '2021-10-01'
df_g7 = get_fred_data(series_list=g7_list,
                      start_date=start_date,
                      end_date=end_date)
# Reshape from one column per series to long format for plotting
df_g7_melt = pd.melt(df_g7, id_vars=['DATE'], value_vars=g7_list, var_name='SERIES_ID', value_name='VALUE')

# Map each series ID to a country name by matching the embedded country code
countries = {'US': 'United States', 'GB': 'United Kingdom', 'FR': 'France', 'CA': 'Canada', 'DE': 'Germany', 'JP': 'Japan', 'IT': 'Italy'}

def check_country(x):
    for country in countries:
        if country.lower() in x.lower():
            return countries[country]
    return ''

df_g7_melt['COUNTRY'] = df_g7_melt['SERIES_ID'].map(check_country)

fig = px.line(df_g7_melt, x="DATE", y="VALUE", color='COUNTRY', title='Real Residential Property Prices - All G7 Countries')
fig.update_layout(title=dict(x=0.5, y=0.95))
fig.show()
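get_fred_data is not a standard library call; a hypothetical stand-in built on pandas_datareader's FRED reader (the function name and DATE column are assumed from the snippet above):

from pandas_datareader import data as pdr

def get_fred_data(series_list, start_date, end_date):
    # One column per series, indexed by a DatetimeIndex named DATE
    df = pdr.DataReader(series_list, 'fred', start_date, end_date)
    return df.reset_index()  # promote the index to a 'DATE' column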