Skip to content

Instantly share code, notes, and snippets.

View susanli2016's full-sized avatar
💃
<script>Nice meet you here</script>

Susan Li susanli2016

💃
<script>Nice meet you here</script>
View GitHub Profile
# Give every distinct customer a dense, zero-based int32 index (in order of
# first appearance) and store it next to the raw CustomerID.
unique_customers = grouped_df['CustomerID'].unique()
customer_ids = {
    customer: index
    for customer, index in zip(
        unique_customers,
        np.arange(len(unique_customers), dtype=np.int32),
    )
}
grouped_df['customer_id'] = grouped_df['CustomerID'].apply(
    lambda customer: customer_ids[customer]
)

# Same treatment for the items, keyed by StockCode.
unique_items = grouped_df['StockCode'].unique()
item_ids = {
    item: index
    for item, index in zip(
        unique_items,
        np.arange(len(unique_items), dtype=np.int32),
    )
}
grouped_df['item_id'] = grouped_df['StockCode'].apply(
    lambda item: item_ids[item]
)

# Item-by-customer matrix: rows are item indices, columns are customer
# indices, and the stored values are the purchase quantities as floats.
purchase_quantities = grouped_df['Quantity'].astype(float)
item_rows = grouped_df['item_id']
customer_cols = grouped_df['customer_id']
sparse_item_customer = sparse.csr_matrix(
    (purchase_quantities, (item_rows, customer_cols))
)
# Quick summary of the interaction table: distinct customers and items, plus
# the mean (truncated to an int), minimum and maximum of Quantity.
n_customers = grouped_df['CustomerID'].nunique()
n_items = grouped_df['StockCode'].nunique()
quantity = grouped_df['Quantity']
mean_quantity = int(quantity.mean())
min_quantity = quantity.min()
max_quantity = quantity.max()

print(f'Number of unique customers: {n_customers}')
print(f'Number of unique items: {n_items}')
print(f'Average purchase quantity per interaction: {mean_quantity}')
print(f'Minimum purchase quantity per interaction: {min_quantity}')
print(f'Maximum purchase quantity per interaction: {max_quantity}')
import plotly.express as px
# Histogram of the Quantity column of grouped_df, split into up to 500 bins.
fig = px.histogram(
    grouped_df,
    x='Quantity',
    nbins=500,
    title='Distribution of the purchase quantity',
)
fig.show()
# Drop every row that has no CustomerID.
retail_df = retail_df[retail_df['CustomerID'].notna()]

# Sum Quantity for each (CustomerID, StockCode, Description) combination and
# flatten the group keys back into ordinary columns.
grouped_df = (
    retail_df[['CustomerID', 'StockCode', 'Description', 'Quantity']]
    .groupby(['CustomerID', 'StockCode', 'Description'])
    .sum()
    .reset_index()
)

# Combinations whose quantities sum to exactly zero are bumped to 1 so that
# they pass the "> 0" filter below; negative totals are discarded.
grouped_df.loc[grouped_df['Quantity'] == 0, ['Quantity']] = 1

# Take an explicit copy of the filtered rows: without it pandas treats the
# result as a slice of the unfiltered frame and emits SettingWithCopyWarning
# as soon as new columns (e.g. customer_id / item_id) are assigned to it.
grouped_df = grouped_df.loc[grouped_df['Quantity'] > 0].copy()
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
import implicit
from prince import MCA
# Columns of df_clean fed into the multiple correspondence analysis.
# NOTE(review): presumably all of these are categorical - confirm upstream.
mca_columns = [
    'vote16', 'vote', 'confecon', 'self_ideology', 'immignum',
    'health_care', 'worry_covid', 'worry_covid_economy', 'partyID',
    'sex', 'science', 'universal_income', 'freecol',
]

# Keep only rows that have a value in every selected column.
df_cat = df_clean[mca_columns].dropna()

# Fit a two-component MCA on the complete rows.
mca = MCA(n_components=2).fit(df_cat)

# Column coordinates, ordered by their position on the first component
# (column label 0).
column_coords = mca.column_coordinates(df_cat)
column_coords.sort_values(0)
from prince import PCA
# Every column from 'fttrump1' through 'ftdemocraticparty' (label slices
# include both endpoints), restricted to rows with no missing values.
df_ft = df_clean.loc[:, 'fttrump1':'ftdemocraticparty'].dropna()

# Two-component PCA with the inputs centred and scaled to unit variance.
pca_options = {
    'n_components': 2,
    'rescale_with_mean': True,
    'rescale_with_std': True,
}
pca = PCA(**pca_options).fit(df_ft)

# Correlation of each original column with the two components, relabelled
# to dim1 / dim2.
loadings = pca.column_correlations(df_ft)
loadings.columns = ['dim1', 'dim2']
# Chi-square test on the favor_both x self_ideology contingency table.
favor_by_ideology = pd.crosstab(df_clean['favor_both'], df_clean['self_ideology'])
stats.chi2_contingency(favor_by_ideology.values)

# The same cross-tabulation as column percentages, rounded to two decimals.
column_shares = pd.crosstab(
    df_clean['favor_both'],
    df_clean['self_ideology'],
    normalize='columns',
)
(column_shares * 100).round(2)
# One-way ANOVA comparing non-missing ages across three partyID groups.
# NOTE(review): 'independent' is lowercase while the other two labels are
# capitalised - confirm it matches the values actually stored in partyID.
party_filters = (
    "partyID=='Democrat'",
    "partyID=='Republican'",
    "partyID=='independent'",
)
age_by_party = [
    df_clean.query(party_filter).age.dropna()
    for party_filter in party_filters
]
stats.f_oneway(*age_by_party)