Mark Edmondson MarkEdmondson1234

## japan_pyramid.R
library(idbr) # devtools::install_github('walkerke/idbr')
library(ggplot2)
library(animation)
library(dplyr)
library(ggthemes)

idb_api_key("Your Census API key goes here")

male <- idb1('JA', 2010:2050, sex = 'male') %>%
  mutate(POP = POP * -1,

## costdata.gs
function uploadData() {
  var accountId = "xxxxxxxx";
  var webPropertyId = "UA-xxxxxxxx-x";
  var customDataSourceId = "xxxxxxxx";
  var ss = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  var maxRows = ss.getLastRow();
  var maxColumns = ss.getLastColumn();
  var data = [];
  for (var i = 1; i < maxRows;i++) {
    data.push(ss.getRange([i], 1,1, maxColumns).getValues());

## animate.R
library(ggraph)
library(gganimate)
library(igraph)
# Data from http://konect.uni-koblenz.de/networks/sociopatterns-infectious
# Read the space-separated edge list, skipping the first two lines of the file.
infect <- read.table(
  "out.sociopatterns-infectious",
  skip = 2,
  sep = " ",
  stringsAsFactors = FALSE
)
# Drop the third column and label the remaining ones.
infect[["V3"]] <- NULL
colnames(infect) <- c("from", "to", "time")
# Cut the time range into 100 intervals and store each row's interval index.
infect$timebins <- as.numeric(cut(infect$time, breaks = 100))

# We want that nice fading effect so we need to add extra data for the trailing

## heatmap-clusters.R
## Attach each row's cluster assignment (k$cluster) to the clustered data.
## NOTE(review): `k` is presumably a kmeans() fit on k_data -- confirm.
kResults <- data.frame(k_data, cluster = k$cluster)

## Transform data for columns of cluster, rows of Sku, where each value is the
## mean of that Sku column over the rows assigned to the cluster.
## The cluster ids are taken from the assignments themselves instead of a
## hard-coded 1:4, so this keeps working if the clustering uses a different k.
cluster_ids <- sort(unique(kResults$cluster))
rl <- as.data.frame(lapply(cluster_ids, function(x) {
  # drop = FALSE keeps a data frame even when only one Sku column remains.
  in_cluster <- kResults[kResults$cluster == x,
                         setdiff(names(kResults), 'cluster'),
                         drop = FALSE]
  colMeans(in_cluster)
}))
names(rl) <- paste("cluster", cluster_ids)

## how-many-clusters.R
# Determine number of clusters
## Within-groups sum of squares for 1 to 15 clusters.
## k = 1 is computed directly from the column variances of `comp`;
## k = 2..15 each come from a separate kmeans() run.
wss <- numeric(15)
wss[1] <- (nrow(comp) - 1) * sum(apply(comp, 2, var))
for (i in 2:15) {
  wss[i] <- sum(kmeans(comp, centers = i)$withinss)
}
plot(
  1:15, wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)


# From the scree plot, the elbow occurs at around k = 4-6

## ml_pca.R
## Finding number of components
## Fit a PCA on model_data and plot the component variances as a line chart.
pc <- princomp(model_data)
plot(pc, type="l")

# Look for the number of components that covers ~85% of the variance.
# summary() reports the variance per component; loadings() shows how each
# original variable contributes to each component.
summary(pc)
loadings(pc)

# Run the more convenient PCA (prcomp) needed for k-means.
# NOTE(review): this overwrites `pc` and is fitted on k_data rather than
# model_data -- confirm the two inputs are meant to differ.
pc <- prcomp(k_data)

## ml_plot_randomForests.R
## function to get plot data format
getCompareTable <- function (test_data, prediction) {
  require(dplyr)

  ## plot real vs model bought Sku
  actual_freq <- table(model_data$boughtSku)
  predicted_freq <- table(prediction)

  actual_freq <- actual_freq[order(actual_freq)]
  predicted_freq <- predicted_freq[order(predicted_freq)]

## ml_model_assess_randomForest.R
library(randomForest)
## Fit the random forest on `predictors` / `response`.
## Warning: this can take a long time (~30 mins).
rf <- randomForest(x = predictors, y = response)

## Once the model is fitted, run it on the test data and compare with reality.
## Test predictors: every column of `test` except dimension1 and boughtSku.
predictor_test <- test[, !(names(test) %in% c("dimension1", "boughtSku"))]
## Test response: the boughtSku column as a factor.
response_test <- as.factor(test[, "boughtSku"])

## Predictions for the test set.
prediction <- predict(rf, predictor_test)

## ml_trans_data.R
## Target shape: 30049 x 187
## userId, product1_view, product2_view, ...., productN_view, productBought
## One row per dimension1, one column per productSku/variable pair, summed.
pv <- reshape2::recast(
  product_views,
  dimension1 ~ productSku + variable,
  fun.aggregate = sum
)

library(dplyr)
## Keep only the Sku and user id columns of the transactions.
## If a user buys more than once, the row will be duplicated.
pt <- select(product_trans, productSku, dimension1)

## ml_get_data.R
library(googleAnalyticsR_public)

## Authenticate with new_user switched on.
## Written as TRUE rather than T: T is an ordinary variable that can be
## reassigned, TRUE is a reserved constant.
## NOTE(review): new_user = TRUE presumably forces a fresh login -- confirm
## against the gar_auth() documentation.
gar_auth(new_user = TRUE)

## your profile view Id
id <- "XXXXXX"

## 61607 results
## 30049 unique Ids
## 185 Sku's
	library(idbr) # devtools::install_github('walkerke/idbr')
	library(ggplot2)
	library(animation)
	library(dplyr)
	library(ggthemes)

	idb_api_key("Your Census API key goes here")

	male <- idb1('JA', 2010:2050, sex = 'male') %>%
	mutate(POP = POP * -1,
	function uploadData() {
	var accountId = "xxxxxxxx";
	var webPropertyId = "UA-xxxxxxxx-x";
	var customDataSourceId = "xxxxxxxx";
	var ss = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
	var maxRows = ss.getLastRow();
	var maxColumns = ss.getLastColumn();
	var data = [];
	for (var i = 1; i < maxRows;i++) {
	data.push(ss.getRange([i], 1,1, maxColumns).getValues());
	library(ggraph)
	library(gganimate)
	library(igraph)
	# Data from http://konect.uni-koblenz.de/networks/sociopatterns-infectious
	# Read the space-separated edge list, skipping the first two lines of the file.
	infect <- read.table(
	  "out.sociopatterns-infectious",
	  skip = 2,
	  sep = " ",
	  stringsAsFactors = FALSE
	)
	# Drop the third column and label the remaining ones.
	infect[["V3"]] <- NULL
	colnames(infect) <- c("from", "to", "time")
	# Cut the time range into 100 intervals and store each row's interval index.
	infect$timebins <- as.numeric(cut(infect$time, breaks = 100))

	# We want that nice fading effect so we need to add extra data for the trailing
	## Attach each row's cluster assignment (k$cluster) to the clustered data.
	## NOTE(review): `k` is presumably a kmeans() fit on k_data -- confirm.
	kResults <- data.frame(k_data, cluster = k$cluster)

	## Transform data for columns of cluster, rows of Sku, where each value is the
	## mean of that Sku column over the rows assigned to the cluster.
	## The cluster ids are taken from the assignments themselves instead of a
	## hard-coded 1:4, so this keeps working if the clustering uses a different k.
	cluster_ids <- sort(unique(kResults$cluster))
	rl <- as.data.frame(lapply(cluster_ids, function(x) {
	  # drop = FALSE keeps a data frame even when only one Sku column remains.
	  in_cluster <- kResults[kResults$cluster == x,
	                         setdiff(names(kResults), 'cluster'),
	                         drop = FALSE]
	  colMeans(in_cluster)
	}))
	names(rl) <- paste("cluster", cluster_ids)
	# Determine number of clusters
	## Within-groups sum of squares for 1 to 15 clusters.
	## k = 1 is computed directly from the column variances of `comp`;
	## k = 2..15 each come from a separate kmeans() run.
	wss <- numeric(15)
	wss[1] <- (nrow(comp) - 1) * sum(apply(comp, 2, var))
	for (i in 2:15) {
	  wss[i] <- sum(kmeans(comp, centers = i)$withinss)
	}
	plot(
	  1:15, wss,
	  type = "b",
	  xlab = "Number of Clusters",
	  ylab = "Within groups sum of squares"
	)


	# From the scree plot, the elbow occurs at around k = 4-6
	## Finding number of components
	## Fit a PCA on model_data and plot the component variances as a line chart.
	pc <- princomp(model_data)
	plot(pc, type="l")

	# Look for the number of components that covers ~85% of the variance.
	# summary() reports the variance per component; loadings() shows how each
	# original variable contributes to each component.
	summary(pc)
	loadings(pc)

	# Run the more convenient PCA (prcomp) needed for k-means.
	# NOTE(review): this overwrites `pc` and is fitted on k_data rather than
	# model_data -- confirm the two inputs are meant to differ.
	pc <- prcomp(k_data)
	## function to get plot data format
	getCompareTable <- function (test_data, prediction) {
	require(dplyr)

	## plot real vs model bought Sku
	actual_freq <- table(model_data$boughtSku)
	predicted_freq <- table(prediction)

	actual_freq <- actual_freq[order(actual_freq)]
	predicted_freq <- predicted_freq[order(predicted_freq)]
	library(randomForest)
	## Fit the random forest on `predictors` / `response`.
	## Warning: this can take a long time (~30 mins).
	rf <- randomForest(x = predictors, y = response)

	## Once the model is fitted, run it on the test data and compare with reality.
	## Test predictors: every column of `test` except dimension1 and boughtSku.
	predictor_test <- test[, !(names(test) %in% c("dimension1", "boughtSku"))]
	## Test response: the boughtSku column as a factor.
	response_test <- as.factor(test[, "boughtSku"])

	## Predictions for the test set.
	prediction <- predict(rf, predictor_test)
	## Target shape: 30049 x 187
	## userId, product1_view, product2_view, ...., productN_view, productBought
	## One row per dimension1, one column per productSku/variable pair, summed.
	pv <- reshape2::recast(
	  product_views,
	  dimension1 ~ productSku + variable,
	  fun.aggregate = sum
	)

	library(dplyr)
	## Keep only the Sku and user id columns of the transactions.
	## If a user buys more than once, the row will be duplicated.
	pt <- select(product_trans, productSku, dimension1)
	library(googleAnalyticsR_public)

	## Authenticate with new_user switched on.
	## Written as TRUE rather than T: T is an ordinary variable that can be
	## reassigned, TRUE is a reserved constant.
	## NOTE(review): new_user = TRUE presumably forces a fresh login -- confirm
	## against the gar_auth() documentation.
	gar_auth(new_user = TRUE)

	## your profile view Id
	id <- "XXXXXX"

	## 61607 results
	## 30049 unique Ids
	## 185 Sku's