DataScienceMom DSMom

## RunTimeDifference
> n_simu <- 10000
> ptc <- proc.time()
> MC_pullAll <- get_sample(n_simu, no_sample = n_simu)
> proc.time() - ptc
   user  system elapsed
  0.065   0.004   0.072

> ptc <- proc.time()
> MC_pullOne <- get_sample(n_simu, no_sample = 1)
> proc.time() - ptc

## MC Oil in Place
#################
# MC_OilInPlace.R
# Monte Carlo simulation for oil in place
#
# http://petrowiki.org/Monte_Carlo_simulation
# http://www.statvision.com/webinars/Monte%20Carlo%20Simulation.pdf
#
# Author: Yang Cong
# Created Date: 7/7/2018
# Modified Date: 7/8/2018

## Potential
# Item whose co-purchase pattern is summarised below.
targetItem <- "PINK REGENCY TEACUP AND SAUCER"

# CustomerIDs taken from the rows whose Description equals the target item.
buyerIds <- subdata$CustomerID[subdata$Description == targetItem]
# AlsoPurchased values of the TblRule rows whose Purchased equals the target.
relatedItems <- TblRule$AlsoPurchased[TblRule$Purchased == targetItem]

# One row per distinct (CustomerID, Description) pair for those customers,
# restricted to the target item and the related items.
dfPre <- unique(
  subdata[subdata$CustomerID %in% buyerIds, c("CustomerID", "Description")]
)
dfPre <- dfPre[dfPre$Description %in% c(targetItem, relatedItems), ]
dfPre$Purchased <- 1

# Wide layout: one column per Description, 1 where the pair exists, 0 otherwise.
dfSummary <- spread(dfPre, key = Description, value = Purchased)
dfSummary[is.na(dfSummary)] <- 0

## MBA
#### MBA analysis ####
library(arules)
library(tidyr)
library(dplyr)

# Keep only the rows of `data` whose CustomerID appears in the RFM table.
subdata <- data[data$CustomerID %in% RFM$CustomerID, ]

# Build the (InvoiceNo, Description) table with whitespace-trimmed item names.
trandata <- subdata[, c("InvoiceNo", "Description")]
trandata$Description <- trimws(trandata$Description, which = "both")
# Drop the 'Manual' entries. `!=` returns NA for a missing Description and an
# NA row index would insert an all-NA row, so missing values are tested
# explicitly and dropped as well.
trandata <- trandata[
  !is.na(trandata$Description) & trandata$Description != "Manual",
]
# Keep one row per distinct (InvoiceNo, Description) pair.
trandata <- trandata[!duplicated(trandata), ]

## RFM
#### RFM analysis #####
library(dplyr)
# filter to only United Kingdom
data <- data[data$Country == "United Kingdom", ]
# PxQ: Quantity multiplied by UnitPrice for each row.
data$PxQ <- data$Quantity * data$UnitPrice
# Recency: days between each row's invoice date and the latest invoice date.
# The dates are converted before taking the maximum; max() on the raw column
# would compare character timestamps as text rather than chronologically.
# NOTE(review): as.Date() without a format assumes InvoiceDate is stored in a
# form it recognises (e.g. yyyy-mm-dd) -- confirm against the input file.
data$Recency <- difftime(
  max(as.Date(data$InvoiceDate), na.rm = TRUE),
  as.Date(data$InvoiceDate),
  units = "days"
)

RFMresult <- data %>%
  select(InvoiceNo, CustomerID, Recency, PxQ) %>%
  group_by(CustomerID) %>%

## data preparation
#### data preparation ####
# read the raw records, keeping text columns as character vectors
data <- read.csv("data.csv", stringsAsFactors = FALSE)
# remove incomplete records; the Description column is ignored by this check
QC_logic <- complete.cases(data[, names(data) != "Description"])
data <- data[QC_logic, ]
# remove duplicated records (the first occurrence of each row is kept)
data <- data[!duplicated(data), ]
# remove returned records: InvoiceNo containing the letter "c" in either case
data <- data[!grepl("c", data$InvoiceNo, ignore.case = TRUE), ]

## sampling
# Compare the distribution obtained by drawing one sample at a time with the
# one obtained by drawing multiple samples in a single call.
seed <- 790256
n_sample <- 10000

# 2 x 2 grid of plots
par(mfrow = c(2, 2))

# normal distribution #
set.seed(seed)
# standard normal draws (rnorm defaults: mean = 0, sd = 1), all in one call
output_AllAtOnce <- rnorm(n = n_sample)
hist(
  x = output_AllAtOnce,
  main = "Normal Distribution \n (draw multiple samples at a time)",
  freq = TRUE
)
	> n_simu <- 10000
	> ptc <- proc.time()
	> MC_pullAll <- get_sample(n_simu, no_sample = n_simu)
	> proc.time() - ptc
	user system elapsed
	0.065 0.004 0.072

	> ptc <- proc.time()
	> MC_pullOne <- get_sample(n_simu, no_sample = 1)
	> proc.time() - ptc
	#################
	# MC_OilInPlace.R
	# Monte Carlo simulation for oil in place
	#
	# http://petrowiki.org/Monte_Carlo_simulation
	# http://www.statvision.com/webinars/Monte%20Carlo%20Simulation.pdf
	#
	# Author: Yang Cong
	# Created Date: 7/7/2018
	# Modified Date: 7/8/2018
	# Item whose co-purchase pattern is summarised below.
	targetItem <- "PINK REGENCY TEACUP AND SAUCER"

	# CustomerIDs taken from the rows whose Description equals the target item.
	buyerIds <- subdata$CustomerID[subdata$Description == targetItem]
	# AlsoPurchased values of the TblRule rows whose Purchased equals the target.
	relatedItems <- TblRule$AlsoPurchased[TblRule$Purchased == targetItem]

	# One row per distinct (CustomerID, Description) pair for those customers,
	# restricted to the target item and the related items.
	dfPre <- unique(
	  subdata[subdata$CustomerID %in% buyerIds, c("CustomerID", "Description")]
	)
	dfPre <- dfPre[dfPre$Description %in% c(targetItem, relatedItems), ]
	dfPre$Purchased <- 1

	# Wide layout: one column per Description, 1 where the pair exists, 0 otherwise.
	dfSummary <- spread(dfPre, key = Description, value = Purchased)
	dfSummary[is.na(dfSummary)] <- 0
	#### MBA analysis ####
	library(arules)
	library(tidyr)
	library(dplyr)

	# Keep only the rows of `data` whose CustomerID appears in the RFM table.
	subdata <- data[data$CustomerID %in% RFM$CustomerID, ]

	# Build the (InvoiceNo, Description) table with whitespace-trimmed item names.
	trandata <- subdata[, c("InvoiceNo", "Description")]
	trandata$Description <- trimws(trandata$Description, which = "both")
	# Drop the 'Manual' entries. `!=` returns NA for a missing Description and an
	# NA row index would insert an all-NA row, so missing values are tested
	# explicitly and dropped as well.
	trandata <- trandata[
	  !is.na(trandata$Description) & trandata$Description != "Manual",
	]
	# Keep one row per distinct (InvoiceNo, Description) pair.
	trandata <- trandata[!duplicated(trandata), ]
	#### RFM analysis #####
	library(dplyr)
	# filter to only United Kingdom
	data <- data[data$Country == "United Kingdom", ]
	# PxQ: Quantity multiplied by UnitPrice for each row.
	data$PxQ <- data$Quantity * data$UnitPrice
	# Recency: days between each row's invoice date and the latest invoice date.
	# The dates are converted before taking the maximum; max() on the raw column
	# would compare character timestamps as text rather than chronologically.
	# NOTE(review): as.Date() without a format assumes InvoiceDate is stored in a
	# form it recognises (e.g. yyyy-mm-dd) -- confirm against the input file.
	data$Recency <- difftime(
	  max(as.Date(data$InvoiceDate), na.rm = TRUE),
	  as.Date(data$InvoiceDate),
	  units = "days"
	)

	RFMresult <- data %>%
	select(InvoiceNo, CustomerID, Recency, PxQ) %>%
	group_by(CustomerID) %>%
	#### data preparation ####
	# read the raw records, keeping text columns as character vectors
	data <- read.csv("data.csv", stringsAsFactors = FALSE)
	# remove incomplete records; the Description column is ignored by this check
	QC_logic <- complete.cases(data[, names(data) != "Description"])
	data <- data[QC_logic, ]
	# remove duplicated records (the first occurrence of each row is kept)
	data <- data[!duplicated(data), ]
	# remove returned records: InvoiceNo containing the letter "c" in either case
	data <- data[!grepl("c", data$InvoiceNo, ignore.case = TRUE), ]
	# Compare the distribution obtained by drawing one sample at a time with the
	# one obtained by drawing multiple samples in a single call.
	seed <- 790256
	n_sample <- 10000

	# 2 x 2 grid of plots
	par(mfrow = c(2, 2))

	# normal distribution #
	set.seed(seed)
	# standard normal draws (rnorm defaults: mean = 0, sd = 1), all in one call
	output_AllAtOnce <- rnorm(n = n_sample)
	hist(
	  x = output_AllAtOnce,
	  main = "Normal Distribution \n (draw multiple samples at a time)",
	  freq = TRUE
	)