# hillarysanders/hackbright_play_with_data

## hackbright_play_with_data
######################################################################
# hackbright stuffs
# Premise
######################################################################
# Author: Hillary Sanders
######################################################################
######################################################################

# get data here: https://data.premise.com/


# There are a few main types of objects in R:
# vectors (e.g. c(1,2,3)), matrices, lists, and dataframes.
# Dataframes are very common, and are mostly what we're going to
# be playing with below. A dataframe is like a matrix, in that
# it has two dimensions, but each column can hold its own type
# of value: e.g. numeric or character.

########################################################
########################################################
# super basic intro to R
# Two small vectors: one numeric, one character.
x <- c(1, 2, 3)
y <- c('a', 'b', 'c')

# Bind them column-wise into a data frame and look at the result.
df <- data.frame(x = x, y = y)
print(df)
class(df)
class(df[, 1])
# On R >= 4.0.0 the next line reports "character"; older versions converted
# strings to factors by default, which is presumably why the explicit
# as.character() conversion is shown right after it.
class(df[, 2])
class(as.character(df[, 2]))

# Several ways to pull values out of a data frame:
df[1, 1]     # single cell: row 1, column 1
df[, 1]      # whole first column, by position
df[, 'y']    # whole column, by name
df$y         # same column via `$`
df$x[1]      # first element of column x

# A "." inside a name is just another character -- it is not member access
# or a method call as in many other languages. e.g.:
i.am.an.object <- seq_len(10)
print(i.am.an.object)
sum(i.am.an.object)

# a function is written like this
foo <- function(x = 4) {
  # c() coerces x to character so it can sit next to 'bar'; print() shows
  # the vector and hands it back (invisibly) as the function's value.
  labelled <- c('bar', x)
  print(labelled)
}
foo()          # uses the default, x = 4
foo(x = 1000)  # overrides the default

# plotting in R is fun:
# Large translucent rainbow discs along a cubic curve, with empty axis labels.
plot(x = 1:100, y = (1:100)^3,
     pch = 19, cex = 50,
     col = rainbow(100, alpha = .1),
     xlab = '', ylab = '')
# Overlay the mirrored curve with shrinking discs. The size vector has 80
# values for 100 points, so it is recycled for the last 20 points.
points(x = 1:100, y = (100:1)^3,
       pch = 19, cex = seq(50, 1, length.out = 80),
       col = rainbow(100, alpha = .05))
# Caption placed at (50, 75^3) in plot coordinates.
text(x = 50, y = 75^3, labels = 'This can get addictive', cex = 3)
########################################################
########################################################

################################
# Size quantity normalization:

# NOTE(review): setwd() ties this script to one machine's directory layout.
# The relative paths below (and possibly env.R itself) depend on it, so it is
# left in place -- point it at wherever the project lives.
setwd('~/Desktop/PREMISE/')
# env.R presumably defines the project helpers used later in this script
# (plot.effect2, cluster.metadata, ...) -- TODO confirm.
source('Hillary_Premise/utils/env.R')

# read in some data:
br <- read.csv('Hillary_Premise/data/br_raw.csv')
# Keep only the 'Cream' rows. which() drops rows whose spec_name is NA; a bare
# logical index would instead return an all-NA row for each of them, which
# would then poison later summaries such as sd().
spec.df <- br[which(br$spec_name == 'Cream'), ]


# What do the prices look like? Histogram of the raw prices for this spec,
# titled with the spec name.
hist(spec.df$price,
     breaks = 50,
     col = '#3090aa40',
     xlab = '',
     main = spec.df$spec_name[1],
     cex.main = 4)


# Everything past this red line is likely an outlier; dealt with later.
# v = x position of a vertical line, col = colour, lwd = line thickness,
# lty = line type (2 is dashed).
abline(v = 6, col = 'red', lwd = 2, lty = 2)

# Frequency tables for some metadata columns:
table(as.character(spec.df$size))
table(as.character(spec.df$size_unit))
table(as.character(spec.df$quantity))

# Paste size and size_unit into one label, keep it as a column, and
# tabulate it:
size_and_unit <- paste(spec.df$size, spec.df$size_unit, sep = ' ')
spec.df$size_and_unit <- size_and_unit
table(size_and_unit)

# Bar chart of the label counts, sorted in increasing order:
barplot(sort(table(size_and_unit)),
        main = spec.df$spec_name[1],
        cex.main = 4,
        cex.names = .6,
        col = rainbow(10, .5, .8))

# plot.effect2() is a function the author wrote (might go up as a gist).
# NOTE(review): presumably it plots how price varies across the levels of
# `varname` and returns a summary -- confirm against its definition.
sizes <- plot.effect2(spec.df, varname = 'size_and_unit')

# placename inspection
spec.df <- cluster.metadata(spec.df, var = 'placename')
places <- plot.effect2(spec.df, varname = 'placename')

# product name inspection
spec.df <- cluster.metadata(spec.df, var = 'product_name')
products <- plot.effect2(spec.df, varname = 'product_name', cex.labels = .8)


# Standardize units. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned, whereas TRUE is a reserved word.
spec.df.standardized <- unit.standardizer(spec.df, verbose = TRUE)
table(spec.df.standardized$quantity)
table(spec.df.standardized$size_unit)

# bi-modal
hist(spec.df$price, breaks = 50, col = '#4090aa')
# unimodal - better!
hist(spec.df.standardized$price, breaks = 50, col = '#4090aa')

# was standard deviation decreased?
print(sd(spec.df$price))
print(sd(spec.df.standardized$price))
# yes! That's good.

################################################################
################################################################
# outlier detection:
# a simple way to clean out crazay observations

# Drop outlying observations from the standardized data, then look at the
# cleaned price distribution.
spec.df.clean <- trimmed.normal(spec.df.standardized)
hist(spec.df.clean$price, breaks = 50, col = '#40aa6080', xlab = 'price',
     main = spec.df$spec_name[1])

# make and plot the time series:
# NOTE(review): `ts` and `plot.ts` share names with stats::ts and
# stats::plot.ts; these calls presumably reach project versions loaded by
# env.R -- confirm.
# FALSE is spelled out because `F` is a plain variable that can be rebound.
ts <- get.simple.ts(spec.df.clean, demean = FALSE, obs.window.median = 10)
plot.ts(ts, spec.df.clean, ylim = c(1, 3))

# Now plot it with the median prices plotted instead: the price column is
# overwritten with the output of get.window.median() before redrawing.
spec.df.clean$price <- get.window.median(spec.df.clean$price, n = 10)
plot.ts(ts, spec.df.clean, ylim = c(1, 3))


################################################################
################################################################
# here's an example of what you can eventually build up to:
# a food staples index for an entire country.
uuids <- read.csv('~/Desktop/uuids.csv')
# TRUE/FALSE are spelled out: `T` and `F` are plain variables that can be
# rebound, while TRUE and FALSE are reserved words.
inflation <- get.offline.cpi(all.data = br, max = 2, min = .5, uuids = uuids,
                             min.obs.smooth = 100, carry.forward = FALSE,
                             clean = TRUE, trim1 = .1, trim2 = .05, z1 = 5, z2 = 4,
                             obs.window.median = 3,
                             country = 'br')
# NOTE(review): objects.to.global.env() presumably copies the elements of
# `inflation` into the global environment, which the argument-less plotting
# calls below then rely on -- confirm.
objects.to.global.env(inflation)
plot.offline.cpi(plot.elements = TRUE, zoom = .2)
plot.offline.cpi(plot.elements = TRUE, zoom = 4)
# dashed black horizontal reference line at y = 100
abline(h = 100, col = 'black', lty = 2)
################################################################
################################################################


################################################################
################################################################
# premise places - kmeans algorithm

usa.places <- read.csv('~/Desktop/premise_places/us_places_extra_refined3.csv')
head(usa.places)
# Keep rows where center1 equals '1'. which() discards NA comparisons, which
# a bare logical index would turn into all-NA rows.
foobar <- usa.places[which(usa.places$center1 == '1'), ]
hist(foobar$loc_accuracy, breaks = 90, col = 'cornflowerblue')

# TRUE/FALSE are spelled out because `T`/`F` are rebindable variables.
geo <- cluster.by.geo(foobar,
                      guess.k.by.placenames = FALSE,
                      placename.tune = TRUE,
                      mainplaces.proportion = .5,
                      get.k.by = 'user',
                      zoom = .1,
                      loc.accuracy.cutoff = 1500)
################################################################
################################################################
	######################################################################
	# hackbright stuffs
	# Premise
	######################################################################
	# Author: Hillary Sanders
	######################################################################
	######################################################################

	# get data here: https://data.premise.com/


	# There are a few main types of objects in R:
	# vectors (e.g. c(1,2,3)), matrices, lists, and dataframes.
	# Dataframes are very common, and are mostly what we're going to
	# be playing with below. A dataframe is like a matrix, in that
	# it has two dimensions, but each column can hold its own type
	# of value: e.g. numeric or character.

	########################################################
	########################################################
	# super basic intro to R
	# Two small vectors: one numeric, one character.
	x <- c(1, 2, 3)
	y <- c('a', 'b', 'c')

	# Bind them column-wise into a data frame and look at the result.
	df <- data.frame(x = x, y = y)
	print(df)
	class(df)
	class(df[, 1])
	# On R >= 4.0.0 the next line reports "character"; older versions converted
	# strings to factors by default, which is presumably why the explicit
	# as.character() conversion is shown right after it.
	class(df[, 2])
	class(as.character(df[, 2]))

	# Several ways to pull values out of a data frame:
	df[1, 1]     # single cell: row 1, column 1
	df[, 1]      # whole first column, by position
	df[, 'y']    # whole column, by name
	df$y         # same column via `$`
	df$x[1]      # first element of column x

	# A "." inside a name is just another character -- it is not member access
	# or a method call as in many other languages. e.g.:
	i.am.an.object <- seq_len(10)
	print(i.am.an.object)
	sum(i.am.an.object)

	# a function is written like this
	foo <- function(x = 4) {
	  # c() coerces x to character so it can sit next to 'bar'; print() shows
	  # the vector and hands it back (invisibly) as the function's value.
	  labelled <- c('bar', x)
	  print(labelled)
	}
	foo()          # uses the default, x = 4
	foo(x = 1000)  # overrides the default

	# plotting in R is fun:
	# Large translucent rainbow discs along a cubic curve, with empty axis labels.
	plot(x = 1:100, y = (1:100)^3,
	     pch = 19, cex = 50,
	     col = rainbow(100, alpha = .1),
	     xlab = '', ylab = '')
	# Overlay the mirrored curve with shrinking discs. The size vector has 80
	# values for 100 points, so it is recycled for the last 20 points.
	points(x = 1:100, y = (100:1)^3,
	       pch = 19, cex = seq(50, 1, length.out = 80),
	       col = rainbow(100, alpha = .05))
	# Caption placed at (50, 75^3) in plot coordinates.
	text(x = 50, y = 75^3, labels = 'This can get addictive', cex = 3)
	########################################################
	########################################################

	################################
	# Size quantity normalization:

	# NOTE(review): setwd() ties this script to one machine's directory layout.
	# The relative paths below (and possibly env.R itself) depend on it, so it is
	# left in place -- point it at wherever the project lives.
	setwd('~/Desktop/PREMISE/')
	# env.R presumably defines the project helpers used later in this script
	# (plot.effect2, cluster.metadata, ...) -- TODO confirm.
	source('Hillary_Premise/utils/env.R')

	# read in some data:
	br <- read.csv('Hillary_Premise/data/br_raw.csv')
	# Keep only the 'Cream' rows. which() drops rows whose spec_name is NA; a bare
	# logical index would instead return an all-NA row for each of them, which
	# would then poison later summaries such as sd().
	spec.df <- br[which(br$spec_name == 'Cream'), ]


	# What do the prices look like? Histogram of the raw prices for this spec,
	# titled with the spec name.
	hist(spec.df$price,
	     breaks = 50,
	     col = '#3090aa40',
	     xlab = '',
	     main = spec.df$spec_name[1],
	     cex.main = 4)


	# Everything past this red line is likely an outlier; dealt with later.
	# v = x position of a vertical line, col = colour, lwd = line thickness,
	# lty = line type (2 is dashed).
	abline(v = 6, col = 'red', lwd = 2, lty = 2)

	# Frequency tables for some metadata columns:
	table(as.character(spec.df$size))
	table(as.character(spec.df$size_unit))
	table(as.character(spec.df$quantity))

	# Paste size and size_unit into one label, keep it as a column, and
	# tabulate it:
	size_and_unit <- paste(spec.df$size, spec.df$size_unit, sep = ' ')
	spec.df$size_and_unit <- size_and_unit
	table(size_and_unit)

	# Bar chart of the label counts, sorted in increasing order:
	barplot(sort(table(size_and_unit)),
	        main = spec.df$spec_name[1],
	        cex.main = 4,
	        cex.names = .6,
	        col = rainbow(10, .5, .8))

	# plot.effect2() is a function the author wrote (might go up as a gist).
	# NOTE(review): presumably it plots how price varies across the levels of
	# `varname` and returns a summary -- confirm against its definition.
	sizes <- plot.effect2(spec.df, varname = 'size_and_unit')

	# placename inspection
	spec.df <- cluster.metadata(spec.df, var = 'placename')
	places <- plot.effect2(spec.df, varname = 'placename')

	# product name inspection
	spec.df <- cluster.metadata(spec.df, var = 'product_name')
	products <- plot.effect2(spec.df, varname = 'product_name', cex.labels = .8)


	# Standardize units. TRUE is spelled out because `T` is an ordinary variable
	# that can be reassigned, whereas TRUE is a reserved word.
	spec.df.standardized <- unit.standardizer(spec.df, verbose = TRUE)
	table(spec.df.standardized$quantity)
	table(spec.df.standardized$size_unit)

	# bi-modal
	hist(spec.df$price, breaks = 50, col = '#4090aa')
	# unimodal - better!
	hist(spec.df.standardized$price, breaks = 50, col = '#4090aa')

	# was standard deviation decreased?
	print(sd(spec.df$price))
	print(sd(spec.df.standardized$price))
	# yes! That's good.

	################################################################
	################################################################
	# outlier detection:
	# a simple way to clean out crazay observations

	# Drop outlying observations from the standardized data, then look at the
	# cleaned price distribution.
	spec.df.clean <- trimmed.normal(spec.df.standardized)
	hist(spec.df.clean$price, breaks = 50, col = '#40aa6080', xlab = 'price',
	     main = spec.df$spec_name[1])

	# make and plot the time series:
	# NOTE(review): `ts` and `plot.ts` share names with stats::ts and
	# stats::plot.ts; these calls presumably reach project versions loaded by
	# env.R -- confirm.
	# FALSE is spelled out because `F` is a plain variable that can be rebound.
	ts <- get.simple.ts(spec.df.clean, demean = FALSE, obs.window.median = 10)
	plot.ts(ts, spec.df.clean, ylim = c(1, 3))

	# Now plot it with the median prices plotted instead: the price column is
	# overwritten with the output of get.window.median() before redrawing.
	spec.df.clean$price <- get.window.median(spec.df.clean$price, n = 10)
	plot.ts(ts, spec.df.clean, ylim = c(1, 3))


	################################################################
	################################################################
	# here's an example of what you can eventually build up to:
	# a food staples index for an entire country.
	uuids <- read.csv('~/Desktop/uuids.csv')
	# TRUE/FALSE are spelled out: `T` and `F` are plain variables that can be
	# rebound, while TRUE and FALSE are reserved words.
	inflation <- get.offline.cpi(all.data = br, max = 2, min = .5, uuids = uuids,
	                             min.obs.smooth = 100, carry.forward = FALSE,
	                             clean = TRUE, trim1 = .1, trim2 = .05, z1 = 5, z2 = 4,
	                             obs.window.median = 3,
	                             country = 'br')
	# NOTE(review): objects.to.global.env() presumably copies the elements of
	# `inflation` into the global environment, which the argument-less plotting
	# calls below then rely on -- confirm.
	objects.to.global.env(inflation)
	plot.offline.cpi(plot.elements = TRUE, zoom = .2)
	plot.offline.cpi(plot.elements = TRUE, zoom = 4)
	# dashed black horizontal reference line at y = 100
	abline(h = 100, col = 'black', lty = 2)
	################################################################
	################################################################


	################################################################
	################################################################
	# premise places - kmeans algorithm

	usa.places <- read.csv('~/Desktop/premise_places/us_places_extra_refined3.csv')
	head(usa.places)
	# Keep rows where center1 equals '1'. which() discards NA comparisons, which
	# a bare logical index would turn into all-NA rows.
	foobar <- usa.places[which(usa.places$center1 == '1'), ]
	hist(foobar$loc_accuracy, breaks = 90, col = 'cornflowerblue')

	# TRUE/FALSE are spelled out because `T`/`F` are rebindable variables.
	geo <- cluster.by.geo(foobar,
	                      guess.k.by.placenames = FALSE,
	                      placename.tune = TRUE,
	                      mainplaces.proportion = .5,
	                      get.k.by = 'user',
	                      zoom = .1,
	                      loc.accuracy.cutoff = 1500)
	################################################################
	################################################################