Skip to content

Instantly share code, notes, and snippets.

@hillarysanders
Last active December 28, 2015 03:59
Show Gist options
  • Save hillarysanders/7439589 to your computer and use it in GitHub Desktop.
Save hillarysanders/7439589 to your computer and use it in GitHub Desktop.
hackbright - play with some data
######################################################################
# hackbright stuffs
# Premise
######################################################################
# Author: Hillary Sanders
######################################################################
######################################################################
# get data here: https://data.premise.com/
# There are a few main types of objects in R:
# vectors (e.g. c(1,2,3)), matrices, lists, and dataframes.
# dataframes are very common, and are mostly what we're going to
# be playing with below. A dataframe is like a matrix, in that
# it has two dimensions, but each column can be a specific type
# of value: e.g. numeric or character.
########################################################
########################################################
# super basic intro to R
# Two small vectors of different types: one numeric, one character.
x <- c(1, 2, 3)
y <- c("a", "b", "c")
# Bind them column-wise into a data frame; the columns are named "x" and "y".
df <- data.frame(x = x, y = y)
print(df)
# The data frame and each of its columns have their own class.
class(df)
class(df[, 1])
class(df[, 2])
class(as.character(df[, 2]))
# There are several ways to pull values out of a data frame:
df[1, 1]    # one cell, by row and column position
df[, 1]     # the whole first column, by position
df[, "y"]   # a whole column, by name
df$y        # the same column, via `$`
df$x[1]     # the first element of column x
# A period "." inside an R name is just another character of the identifier:
# seeing one does not mean a function or member is being accessed
# (S3 method names such as print.foo are the one convention built on dots).
# e.g.:
i.am.an.object <- seq_len(10)
print(i.am.an.object)
sum(i.am.an.object)
# A function is created with `function` and bound to a name.
# foo takes one argument, x (default 4), prints the vector c("bar", x) and
# hands back whatever print() returns.
foo <- function(x = 4) {
  bar_and_x <- c("bar", x)
  print(bar_and_x)
}
# Call it with the default, then with an explicit value.
foo()
foo(x = 1000)
# plotting in R is fun:
# 100 very large (cex = 50), almost transparent rainbow-coloured discs along
# y = x^3, with the axis labels blanked out.
plot(1:100, (1:100)^3, col = rainbow(100, alpha = .1), pch = 19, cex = 50,
     xlab = "", ylab = "")
# The mirrored curve on top, even more transparent, with shrinking discs.
# `length.out` is spelled out in full instead of relying on partial matching
# of `length`. The size vector has 80 values for 100 points, so it is recycled
# for the last 20 points (kept as in the original picture).
points(1:100, (100:1)^3, col = rainbow(100, alpha = .05), pch = 19,
       cex = seq(50, 1, length.out = 80))
# Label drawn at (50, 75^3); every argument is named so nothing depends on
# positional matching around x and y.
text(x = 50, y = 75^3, labels = "This can get addictive", cex = 3)
########################################################
########################################################
################################
# Size quantity normalization:
# NOTE(review): setwd() changes the working directory for the rest of the
# session. It is kept because the relative paths below (and possibly env.R
# itself, which is not visible here) are resolved against it.
setwd('~/Desktop/PREMISE/')
# env.R presumably defines the project helpers used further down
# (plot.effect2, cluster.metadata, unit.standardizer, ...) -- TODO confirm.
source('Hillary_Premise/utils/env.R')
# read in some data:
br <- read.csv('Hillary_Premise/data/br_raw.csv')
# Keep only the 'Cream' rows. `%in%` never yields NA, so rows whose spec_name
# is missing are dropped; with `==` inside `[` they would come back as
# all-NA rows and pollute every later summary of spec.df.
spec.df <- br[br$spec_name %in% 'Cream', ]
# Distribution of the raw prices, titled with the spec name of the first row.
hist(
  spec.df$price,
  breaks = 50,
  col = "#3090aa40",
  xlab = "",
  main = spec.df$spec_name[1],
  cex.main = 4
)
# Red reference line at price = 6: everything to its right is likely an
# outlier and is dealt with later.
# v = vertical line position, lty = line type (2 = dotted),
# lwd = line thickness, col = color.
abline(v = 6, lwd = 2, lty = 2, col = "red")
# Frequency tables of the raw metadata columns (coerced to character first).
table(as.character(spec.df$size))
table(as.character(spec.df$size_unit))
table(as.character(spec.df$quantity))
# Glue size and unit into one label per row, keep it both as a standalone
# vector and as a new column of spec.df, then tabulate it.
size_and_unit <- paste(spec.df$size, spec.df$size_unit)
spec.df$size_and_unit <- size_and_unit
table(size_and_unit)
# Bar chart of the label counts, sorted ascending, titled with the spec name.
barplot(
  sort(table(size_and_unit)),
  main = spec.df$spec_name[1],
  cex.main = 4,
  cex.names = .6,
  col = rainbow(10, s = .5, v = .8)
)
# plot.effect2 is one of the author's own helpers (it may be published as a
# gist for anyone who wants to look at it). It is not defined in this file;
# presumably env.R loads it -- TODO confirm.
sizes <- plot.effect2(spec.df, varname = "size_and_unit")

# Place names: run cluster.metadata on the column, then inspect it.
# cluster.metadata is a project helper too; what it changes in spec.df is not
# visible from here.
spec.df <- cluster.metadata(spec.df, var = "placename")
places <- plot.effect2(spec.df, varname = "placename")

# Product names: same two steps, with smaller labels on the plot.
spec.df <- cluster.metadata(spec.df, var = "product_name")
products <- plot.effect2(spec.df, varname = "product_name", cex.labels = .8)
# Standardize the units of every observation. unit.standardizer is a project
# helper (not defined in this file). TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
spec.df.standardized <- unit.standardizer(spec.df, verbose = TRUE)
# What do quantity and size_unit look like after standardization?
table(spec.df.standardized$quantity)
table(spec.df.standardized$size_unit)
# Raw prices: bi-modal
hist(spec.df$price, breaks = 50, col = '#4090aa')
# Standardized prices: unimodal - better!
hist(spec.df.standardized$price, breaks = 50, col = '#4090aa')
# was standard deviation decreased?
print(sd(spec.df$price))
print(sd(spec.df.standardized$price))
# yes! That's good.
################################################################
################################################################
# outlier detection:
# a simple way to clean out extreme observations. trimmed.normal is a project
# helper; its exact trimming rule is not visible in this file.
spec.df.clean <- trimmed.normal(spec.df.standardized)
hist(spec.df.clean$price, breaks = 50, col = '#40aa6080', xlab = 'price',
     main = spec.df$spec_name[1])
# make and plot the time series:
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
# NOTE(review): `ts` and `plot.ts` share their names with stats::ts and
# stats::plot.ts; presumably the project's env.R supplies its own plot.ts
# that accepts (series, data) -- confirm.
ts <- get.simple.ts(spec.df.clean, demean = FALSE, obs.window.median = 10)
plot.ts(ts, spec.df.clean, ylim = c(1, 3))
# Now plot it with the median prices plotted instead: the price column is
# overwritten with the output of get.window.median (n = 10) and the same
# series is plotted again.
spec.df.clean$price <- get.window.median(spec.df.clean$price, n = 10)
plot.ts(ts, spec.df.clean, ylim = c(1, 3))
################################################################
################################################################
# here's an example of what you can eventually build up to:
# a food staples index for an entire country.
uuids <- read.csv('~/Desktop/uuids.csv')
# get.offline.cpi is a project helper (not defined in this file); the tuning
# arguments are passed through exactly as before. TRUE/FALSE are spelled out
# because `T` and `F` are ordinary variables that can be reassigned.
inflation <- get.offline.cpi(
  all.data = br, max = 2, min = .5, uuids = uuids,
  min.obs.smooth = 100, carry.forward = FALSE,
  clean = TRUE, trim1 = .1, trim2 = .05, z1 = 5, z2 = 4,
  obs.window.median = 3,
  country = 'br'
)
# Presumably copies the elements of `inflation` into the global environment,
# where plot.offline.cpi (called without a data argument) picks them up
# -- TODO confirm against the helper's definition.
objects.to.global.env(inflation)
# The same plot at two zoom settings.
plot.offline.cpi(plot.elements = TRUE, zoom = .2)
plot.offline.cpi(plot.elements = TRUE, zoom = 4)
# Dotted black horizontal reference line at 100 on the most recent plot.
abline(h = 100, col = 'black', lty = 2)
################################################################
################################################################
################################################################
################################################################
# premise places - kmeans algorithm
usa.places <- read.csv('~/Desktop/premise_places/us_places_extra_refined3.csv')
head(usa.places)
# Keep the rows whose center1 equals '1'. `%in%` never yields NA, so rows with
# a missing center1 are dropped; with `==` inside `[` they would come back as
# all-NA rows.
foobar <- usa.places[usa.places$center1 %in% '1', ]
hist(foobar$loc_accuracy, breaks = 90, col = 'cornflowerblue')
# cluster.by.geo is a project helper (not defined in this file). TRUE/FALSE
# are spelled out because `T` and `F` are ordinary variables that can be
# reassigned.
geo <- cluster.by.geo(foobar, guess.k.by.placenames = FALSE,
                      placename.tune = TRUE, mainplaces.proportion = .5,
                      get.k.by = 'user', zoom = .1, loc.accuracy.cutoff = 1500)
################################################################
################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment