library(forecast)
# mdeaths: Monthly Deaths from Lung Diseases in the UK
fit <- auto.arima(mdeaths)
# customize your confidence intervals
forecast(fit, level=c(80, 95, 99), h=3)
#          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95    Lo 99    Hi 99
# Jan 1980       1822.863 1564.192 2081.534 1427.259 2218.467 1302.952 2342.774
# Feb 1980       1923.190 1635.530 2210.851 1483.251 2363.130 1345.012 2501.368
# Mar 1980       1789.153 1495.048 2083.258 1339.359 2238.947 1198.023 2380.283
library(qcc)
# a series of values with a mean of 10 and a little random noise added in
x <- rep(10, 100) + rnorm(100)
# a test series with a mean of 11
new.x <- rep(11, 15) + rnorm(15)
# qcc will flag the new points
qcc(x, newdata=new.x, type="xbar.one")
library(reshape2)
# generate a unique id for each row; this lets us go back to wide format later
iris$id <- 1:nrow(iris)
iris.lng <- melt(iris, id=c("id", "Species"))
head(iris.lng)
#   id Species     variable value
# 1  1  setosa Sepal.Length   5.1
# 2  2  setosa Sepal.Length   4.9
# back to wide format: one column per variable again
iris.wide <- dcast(iris.lng, id + Species ~ variable)
head(iris.wide)
library(randomForest)
# download the Titanic survivors data
data <- read.table("http://math.ucdenver.edu/RTutorial/titanic.txt", header=TRUE, sep="\t")
# recode Survived as a yes/no factor
data$Survived <- as.factor(ifelse(data$Survived==1, "yes", "no"))
# split into training and test sets
idx <- runif(nrow(data)) <= .75
data.train <- data[idx, ]
data.test <- data[!idx, ]
# fit a forest on the training set; PClass, Age, and Sex are columns in this file
rf <- randomForest(Survived ~ PClass + Age + Sex, data=data.train, na.action=na.omit)
# predict on the held-out rows (dropping ones with missing values)
predict(rf, na.omit(data.test))
# dbdriver_example.R (@glamp): example of R database packages
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
db <- dbConnect(drv, dbname="ncaa",
                user="YOUR USER NAME", password="YOUR PASSWORD")
q <- "SELECT * FROM game_scores;"
# run the query and pull the result set into a data frame
game.scores <- dbGetQuery(db, q)
# close the connection when finished
dbDisconnect(db)
# pandas_plyr.py (@glamp): plyr-style split-apply-combine in pandas
import numpy as np
import pandas as pd
import pylab as pl
baseball = pd.read_csv("http://bit.ly/144sh7t")
# group by year and get a summary of each numeric column
baseball.groupby(["year"]).describe()
# for each year, get the mean of each column
baseball.groupby(["year"]).aggregate(np.mean)
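# groupby objects also have an agg method that takes a dict mapping column
# names to functions; a minimal sketch, assuming the csv above has 'hr'
# (home runs) and 'g' (games) columns:
baseball.groupby("year").agg({"hr": "sum", "g": "mean"}).head()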
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['id'] = np.arange(len(df))
# pd.Factor is long gone; Categorical.from_codes is the current equivalent
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df_lng = pd.melt(df, id_vars=['id', 'species'])
df_lng.head()
df_lng.tail()
# back to wide: pivot on the unique id to recover one row per flower
df_wide = df_lng.pivot(index='id', columns='variable', values='value')
df_wide.head()
import pandas as pd
import numpy as np
from datetime import datetime
# generate some fake tick data with 1 million observations
n = 1000000
df = pd.DataFrame({
    "timestamp": [datetime.now() for t in range(n)],
    "value": np.random.uniform(-1, 1, n)
})
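# a minimal sketch of summarizing the ticks: with the timestamps as the index,
# resample can bucket the values (here, a per-second mean)
ticks = df.set_index("timestamp")
ticks["value"].resample("1s").mean()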
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# apply
cols = df.columns[0:4]
# divide each numeric column by 2
df[cols].apply(lambda x: x / 2).head()
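# apply also works row-wise with axis=1; for example, the spread between the
# largest and smallest of the four measurements for each flower
df[cols].apply(lambda row: row.max() - row.min(), axis=1).head()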
from dateutil.parser import parse
import pandas as pd
# monthly slaughter records since 1921
df = pd.read_csv("http://bit.ly/119792b")
# parse the data (we could also use pd.to_datetime)
df.date = df.date.apply(parse)
# sort the data frame by date (DataFrame.sort was removed; use sort_values)
df = df.sort_values('date')
# create a DatetimeIndex from the date column
df = df.set_index('date')
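# with a DatetimeIndex you can slice rows by partial date strings; a sketch,
# assuming the series covers 1950 (the records start in 1921)
df.loc["1950"].head()
df.loc["1950-01":"1950-06"]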