library(forecast)
# mdeaths: Monthly Deaths from Lung Diseases in the UK
fit <- auto.arima(mdeaths)
# customize your confidence intervals
forecast(fit, level=c(80, 95, 99), h=3)
#          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95    Lo 99    Hi 99
# Jan 1980       1822.863 1564.192 2081.534 1427.259 2218.467 1302.952 2342.774
# Feb 1980       1923.190 1635.530 2210.851 1483.251 2363.130 1345.012 2501.368
# Mar 1980       1789.153 1495.048 2083.258 1339.359 2238.947 1198.023 2380.283
library(qcc)
# a series of values with a mean of 10 and a little random noise added in
x <- rep(10, 100) + rnorm(100)
# a test series with a mean of 11
new.x <- rep(11, 15) + rnorm(15)
# qcc will flag the new points
qcc(x, newdata=new.x, type="xbar.one")
library(reshape2)
# generate a unique id for each row; this lets us go back to wide format later
iris$id <- 1:nrow(iris)
iris.lng <- melt(iris, id=c("id", "Species"))
head(iris.lng)
#   id Species     variable value
# 1  1  setosa Sepal.Length   5.1
# 2  2  setosa Sepal.Length   4.9
# back to wide format: one column per variable again
iris.wide <- dcast(iris.lng, id + Species ~ variable)
head(iris.wide)
library(randomForest)
# download the Titanic survivors data
data <- read.table("http://math.ucdenver.edu/RTutorial/titanic.txt", header=TRUE, sep="\t")
# recode Survived as a yes/no factor
data$Survived <- as.factor(ifelse(data$Survived==1, "yes", "no"))
# split into training and test sets
idx <- runif(nrow(data)) <= .75
data.train <- data[idx, ]
data.test <- data[!idx, ]
# fit a forest on the training set; PClass, Age, and Sex are columns in this file
rf <- randomForest(Survived ~ PClass + Age + Sex, data=data.train, na.action=na.omit)
# predict on the held-out rows (dropping ones with missing values)
predict(rf, na.omit(data.test))
# dbdriver_example.R (@glamp): example of R database packages
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
db <- dbConnect(drv, dbname="ncaa",
                user="YOUR USER NAME", password="YOUR PASSWORD")
q <- "SELECT * FROM game_scores;"
# run the query and pull the result set into a data frame
game.scores <- dbGetQuery(db, q)
# close the connection when finished
dbDisconnect(db)
# pandas_plyr.py (@glamp): plyr-style split-apply-combine in pandas
import numpy as np
import pandas as pd
import pylab as pl
baseball = pd.read_csv("http://bit.ly/144sh7t")
# group by year and get a summary of each numeric column
baseball.groupby(["year"]).describe()
# for each year, get the mean of each column
baseball.groupby(["year"]).aggregate(np.mean)
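# groupby objects also have an agg method that takes a dict mapping column
# names to functions; a minimal sketch, assuming the csv above has 'hr'
# (home runs) and 'g' (games) columns:
baseball.groupby("year").agg({"hr": "sum", "g": "mean"}).head()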
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['id'] = np.arange(len(df))
# pd.Factor is long gone; Categorical.from_codes is the current equivalent
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
df_lng = pd.melt(df, id_vars=['id', 'species'])
df_lng.head()
df_lng.tail()
# back to wide: pivot on the unique id to recover one row per flower
df_wide = df_lng.pivot(index='id', columns='variable', values='value')
df_wide.head()
import pandas as pd
import numpy as np
from datetime import datetime
# generate some fake tick data with 1 million observations
n = 1000000
df = pd.DataFrame({
    "timestamp": [datetime.now() for t in range(n)],
    "value": np.random.uniform(-1, 1, n)
})
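# a minimal sketch of summarizing the ticks: with the timestamps as the index,
# resample can bucket the values (here, a per-second mean)
ticks = df.set_index("timestamp")
ticks["value"].resample("1s").mean()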
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# apply
cols = df.columns[0:4]
# divide each numeric column by 2
df[cols].apply(lambda x: x / 2).head()
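# apply also works row-wise with axis=1; for example, the spread between the
# largest and smallest of the four measurements for each flower
df[cols].apply(lambda row: row.max() - row.min(), axis=1).head()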
from dateutil.parser import parse
import pandas as pd
# monthly slaughter records since 1921
df = pd.read_csv("http://bit.ly/119792b")
# parse the data (we could also use pd.to_datetime)
df.date = df.date.apply(parse)
# sort the data frame by date (DataFrame.sort was removed; use sort_values)
df = df.sort_values('date')
# create a DatetimeIndex from the date column
df = df.set_index('date')
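# with a DatetimeIndex you can slice rows by partial date strings; a sketch,
# assuming the series covers 1950 (the records start in 1921)
df.loc["1950"].head()
df.loc["1950-01":"1950-06"]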