Zachary Jones zmjones

## decay.R
decay <- function(yvar, d) {
  # yvar: a binary variable
  # d: number of periods
  # returns: the cumulative sum of yvar at each point
  # where each addition to the count decays away after d periods
  yvar[is.na(yvar)] <- 0
  run <- cumsum(yvar)
  tvar <- seq_along(yvar)
  run <- 0
  sum <-  0

## cv.R
require(parallel)

# Repeated k-fold cross-validation.
#   df:        data frame to resample
#   folds:     number of folds per resample
#   resamples: number of times the fold assignment is redrawn
#   model:     function(train_df) returning a fitted object
#   loss:      function(fit, test_df) returning the loss on the held-out fold
#   cores:     number of cores handed to mclapply() via mc.cores
# returns: a list with one element per resample, each itself a list with one
#   loss() result per fold
validate.cv <- function(df, folds, resamples, model, loss, cores) {
  mclapply(seq_len(resamples), function(x) {
    # Draw a fresh random fold label for every row. Note that this `folds`
    # column is part of the subsets handed to model() and loss().
    df$folds <- sample(rep(seq_len(folds), length.out = nrow(df)))
    lapply(seq_len(folds), function(test) {
      fit <- model(df[df$folds != test, ])
      loss(fit, df[df$folds == test, ])
    })
  }, mc.cores = cores)  # use the `cores` argument, not an undefined global
}

## pss2013.py
import requests
from bs4 import BeautifulSoup


def find_links(url, find_file=False):
    soup = BeautifulSoup(requests.get(url).content)
    try:
        links = soup.find('div', id='content-core').find_all('a')
        if find_file:
            links = [link['href'] + '/' + link.contents[2].strip()

## futurama.R
require(ggplot2)
require(plyr)

# Load the episode table with every column kept as character.
df <- read.csv(file = "futurama.csv", colClasses = "character")

# Break each episode code at the literal "." and store the pieces as the
# season and episode columns (assumes each code yields exactly two pieces).
df[, c("season", "episode")] <- ldply(
  strsplit(as.character(df$episode), split = ".", fixed = TRUE)
)

# For every season, collect the row names in ascending numeric episode order.
ind <- by(df, list(df$season), function(season_rows) {
  rownames(season_rows[order(as.integer(season_rows$episode)), ])
})

# Turn title into a factor whose levels follow that season/episode ordering.
df$title <- factor(
  df$title,
  levels = df$title[as.integer(unlist(ind))]
)

## maddison-new.R
pkgs <- c("reshape2", "gdata", "countrycode")
invisible(lapply(pkgs, require, character.only = TRUE))

# Fetch the workbook straight from its URL; skip and check.names are passed
# through to read.xls() unchanged.
mpd <- read.xls(
  "http://www.ggdc.net/maddison/maddison-project/data/mpd_2013-01.xlsx",
  skip = 2,
  check.names = FALSE
)

# Keep only the columns that hold at least one non-missing value.
mpd <- mpd[, apply(mpd, 2, function(column) any(!is.na(column)))]

# Two headers are set by position.
colnames(mpd)[1] <- "year"
colnames(mpd)[124] <- "Byzantium_Ottoman_Empire_Turkey"

# Strip leading digits, dots, parentheses, "&amp;" and apostrophes, then trim
# surrounding whitespace.
colnames(mpd) <- gsub("^[0-9]+|\\.|\\(|\\)|(&amp;)|'", "", colnames(mpd))
colnames(mpd) <- trim(colnames(mpd))

# Collapse "-" or "/" followed by whitespace into a single "-".
colnames(mpd) <- gsub("-\\s+|/\\s+", "-", colnames(mpd))

## parses3logs.py
import csv
import os
import re
import dateutil
import pandas as pd
from urlparse import urlparse

# Location of the logs to parse; empty here (presumably filled in before
# running -- TODO confirm).
log_path = ''
# parsing code: http://ferrouswheel.me/2010/01/python_tparse-fields-in-s3-logs/
# Starts out empty; the code that fills it is not visible in this excerpt.
log_entries = []

## partial_dependence_party.R
pkgs <- c("party", "parallel")
invisible(lapply(pkgs, require, character.only = TRUE))

partial_dependence <- function(fit, ivar, cores = 1, ...) {
    ## calculates the partial dependence of the response on explanatory variable(s)
    ## fit must be a party object
    ## ivar must be a character vector of length >= 1 all of which
    ## exist in the dataframe used to fit the model
    ## if the length of ivar > 1, joint dependence is calculated
    df <- data.frame(get("input", fit@data@env), get("response", fit@data@env))

## example.R
library(mlr) ## head
library(shiny)

# Generate learning-curve data for two learners (given by id) on sonar.task,
# at training percentages 0.2, 0.4, ..., 1, using a 5-iteration "Subsample"
# resampling description and four measures.
r <- generateLearningCurve(
  list("classif.rpart", "classif.knn"),
  task = sonar.task,
  percs = seq(0.2, 1, by = 0.2),
  measures = list(tpr, fpr, fn, fp),
  resampling = makeResampleDesc(method = "Subsample", iters = 5),
  show.info = FALSE
)

# Plot the result with the interactive flag switched on.
plotLearningCurve(r, interactive = TRUE)

## example.R
library(mlr)
library(checkmate)

# Regression example: train "regr.rpart" on bh.task, generate partial
# prediction data for lstat and chas, and plot it faceted by chas.
fr <- train("regr.rpart", bh.task)
dr <- generatePartialPredictionData(
  fr,
  getTaskData(bh.task),
  c("lstat", "chas")
)
plotPartialPrediction(dr, facet = "chas")

# Classification example: train "classif.rpart" on iris.task and generate
# partial prediction data for the two petal features, passing a function
# that returns the relative frequency of each value in its input.
fc <- train("classif.rpart", iris.task)
dc <- generatePartialPredictionData(
  fc,
  getTaskData(iris.task),
  c("Petal.Width", "Petal.Length"),
  function(x) table(x) / length(x)
)

## example.R
# Two learners, both configured with predict.type = "prob".
lrns <- list(
  makeLearner("classif.rpart", predict.type = "prob"),
  makeLearner("classif.nnet", predict.type = "prob")
)

# Train each learner on iris.task and predict on the same task.
fit <- lapply(lrns, function(lrn) train(lrn, task = iris.task))
pred <- lapply(fit, function(mod) predict(mod, task = iris.task))

# Label the predictions by learner, then build and plot the calibration data.
pred <- setNames(pred, c("rpart", "nnet"))
out <- generateCalibrationData(pred)
plotCalibration(out)

# Same train/predict steps again, this time on sonar.task.
fit <- lapply(lrns, function(lrn) train(lrn, task = sonar.task))
pred <- lapply(fit, function(mod) predict(mod, task = sonar.task))
	decay <- function(yvar, d) {
	# yvar: a binary variable
	# d: number of periods
	# returns: the cumulative sum of yvar at each point
	# where each addition to the count decays away after d periods
	yvar[is.na(yvar)] <- 0
	run <- cumsum(yvar)
	tvar <- seq_along(yvar)
	run <- 0
	sum <- 0
	require(parallel)

	# Repeated k-fold cross-validation.
	#   df:        data frame to resample
	#   folds:     number of folds per resample
	#   resamples: number of times the fold assignment is redrawn
	#   model:     function(train_df) returning a fitted object
	#   loss:      function(fit, test_df) returning the loss on the held-out fold
	#   cores:     number of cores handed to mclapply() via mc.cores
	# returns: a list with one element per resample, each itself a list with one
	#   loss() result per fold
	validate.cv <- function(df, folds, resamples, model, loss, cores) {
	  mclapply(seq_len(resamples), function(x) {
	    # Draw a fresh random fold label for every row. Note that this `folds`
	    # column is part of the subsets handed to model() and loss().
	    df$folds <- sample(rep(seq_len(folds), length.out = nrow(df)))
	    lapply(seq_len(folds), function(test) {
	      fit <- model(df[df$folds != test, ])
	      loss(fit, df[df$folds == test, ])
	    })
	  }, mc.cores = cores)  # use the `cores` argument, not an undefined global
	}
	import requests
	from bs4 import BeautifulSoup


	def find_links(url, find_file=False):
	soup = BeautifulSoup(requests.get(url).content)
	try:
	links = soup.find('div', id='content-core').find_all('a')
	if find_file:
	links = [link['href'] + '/' + link.contents[2].strip()
	require(ggplot2)
	require(plyr)

	# Load the episode table with every column kept as character.
	df <- read.csv(file = "futurama.csv", colClasses = "character")

	# Break each episode code at the literal "." and store the pieces as the
	# season and episode columns (assumes each code yields exactly two pieces).
	df[, c("season", "episode")] <- ldply(
	  strsplit(as.character(df$episode), split = ".", fixed = TRUE)
	)

	# For every season, collect the row names in ascending numeric episode order.
	ind <- by(df, list(df$season), function(season_rows) {
	  rownames(season_rows[order(as.integer(season_rows$episode)), ])
	})

	# Turn title into a factor whose levels follow that season/episode ordering.
	df$title <- factor(
	  df$title,
	  levels = df$title[as.integer(unlist(ind))]
	)
	pkgs <- c("reshape2", "gdata", "countrycode")
	invisible(lapply(pkgs, require, character.only = TRUE))

	# Fetch the workbook straight from its URL; skip and check.names are passed
	# through to read.xls() unchanged.
	mpd <- read.xls("http://www.ggdc.net/maddison/maddison-project/data/mpd_2013-01.xlsx",
	                skip = 2, check.names = FALSE)

	# Drop columns that contain nothing but NA.
	mpd <- mpd[, !apply(mpd, 2, function(x) all(is.na(x)))]

	# Two headers are set by position.
	names(mpd)[1] <- "year"
	names(mpd)[124] <- "Byzantium_Ottoman_Empire_Turkey"

	# Strip leading digits, dots, parentheses, "&" and apostrophes, then trim
	# surrounding whitespace. The alternation bars must be plain "|": "\|" is
	# not a valid escape inside an R string literal and stops the file parsing.
	colnames(mpd) <- trim(gsub("^[0-9]+|\\.|\\(|\\)|(&)|'", "", colnames(mpd)))

	# Collapse "-" or "/" followed by whitespace into a single "-".
	colnames(mpd) <- gsub("-\\s+|/\\s+", "-", colnames(mpd))
	import csv
	import os
	import re
	import dateutil
	import pandas as pd
	from urlparse import urlparse

	# Location of the logs to parse; empty here (presumably filled in before
	# running -- TODO confirm).
	log_path = ''
	# parsing code: http://ferrouswheel.me/2010/01/python_tparse-fields-in-s3-logs/
	# Starts out empty; the code that fills it is not visible in this excerpt.
	log_entries = []
	pkgs <- c("party", "parallel")
	invisible(lapply(pkgs, require, character.only = TRUE))

	partial_dependence <- function(fit, ivar, cores = 1, ...) {
	## calculates the partial dependence of the response on explanatory variable(s)
	## fit must be a party object
	## ivar must be a character vector of length >= 1 all of which
	## exist in the dataframe used to fit the model
	## if the length of ivar > 1, joint dependence is calculated
	df <- data.frame(get("input", fit@data@env), get("response", fit@data@env))
	library(mlr) ## head
	library(shiny)

	# Generate learning-curve data for two learners (given by id) on sonar.task,
	# at training percentages 0.2, 0.4, ..., 1, using a 5-iteration "Subsample"
	# resampling description and four measures.
	r <- generateLearningCurve(
	  list("classif.rpart", "classif.knn"),
	  task = sonar.task,
	  percs = seq(0.2, 1, by = 0.2),
	  measures = list(tpr, fpr, fn, fp),
	  resampling = makeResampleDesc(method = "Subsample", iters = 5),
	  show.info = FALSE
	)

	# Plot the result with the interactive flag switched on.
	plotLearningCurve(r, interactive = TRUE)
	library(mlr)
	library(checkmate)

	# Regression example: train "regr.rpart" on bh.task, generate partial
	# prediction data for lstat and chas, and plot it faceted by chas.
	fr <- train("regr.rpart", bh.task)
	dr <- generatePartialPredictionData(
	  fr,
	  getTaskData(bh.task),
	  c("lstat", "chas")
	)
	plotPartialPrediction(dr, facet = "chas")

	# Classification example: train "classif.rpart" on iris.task and generate
	# partial prediction data for the two petal features, passing a function
	# that returns the relative frequency of each value in its input.
	fc <- train("classif.rpart", iris.task)
	dc <- generatePartialPredictionData(
	  fc,
	  getTaskData(iris.task),
	  c("Petal.Width", "Petal.Length"),
	  function(x) table(x) / length(x)
	)
	# Two learners, both configured with predict.type = "prob".
	lrns <- list(
	  makeLearner("classif.rpart", predict.type = "prob"),
	  makeLearner("classif.nnet", predict.type = "prob")
	)

	# Train each learner on iris.task and predict on the same task.
	fit <- lapply(lrns, function(lrn) train(lrn, task = iris.task))
	pred <- lapply(fit, function(mod) predict(mod, task = iris.task))

	# Label the predictions by learner, then build and plot the calibration data.
	pred <- setNames(pred, c("rpart", "nnet"))
	out <- generateCalibrationData(pred)
	plotCalibration(out)

	# Same train/predict steps again, this time on sonar.task.
	fit <- lapply(lrns, function(lrn) train(lrn, task = sonar.task))
	pred <- lapply(fit, function(mod) predict(mod, task = sonar.task))