Skip to content

Instantly share code, notes, and snippets.

@zmjones
zmjones / decay.R
Last active October 12, 2015 00:27
time since event and decaying cumulative sums for binary time series
decay <- function(yvar, d) {
# yvar: a binary variable
# d: number of periods
# returns: the cumulative sum of yvar at each point
# where each addition to the count decays away after d periods
yvar[is.na(yvar)] <- 0
run <- cumsum(yvar)
tvar <- seq_along(yvar)
run <- 0
sum <- 0
@zmjones
zmjones / cv.R
Last active December 31, 2015 06:09
k-fold cross-validation using generic fitting and loss functions
require(parallel)
validate.cv <- function(df, folds, resamples, model, loss, cores) {
  # repeated k-fold cross-validation with user-supplied fitting and loss functions
  # df: a data frame; a column named `folds` is added to (or overwritten in)
  #     the local copy, so `model` and `loss` both see that column
  # folds: number of folds
  # resamples: number of times the random fold assignment is redrawn
  # model: function called with the training rows, returns a fitted object
  # loss: function called as loss(fit, test_rows)
  # cores: passed to mclapply as mc.cores
  # returns: a list of length `resamples`; each element is a list of length
  #          `folds` holding the value of `loss` for that held-out fold
  parallel::mclapply(seq_len(resamples), function(x) {
    # balanced fold labels, shuffled anew for every resample
    df$folds <- sample(rep(seq_len(folds), length.out = nrow(df)))
    lapply(seq_len(folds), function(test) {
      fit <- model(df[df$folds != test, ])
      loss(fit, df[df$folds == test, ])
    })
  }, mc.cores = cores) # use the `cores` argument, not an undefined global
}
@zmjones
zmjones / pss2013.py
Last active January 2, 2016 06:08
scrape all of the Peace Science presentations from the 2013 meeting
import requests
from bs4 import BeautifulSoup
def find_links(url, find_file=False):
soup = BeautifulSoup(requests.get(url).content)
try:
links = soup.find('div', id='content-core').find_all('a')
if find_file:
links = [link['href'] + '/' + link.contents[2].strip()
@zmjones
zmjones / futurama.R
Last active January 2, 2016 10:18
scrapes and visualizes ratings for Futurama from IMDB
require(ggplot2)
require(plyr)
# Load the ratings data, reading every column as character.
df <- read.csv("futurama.csv", colClasses = "character")
# Split each `episode` value on a literal "." and store the two pieces as the
# `season` and `episode` columns.
# NOTE(review): assumes every value has the form "<season>.<episode>" -- confirm against futurama.csv.
df[, c("season", "episode")] <- ldply(strsplit(as.character(df$episode), ".", fixed = TRUE))
# For each season, sort the rows by integer episode number and return their
# row names.
ind <- by(df, list(df$season), function(x) {
x <- x[order(as.integer(x$episode)), ]
row.names(x)
})
# Turn `title` into a factor whose levels follow the order collected in `ind`.
# The row names are converted to integers and used as positions into df$title,
# which presumes the default 1..n row names -- TODO confirm.
df$title <- factor(df$title, levels = df$title[as.integer(unlist(ind))])
@zmjones
zmjones / maddison-new.R
Last active January 2, 2016 20:29
reshape the Maddison historical GDP and population data
pkgs <- c("reshape2", "gdata", "countrycode")
invisible(lapply(pkgs, require, character.only = TRUE))
# Read the Maddison Project workbook straight from its URL. Column names are
# kept as they appear in the sheet (check.names = FALSE); skip = 2 is forwarded
# to read.xls, presumably to skip two leading header lines -- TODO confirm.
mpd <- read.xls("http://www.ggdc.net/maddison/maddison-project/data/mpd_2013-01.xlsx",
skip = 2, check.names = FALSE)
# Drop every column that is entirely NA.
mpd <- mpd[, !apply(mpd, 2, function(x) all(is.na(x)))]
# The first column is renamed to "year".
names(mpd)[1] <- "year"
# Column 124 (counted after the all-NA columns were dropped) is renamed by hand.
# NOTE(review): this hard-coded position depends on the layout of this specific
# workbook version.
names(mpd)[124] <- "Byzantium_Ottoman_Empire_Turkey"
# Remove leading digits, periods, parentheses, the literal text "&amp;" and
# apostrophes from the column names, then trim surrounding whitespace.
# NOTE(review): "&amp;" is an escaped ampersand -- confirm that read.xls really
# returns names in that form rather than a plain "&".
colnames(mpd) <- trim(gsub("^[0-9]+|\\.|\\(|\\)|(&amp;)|'", "", colnames(mpd)))
# Replace "-" or "/" followed by whitespace with a single "-".
colnames(mpd) <- gsub("-\\s+|/\\s+", "-", colnames(mpd))
@zmjones
zmjones / parses3logs.py
Created February 7, 2014 13:53
parse and clean log files from AWS S3
import csv
import os
import re
import dateutil
import pandas as pd
from urlparse import urlparse
# Location of the log files; empty here.
# NOTE(review): presumably must be set to a real path before running -- confirm.
log_path = ''
# parsing code: http://ferrouswheel.me/2010/01/python_tparse-fields-in-s3-logs/
# Accumulator list, empty at start; presumably filled with parsed log entries later in the script.
log_entries = []
@zmjones
zmjones / partial_dependence_party.R
Last active August 29, 2015 14:04
parallel calculation of marginal or joint dependence of explanatory variables from a party random forest
pkgs <- c("party", "parallel")
invisible(lapply(pkgs, require, character.only = TRUE))
partial_dependence <- function(fit, ivar, cores = 1, ...) {
## calculates the partial dependence of the response on explanatory variable(s)
## fit must be a party object
## ivar must be a character vector of length >= 1 all of which
## exist in the dataframe used to fit the model
## if the length of ivar > 1, joint dependence is calculated
df <- data.frame(get("input", fit@data@env), get("response", fit@data@env))
@zmjones
zmjones / example.R
Last active August 29, 2015 14:22
shiny/ggvis version of plotLearningCurve
library(mlr) ## head
library(shiny)
# Build learning-curve data for two classifiers (rpart and knn) on sonar.task.
# percs runs from 0.2 to 1 in steps of 0.2; the measures are tpr, fpr, fn and
# fp; resampling is "Subsample" with 5 iterations; progress output is disabled.
r = generateLearningCurve(list("classif.rpart", "classif.knn"),
task = sonar.task, percs = seq(0.2, 1, by = 0.2),
measures = list(tpr, fpr, fn, fp),
resampling = makeResampleDesc(method = "Subsample", iters = 5),
show.info = FALSE)
# Plot the generated data with interactive = TRUE.
plotLearningCurve(r, interactive = TRUE)
@zmjones
zmjones / example.R
Last active August 29, 2015 14:23
partial dependence for supervised methods
library(mlr)
library(checkmate)
# Regression example: train an rpart regression learner on bh.task, generate
# partial prediction data for "lstat" and "chas", and plot it with
# facet = "chas".
fr = train("regr.rpart", bh.task)
dr = generatePartialPredictionData(fr, getTaskData(bh.task), c("lstat", "chas"))
plotPartialPrediction(dr, facet = "chas")
# Classification example: train an rpart classifier on iris.task and generate
# partial prediction data for the two petal features. The final argument is a
# function returning the relative frequency of each value of x.
# NOTE(review): presumably this is the function used to aggregate predictions -- confirm against the mlr signature.
fc = train("classif.rpart", iris.task)
dc = generatePartialPredictionData(fc, getTaskData(iris.task), c("Petal.Width", "Petal.Length"),
function(x) table(x) / length(x))
@zmjones
zmjones / example.R
Last active August 29, 2015 14:25
calibration plots for MLR
# Two classification learners (rpart and nnet), both created with
# predict.type = "prob".
lrns = list(makeLearner("classif.rpart", predict.type = "prob"),
makeLearner("classif.nnet", predict.type = "prob"))
# Train each learner on iris.task and predict on the same task.
fit = lapply(lrns, train, task = iris.task)
pred = lapply(fit, predict, task = iris.task)
# Name the predictions to match the order of `lrns`.
names(pred) = c("rpart", "nnet")
# Generate calibration data from the named predictions and plot it.
out = generateCalibrationData(pred)
plotCalibration(out)
# Repeat the train/predict step on sonar.task; `fit` and `pred` are overwritten.
fit = lapply(lrns, train, task = sonar.task)
pred = lapply(fit, predict, task = sonar.task)