Skip to content

Instantly share code, notes, and snippets.

View revodavid's full-sized avatar

David Smith revodavid

View GitHub Profile
#' Compute ROC curve coordinates from labels and scores
#'
#' Sorts the labels by descending score and accumulates the true-positive and
#' false-positive rates down that ranking.
#'
#' @param labels Logical (or 0/1) vector; TRUE marks a positive case.
#' @param scores Numeric vector of classifier scores, one per label; higher
#'   scores are ranked first.
#' @return A data.frame with columns `TPR`, `FPR` and `labels`, one row per
#'   observation, ordered from highest to lowest score.
simple_roc <- function(labels, scores) {
  # Guard: with mismatched lengths the index from order() either runs past the
  # end of `labels` (filling the curve with NA) or silently drops labels.
  stopifnot(length(labels) == length(scores))
  labels <- labels[order(scores, decreasing = TRUE)]
  data.frame(
    TPR = cumsum(labels) / sum(labels),
    FPR = cumsum(!labels) / sum(!labels),
    labels
  )
}
# Seed the random number generator so the random draws that follow
# (runif/rnorm in the simulation, sample for the split) are repeatable.
set.seed(1)
#' Simulate widget measurements with a binary "bad widget" flag
#'
#' @param N Number of widgets to simulate.
#' @param noise Standard deviation of the Gaussian noise added to `y`.
#' @return A data.frame with columns `x` (uniform on [0, 100]), `y`
#'   (122 - x/2 plus noise) and `bad_widget` (factor of `y > 100`).
sim_widget_data <- function(N, noise = 100) {
  # Draw order matters for reproducibility: uniform x first, then the noise.
  measurement <- runif(N, min = 0, max = 100)
  gaussian_error <- rnorm(N, sd = noise)
  response <- 122 - measurement / 2 + gaussian_error
  data.frame(
    x = measurement,
    y = response,
    bad_widget = factor(response > 100)
  )
}
# Simulate 500 widgets with noise sd 10.
widget_data <- sim_widget_data(500, 10)

# Hold out a random quarter of the rows for testing and train on the rest.
# The split itself was missing: test_set_idx was computed but never applied,
# leaving test_set and training_set undefined for the calls below.
test_set_idx <- sample(
  seq_len(nrow(widget_data)),
  size = floor(nrow(widget_data) / 4)
)
test_set <- widget_data[test_set_idx, ]
training_set <- widget_data[-test_set_idx, ]

# Logistic regression of the bad-widget flag on x, fitted on the training rows.
fit_glm <- glm(
  bad_widget ~ x,
  data = training_set,
  family = binomial(link = "logit")
)

# Score the held-out rows on both the link scale and the response scale.
glm_link_scores <- predict(fit_glm, newdata = test_set, type = "link")
glm_response_scores <- predict(fit_glm, newdata = test_set, type = "response")

# Collect both score types next to the true label for each test row.
score_data <- data.frame(
  link = glm_link_scores,
  response = glm_response_scores,
  bad_widget = test_set$bad_widget,
  stringsAsFactors = FALSE
)
library(pROC)
# Plot the ROC curve for the response-scale GLM scores on the test set using
# pROC. direction="<" matches the recorded output below, where controls
# (bad_widget FALSE) are compared as "<" cases (bad_widget TRUE).
plot(roc(test_set$bad_widget, glm_response_scores, direction="<"),
col="yellow", lwd=3, main="The turtle finds its way")
##
## Call:
## roc.default(response = test_set$bad_widget, predictor = glm_response_scores, direction = "<")
##
## Data: glm_response_scores in 59 controls (test_set$bad_widget FALSE) < 66 cases (test_set$bad_widget TRUE).
## Area under the curve: 0.9037
# Compute ROC coordinates by hand with simple_roc() from the link-scale scores.
# bad_widget is compared with "TRUE" to turn it into a logical label vector.
glm_simple_roc <- simple_roc(test_set$bad_widget=="TRUE", glm_link_scores)
# Rare-event demo: draw N outcomes that are TRUE with probability P, then
# score every observation with the same constant (0) and plot the ROC curve
# of that uninformative predictor with its AUC printed on the plot.
set.seed(1)
N <- 2000
P <- 0.01
rare_success <- sample(
  c(TRUE, FALSE),
  size = N,
  replace = TRUE,
  prob = c(P, 1 - P)
)
guess_not <- numeric(N)
plot(roc(rare_success, guess_not), print.auc = TRUE)
##
## Call:
## roc.default(response = rare_success, predictor = guess_not)
##
## Plot all Starbucks locations using OpenStreetMap
## Credit: http://www.computerworld.com/article/2893271/business-intelligence/5-data-visualizations-in-5-minutes-each-in-5-lines-or-less-of-r.html
# NOTE(review): checkpoint("2016-08-22") presumably pins package versions to
# that snapshot date — confirm against the checkpoint documentation.
library(checkpoint)
checkpoint("2016-08-22")
# Download the Starbucks locations CSV straight from the URL (needs network).
file <- "https://opendata.socrata.com/api/views/ddym-zvjk/rows.csv"
starbucks <- read.csv(file)
library(leaflet); library(magrittr)
# Build the map: base tiles, an initial view at zoom 16, then one marker per
# row placed from the Latitude/Longitude columns with the Name column as popup.
# NOTE(review): setView() is given positional coordinates; presumably lng then
# lat — confirm against the leaflet documentation.
leaflet() %>% addTiles() %>% setView(-84.3847, 33.7613, zoom = 16) %>%
addMarkers(data = starbucks, lat = ~ Latitude, lng = ~ Longitude, popup = starbucks$Name)
## Plot last 6 months of ANTM share price
## Credit: http://www.computerworld.com/article/2893271/business-intelligence/5-data-visualizations-in-5-minutes-each-in-5-lines-or-less-of-r.html
# NOTE(review): checkpoint("2016-08-22") presumably pins package versions to
# that snapshot date — confirm against the checkpoint documentation.
library(checkpoint)
checkpoint("2016-08-22")
library(quantmod)
# getSymbols() is called for its side effect: with auto.assign=TRUE the data
# is stored in a variable named ANTM, which the next line reads.
getSymbols("ANTM", auto.assign=TRUE)
# Chart only the most recent six months of the series.
barChart(ANTM, subset = 'last 6 months')
## Plot Atlanta area unemployment
## Credit: http://www.computerworld.com/article/2893271/business-intelligence/5-data-visualizations-in-5-minutes-each-in-5-lines-or-less-of-r.html
# NOTE(review): checkpoint("2016-08-22") presumably pins package versions to
# that snapshot date — confirm against the checkpoint documentation.
library(checkpoint)
checkpoint("2016-08-22")
library(quantmod)
# getSymbols() is called for its side effect: it stores the FRED series in a
# variable named ATLA013URN, which the following lines read.
getSymbols("ATLA013URN", src = "FRED")
# Label the series "rate"; uses `<-` rather than `=` for assignment.
names(ATLA013URN) <- "rate"
library(dygraphs)
# Render the series as an interactive time-series chart.
dygraph(ATLA013URN, main = "Atlanta area unemployment")
## Credit: http://www.computerworld.com/article/2893271/business-intelligence/5-data-visualizations-in-5-minutes-each-in-5-lines-or-less-of-r.html
# NOTE(review): checkpoint("2016-08-22") presumably pins package versions to
# that snapshot date — confirm against the checkpoint documentation.
library(checkpoint)
checkpoint("2016-08-22")
## Correlation plot
# Download the test-score CSV from GitHub (needs network); strings stay
# character rather than being converted to factors.
file <- "https://github.com/smach/NICAR15data/raw/master/testscores.csv"
testdata <- read.csv(file, stringsAsFactors = FALSE)
library(ggvis)
# Scatter plot of score against pctpoor, with sliders controlling point size
# (10 to 310) and opacity (0 to 1).
# NOTE(review): the chain ends with a dangling %>% — the final layer appears
# to be cut off in this view; restore it before running.
ggvis(testdata, ~ pctpoor, ~ score) %>%
layer_points(size := input_slider(10, 310, label = "Point size"), opacity := input_slider(0, 1, label = "Point opacity")) %>%
@revodavid
revodavid / CRAN_pkg_history.R
Last active May 1, 2021 12:58 — forked from andrie/CRAN_pkg_history.R
Scrapes CRAN for historical number of packages per release
# Scrapes CRAN archives to determine the number of packages per release
# Create a list of pages to scrape, including both archive and current
extract_url <- function() {
url <- list(
archive = "https://cran-archive.r-project.org/bin/windows/contrib/",
active = "https://cran.r-project.org/bin/windows/contrib/"
)
get_urls <- function(url) {