View gist:e9bd0c41f5105bc540d5
-- Find posts whose body contains set.seed( and whose tags include r
SELECT Id, Body, Tags
FROM Posts
WHERE Body LIKE '%set.seed(%' AND Tags LIKE '%r%'
View date_to_sy.R
library(lubridate) # is.Date(), day(), month(), year()
library(dplyr)     # case_when()

## Date to SY function
## Year of 2nd argument does not matter
## Turns 2015-10-02 into "2015 - 2016", and 2016-04-05 into "2015 - 2016", with cutoff day = 2010-07-01
date_to_sy <- function(date_var, last_day_of_sy){
  if(!(is.Date(date_var) & is.Date(last_day_of_sy))){stop("`date_var` and `last_day_of_sy` must both be class Date")}
  cutoff_day <- day(last_day_of_sy)
  cutoff_month <- month(last_day_of_sy)
  case_when(
    is.na(date_var) ~ as.character(NA),
    month(date_var) > cutoff_month ~ paste0(year(date_var), " - ", year(date_var) + 1), # if past cutoff, X - X+1
    # remaining branches reconstructed from the truncated preview:
    month(date_var) == cutoff_month & day(date_var) > cutoff_day ~ paste0(year(date_var), " - ", year(date_var) + 1),
    TRUE ~ paste0(year(date_var) - 1, " - ", year(date_var)) # on or before cutoff, X-1 - X
  )
}
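# A usage sketch (assumes lubridate's ymd() for parsing; matches the examples in the header comment):
# date_to_sy(ymd("2015-10-02"), ymd("2010-07-01"))  # "2015 - 2016"
# date_to_sy(ymd("2016-04-05"), ymd("2010-07-01"))  # "2015 - 2016"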
View email_split.R
library(stringr)
library(magrittr) # for %>%

get_part_before_dot <- function(email){
  x <- str_split(email, "[.]")
  lapply(x, `[[`, 1) %>%
    unlist
}

dat <- data.frame(email = c("robert.rosen@tntp.org", "Sam.firke@tntp.org"))
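# Hypothetical usage (the preview is truncated here; `first_name` is my own column name):
# dat$first_name <- get_part_before_dot(dat$email)  # "robert", "Sam"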
View gist:c0bd2b9c4d4e044b040966841e19a73b
library(pacman)
p_load(fuzzyjoin, dplyr)

# returns clusters of records that almost match
get_fuzzy_dupes <- function(x, max_dist = 2){
  result <- stringdist_inner_join(x, x, max_dist = max_dist, distance_col = "distance")
  result <- result[result[[1]] != result[[2]], ] # remove actual 100% accurate duplicates
  result <- t(apply(result, 1, sort)) # these two lines treat A, B as a duplicate of B, A and remove it. From http://stackoverflow.com/a/9028416
  result <- result[!duplicated(result), ]
  as_data_frame(result) # return the remaining near-match pairs (preview truncated past this point)
}
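# Example usage (a sketch with made-up names; assumes `x` is a one-column data frame of strings):
# roster <- data.frame(name = c("Sam Firke", "Sam Firk", "Chris Haid"), stringsAsFactors = FALSE)
# get_fuzzy_dupes(roster)  # pairs "Sam Firk" with "Sam Firke" at distance 1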
View final_predictions.R
library(readr) # read_csv()
library(tidyr) # separate()
library(dplyr)

final_blank <- read_csv("data/kaggle/SampleSubmission.csv") %>%
  separate(Id, into = c("year", "lower_team", "higher_team"), sep = "_", convert = TRUE, remove = FALSE) %>%
  dplyr::select(-Pred)

# add_kp_data() and create_vars_for_prediction() are project-specific helpers defined elsewhere
final_blank_with_data <- final_blank %>%
  add_kp_data %>%
  create_vars_for_prediction %>%
  mutate(lower_team_court_adv = as.factor("N")) %>%
  dplyr::select(contains("diff"), lower_team_court_adv, contains("rank")) %>%
  dplyr::select(-lower_pre_seas_rank_all, -higher_pre_seas_rank_all)
View file28dc3223345c.R
Package: janitor
Title: Simple Tools for Examining and Cleaning Dirty Data
Version: 0.3.0.9000
Authors@R: c(person("Sam", "Firke", email = "samuel.firke@gmail.com", role = c("aut", "cre")),
    person("Chris", "Haid", email = "chrishaid@gmail.com", role = "ctb"),
    person("Ryan", "Knight", email = "ryangknight@gmail.com", role = "ctb"))
Description: The main janitor functions can: perfectly format data.frame column
    names; provide quick one- and two-variable tabulations (i.e., frequency
    tables and crosstabs); and isolate duplicate records. Other janitor functions
    nicely format the tabulation results. These tabulate-and-report functions
View add_centered_title.R
library(ggplot2)
library(grid)      # textGrob(), gpar()
library(gridExtra) # grid.arrange()
library(dplyr)

add_centered_title <- function(p, text, font_size){
  title.grob <- textGrob(
    label = text,
    gp = gpar(fontsize = font_size)) # preview truncated mid-call; any further gpar() settings are dropped here
  # assumed completion: stack the plot beneath the centered title grob
  grid.arrange(p, top = title.grob)
}
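# Hypothetical usage of the sketch above:
# p <- ggplot(mtcars, aes(wt, mpg)) + geom_point()
# add_centered_title(p, "Weight vs. fuel economy", font_size = 16)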
View split_tinker_combine_tidyverse.R
# I want to remove duplicate mpg rows where cylinder is 4
# Split, tinker with the data.frames by name, bind_rows
library(magrittr)
library(dplyr)

mtcars %>%
  split(., .$cyl == 4) %$%
  bind_rows(`FALSE`,
            `TRUE` %>%
              distinct(mpg, .keep_all = TRUE))
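# A quick sanity check (my own note, not from the gist): the 11 four-cylinder rows of mtcars
# contain two repeated mpg values (22.8 and 30.4), so the pipe above should return 30 of the 32 rows.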
View tidytext_wordclouds.R
library(pacman)
p_load(tidytext, wordcloud, janeaustenr, dplyr)
data("stop_words")
ppdf <- data.frame(prideprejudice, stringsAsFactors = FALSE)

# create a word cloud
create_word_cloud <- function(dat, col_name, exclude = "", max.words = 50, colors = "#034772", ...){
  col <- deparse(substitute(col_name))
  dat %>%
    # assumed completion of the truncated preview: tokenize, drop stop/excluded words, plot counts
    transmute(text = .data[[col]]) %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word") %>%
    filter(!word %in% exclude) %>%
    count(word) %>%
    with(wordcloud(word, n, max.words = max.words, colors = colors, ...))
}
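# Example call (a sketch; the text column of ppdf is named `prideprejudice`):
# create_word_cloud(ppdf, prideprejudice, max.words = 40)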
View clean_names.R
library(magrittr) # for %>%

clean_names <- function(dat){
  # Takes a data.frame, returns the same data frame with cleaned names
  old_names <- names(dat)
  new_names <- old_names %>%
    gsub("%", "percent", .) %>%
    make.names(.) %>%
    gsub("[.]+", "_", .) %>%
    tolower(.) %>%
    gsub("_$", "", .)
  setNames(dat, new_names)
}
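# Hypothetical usage of this early clean_names():
# df <- data.frame(`% Passing` = 1, `Duplicate Name.1` = 2, check.names = FALSE)
# clean_names(df)  # names become "percent_passing" and "duplicate_name_1"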