This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the blank prediction frame from the sample submission file.
# `Id` is split on "_" into year / lower_team / higher_team (with type
# conversion, and the original `Id` column kept); `Pred` is dropped.
final_blank <- read_csv("data/kaggle/SampleSubmission.csv") %>%
  separate(
    Id,
    into = c("year", "lower_team", "higher_team"),
    sep = "_",
    convert = TRUE,
    remove = FALSE
  ) %>%
  dplyr::select(-Pred)

# Enrich the blank frame and keep the "diff" / "rank" columns plus the
# court-advantage factor, minus the two pre-season "all" rank columns.
# NOTE(review): add_kp_data() and create_vars_for_prediction() are defined
# elsewhere; the columns they add are not visible here -- confirm.
final_blank_with_data <- final_blank %>%
  add_kp_data() %>%
  create_vars_for_prediction() %>%
  # Every row gets the same single-level factor value "N".
  mutate(lower_team_court_adv = as.factor("N")) %>%
  dplyr::select(contains("diff"), lower_team_court_adv, contains("rank")) %>%
  dplyr::select(-lower_pre_seas_rank_all, -higher_pre_seas_rank_all)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(pacman) | |
p_load(fuzzyjoin, dplyr) | |
# returns clusters of records that almost match | |
get_fuzzy_dupes <- function(x, max_dist = 2){ | |
result <- stringdist_inner_join(x, x, max_dist = max_dist, distance_col = "distance") | |
result <- result[result[[1]] != result[[2]], ] # remove actual 100% accurate duplicates | |
result <- t(apply(result, 1, sort)) # these two lines treat A, B as a duplicate of B, A and remove it. From http://stackoverflow.com/a/9028416 | |
result <- result[!duplicated(result), ] | |
as_data_frame(result) %>% |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stringr) | |
# Return the portion of each string that precedes its first ".".
#
# email: a vector of strings (coerced to character if it is not already).
# Returns a character vector the same length as `email`. Strings without a
# dot come back unchanged, NA stays NA, and zero-length input yields
# character(0) (the earlier unlist(lapply(...)) form returned NULL there).
get_part_before_dot <- function(email) {
  # Delete everything from the first literal dot to the end of the string;
  # sub() is vectorised, so no per-element split/extract step is needed.
  sub("[.].*", "", email)
}
# Example data: two e-mail addresses in a single `email` column.
dat <- data.frame(
  email = c("robert.rosen@tntp.org", "Sam.firke@tntp.org")
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Date to SY function | |
## Year of 2nd argument does not matter | |
## Turns 2015-10-02 into "2015 - 2016" (and 2016-04-05 into the same school year), with cutoff day = 2010-07-01
date_to_sy <- function(date_var, last_day_of_sy){ | |
if(!(is.Date(date_var) & is.Date(last_day_of_sy))){stop("`date_var` and `last_day_of_sy` must both be class Date")} | |
cutoff_day <- day(last_day_of_sy) | |
cutoff_month <- month(last_day_of_sy) | |
case_when( | |
is.na(date_var) ~ as.character(NA), | |
month(date_var) > cutoff_month ~ paste0(year(date_var), " - ", year(date_var) + 1), # if past cutoff, X - X+! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
clean_names <- function(dat){ | |
# Takes a data.frame, returns the same data frame with cleaned names | |
old_names <- names(dat) | |
new_names <- old_names %>% | |
gsub("%", "percent", .) %>% | |
make.names(.) %>% | |
gsub("[.]+", "_", .) %>% | |
tolower(.) %>% | |
gsub("_$", "", .) | |
setNames(dat, new_names) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Posts whose body contains the literal text "set.seed(" and whose tag
-- string contains an "r".
-- NOTE(review): '%r%' matches ANY tag string containing an "r" (e.g. "ruby",
-- "regex"), not only a tag named r. If Tags is stored with delimiters around
-- each tag, tighten the pattern accordingly -- confirm against the schema.
SELECT Id, Body, Tags
FROM Posts
WHERE Body LIKE '%set.seed(%'
  AND Tags LIKE '%r%'
NewerOlder