Sam Firke sfirke

## gist:e9bd0c41f5105bc540d5
SELECT Id, Body, Tags
FROM Posts
WHERE Body LIKE '%set.seed(%' AND Tags LIKE '%r%'

## clean_names.R
clean_names <- function(dat){
  # Takes a data.frame, returns the same data frame with cleaned names
  old_names <- names(dat)
  new_names <- old_names %>%
    gsub("%", "percent", .) %>%
    make.names(.) %>%
    gsub("[.]+", "_", .) %>%
    tolower(.) %>%
    gsub("_$", "", .)
  setNames(dat, new_names)

## date_to_sy.R
## Date to SY function
## Year of 2nd argument does not matter
## Turns 2015-10-02 into "2015-16", and 2016-04-05 into "2015-16", with cutoff day = 2010-07-01
date_to_sy <- function(date_var, last_day_of_sy){
  if(!(is.Date(date_var) & is.Date(last_day_of_sy))){stop("`date_var` and `last_day_of_sy` must both be class Date")}
  cutoff_day <- day(last_day_of_sy)
  cutoff_month <- month(last_day_of_sy)
  case_when(
    is.na(date_var) ~ as.character(NA),
    month(date_var) > cutoff_month ~ paste0(year(date_var), " - ", year(date_var) + 1), # if past cutoff, X - X+!

## email_split.R
library(stringr)

get_part_before_dot <- function(email){
  x <- str_split(email, "[.]")
  lapply(x, `[[`, 1) %>%
    unlist
}


dat <- data.frame(email = c("robert.rosen@tntp.org", "Sam.firke@tntp.org"))

## gist:c0bd2b9c4d4e044b040966841e19a73b

library(pacman)
p_load(fuzzyjoin, dplyr)
# returns clusters of records that almost match
get_fuzzy_dupes <- function(x, max_dist = 2){
  result <- stringdist_inner_join(x, x, max_dist = max_dist, distance_col = "distance")
  result <- result[result[[1]] != result[[2]], ] # remove actual 100% accurate duplicates
  result <- t(apply(result, 1, sort)) # these two lines treat A, B as a duplicate of B, A and remove it.  From http://stackoverflow.com/a/9028416
  result <- result[!duplicated(result), ]
  as_data_frame(result) %>%

## final_predictions.R
final_blank <- read_csv("data/kaggle/SampleSubmission.csv") %>%
  separate(Id, into = c("year", "lower_team", "higher_team"), sep = "_", convert = TRUE, remove = FALSE) %>%
  dplyr::select(-Pred)

final_blank_with_data <- final_blank %>%
  add_kp_data %>%
  create_vars_for_prediction %>%
  mutate(lower_team_court_adv = as.factor("N")) %>%
  dplyr::select(contains("diff"), lower_team_court_adv, contains("rank")) %>%
  dplyr::select(-lower_pre_seas_rank_all, -higher_pre_seas_rank_all)

## file28dc3223345c.R
Package: janitor
Title: Simple Tools for Examining and Cleaning Dirty Data
Version: 0.3.0.9000
Authors@R: c(person("Sam", "Firke", email = "samuel.firke@gmail.com", role = c("aut", "cre")),
    person("Chris", "Haid", email = "chrishaid@gmail.com", role = "ctb"),
    person("Ryan", "Knight", email = "ryangknight@gmail.com", role = "ctb"))
Description: The main janitor functions can: perfectly format data.frame column
    names; provide quick one- and two-variable tabulations (i.e., frequency
    tables and crosstabs); and isolate duplicate records. Other janitor functions
    nicely format the tabulation results. These tabulate-and-report functions

## add_centered_title.R
library(ggplot2)
library(dplyr)
library(grid)
library(gridExtra)

add_centered_title <- function(p, text, font_size){

  title.grob <- textGrob(
    label = text,
    gp = gpar(fontsize = font_size,

## fix_surveymonkey_two_row_headers.R
# Fix dual-row names: if the first row is not NA or containing the word "response", use the one from the first row
# Note: read your SurveyMonkey .csv with readr::read_csv, not read.csv - otherwise this may not work

library(dplyr)
library(janitor)

fix_SM_dual_row_names <- function(dat){
  current_names <- names(dat)
  row_1 <- unlist(dat[1, ])


## render_keep_md.R
# From https://stackoverflow.com/questions/45575971/compile-a-vignette-using-devtoolsbuild-vignette-so-that-md-is-kept-in-the-v
# Usage: render_keep_md("tabyls")

render_keep_md <- function(vignette_name){
  # added the "encoding" argument to get the oe character passed through correctly to the resulting .Md
  rmarkdown::render(paste0("./vignettes/",vignette_name, ".Rmd"), clean=FALSE,  encoding = 'UTF-8')
  files_to_remove = paste0("./vignettes/",vignette_name, c(".html",".knit.md",".utf8.md"))
  lapply(files_to_remove, file.remove)
}
	SELECT Id, Body, Tags
	FROM Posts
	WHERE Body LIKE '%set.seed(%' AND Tags LIKE '%r%'
	clean_names <- function(dat){
	# Takes a data.frame, returns the same data frame with cleaned names
	old_names <- names(dat)
	new_names <- old_names %>%
	gsub("%", "percent", .) %>%
	make.names(.) %>%
	gsub("[.]+", "_", .) %>%
	tolower(.) %>%
	gsub("_$", "", .)
	setNames(dat, new_names)
	## Date to SY function
	## Year of 2nd argument does not matter
	## Turns 2015-10-02 into "2015-16", and 2016-04-05 into "2015-16", with cutoff day = 2010-07-01
	date_to_sy <- function(date_var, last_day_of_sy){
	if(!(is.Date(date_var) & is.Date(last_day_of_sy))){stop("`date_var` and `last_day_of_sy` must both be class Date")}
	cutoff_day <- day(last_day_of_sy)
	cutoff_month <- month(last_day_of_sy)
	case_when(
	is.na(date_var) ~ as.character(NA),
	month(date_var) > cutoff_month ~ paste0(year(date_var), " - ", year(date_var) + 1), # if past cutoff, X - X+!
	library(stringr)

	get_part_before_dot <- function(email){
	x <- str_split(email, "[.]")
	lapply(x, `[[`, 1) %>%
	unlist
	}


	dat <- data.frame(email = c("robert.rosen@tntp.org", "Sam.firke@tntp.org"))

	library(pacman)
	p_load(fuzzyjoin, dplyr)
	# returns clusters of records that almost match
	get_fuzzy_dupes <- function(x, max_dist = 2){
	result <- stringdist_inner_join(x, x, max_dist = max_dist, distance_col = "distance")
	result <- result[result[[1]] != result[[2]], ] # remove actual 100% accurate duplicates
	result <- t(apply(result, 1, sort)) # these two lines treat A, B as a duplicate of B, A and remove it. From http://stackoverflow.com/a/9028416
	result <- result[!duplicated(result), ]
	as_data_frame(result) %>%
	final_blank <- read_csv("data/kaggle/SampleSubmission.csv") %>%
	separate(Id, into = c("year", "lower_team", "higher_team"), sep = "_", convert = TRUE, remove = FALSE) %>%
	dplyr::select(-Pred)

	final_blank_with_data <- final_blank %>%
	add_kp_data %>%
	create_vars_for_prediction %>%
	mutate(lower_team_court_adv = as.factor("N")) %>%
	dplyr::select(contains("diff"), lower_team_court_adv, contains("rank")) %>%
	dplyr::select(-lower_pre_seas_rank_all, -higher_pre_seas_rank_all)
	Package: janitor
	Title: Simple Tools for Examining and Cleaning Dirty Data
	Version: 0.3.0.9000
	Authors@R: c(person("Sam", "Firke", email = "samuel.firke@gmail.com", role = c("aut", "cre")),
	person("Chris", "Haid", email = "chrishaid@gmail.com", role = "ctb"),
	person("Ryan", "Knight", email = "ryangknight@gmail.com", role = "ctb"))
	Description: The main janitor functions can: perfectly format data.frame column
	names; provide quick one- and two-variable tabulations (i.e., frequency
	tables and crosstabs); and isolate duplicate records. Other janitor functions
	nicely format the tabulation results. These tabulate-and-report functions
	library(ggplot2)
	library(dplyr)
	library(grid)
	library(gridExtra)

	add_centered_title <- function(p, text, font_size){

	title.grob <- textGrob(
	label = text,
	gp = gpar(fontsize = font_size,
	# Fix dual-row names: if the first row is not NA or containing the word "response", use the one from the first row
	# Note: read your SurveyMonkey .csv with readr::read_csv, not read.csv - otherwise this may not work

	library(dplyr)
	library(janitor)

	fix_SM_dual_row_names <- function(dat){
	current_names <- names(dat)
	row_1 <- unlist(dat[1, ])
	# From https://stackoverflow.com/questions/45575971/compile-a-vignette-using-devtoolsbuild-vignette-so-that-md-is-kept-in-the-v
	# Usage: render_keep_md("tabyls")

	render_keep_md <- function(vignette_name){
	# added the "encoding" argument to get the oe character passed through correctly to the resulting .Md
	rmarkdown::render(paste0("./vignettes/",vignette_name, ".Rmd"), clean=FALSE, encoding = 'UTF-8')
	files_to_remove = paste0("./vignettes/",vignette_name, c(".html",".knit.md",".utf8.md"))
	lapply(files_to_remove, file.remove)
	}