Last active
March 5, 2019 15:13
-
-
Save martinctc/53444d4847055a870417f5b279f1b8bd to your computer and use it in GitHub Desktop.
[surveyr] Useful functions for tidy survey analysis. Under development... #R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### Source this from github using following link #### | |
# devtools::source_gist("https://gist.github.com/martinctc/53444d4847055a870417f5b279f1b8bd", filename="surveyr.R") | |
#' Calculate weight table using a single source/target column | |
#' (indicating what proportions need to be multiplied to achieve weighted proportions) | |
#' | |
#' @param x Data frame containing columns to be weighted | |
#' @param target bare (unquoted) column name of the variable used for creating weight | |
#' @param weights vector containing the proportions of what each value of the target column should take | |
calc_weights <- | |
function(x, target, weights){ | |
require(broom) | |
targ <- eval(substitute(target),x) | |
targ %>% | |
table() %>% | |
tidy() %>% | |
mutate(Freq2= Freq / sum(Freq)) %>% | |
cbind(weights) %>% | |
mutate(WFreq = Freq2 / weights) %>% | |
select("Category"=1,"Weights"=5) %>% | |
as.tibble() | |
} | |
#' Attach weights to data frame | |
#' | |
#' Add weights to the original data frame, to be used together with calc_weights() | |
#' This function is not necessary if a 'weights' column is already present in data set | |
#' | |
#' @param x Data frame to attach required weights | |
#' @param target Column from which the weights are calculated from. | |
#' @param weight_table Output produced from calc_weights(). | |
add_weights <- function(x, target, weight_table){ | |
targ <- enquo(target) | |
temp <- create_named_list(weight_table$Category, weight_table$Weights) | |
x %>% | |
mutate(WEIGHT=!!targ) %>% | |
mutate_at(vars(WEIGHT),funs(recode(.,!!!temp))) | |
} | |
#' Create a named list object with two vectors | |
#' Used for constructing arguments for add_weights(). | |
#' | |
#' @param x Vector containing character strings, e.g. "A" | |
#' @param y Vector containing numeric values, e.g. 1, 3 | |
create_named_list <- function(x, y){ | |
quostr <- function(list){ | |
options(useFancyQuotes = FALSE) | |
list2 <- lapply(list,function(x)toString(dQuote(x))) | |
list2 | |
} | |
weave <- function(list1, list2, num){ | |
paste0(list1[num],"=",list2[num]) | |
} | |
len <- length(x) | |
temp <- sapply(1:len,weave, list1=quostr(x),list2=y) | |
temp <- paste(temp, collapse = ", ") | |
temp <- paste0("c(",temp,")") | |
temp2 <- eval(parse(text=temp)) | |
temp2 | |
} | |
#' Create cross table with an option to add weights (single-choice) | |
#' | |
#' @param x Data frame with the required columns. | |
#' @param downbreak The main question to analyse by. | |
#' @param crossbreak The 'banner' by which the downbreak would be split by. | |
#' @param weights Pass the weight column into this argument - defaults to NULL for unweighted tables. | |
xtab <- | |
function(x,downbreak,crossbreak,weights=NULL){ | |
db.q <- enquo(downbreak) | |
cb.q <- enquo(crossbreak) | |
cb.string <- deparse(substitute(crossbreak)) #Convert to string | |
if(deparse(substitute(weights))!="NULL"){ | |
#If weights are used | |
wts.q <- enquo(weights) | |
x %>% | |
count(!!cb.q,!!db.q,!!wts.q) %>% #Counts of all | |
spread_(cb.string, "n", fill=0) %>% #Using standard evaluation | |
mutate(Total=rowSums(select(.,-!!db.q,-!!wts.q)))%>% #Create Total column | |
mutate_at(vars(-!!db.q,-!!wts.q), | |
funs(.*!!wts.q)) %>% | |
mutate_at(vars(-!!db.q,-!!wts.q), | |
funs(prop.of.itself)) %>% | |
#Create weighted proportions here | |
select(-!!wts.q) %>% | |
group_by(!!db.q) %>% | |
summarise_all(sum) | |
} else { | |
#If no weights applied | |
x %>% | |
count(!!cb.q,!!db.q) %>% | |
spread_(cb.string, "n", fill=0) %>% | |
mutate(Total=rowSums(select(.,-!!db.q)))%>% | |
mutate_at(vars(-!!db.q), | |
funs(prop.of.itself)) | |
} | |
} | |
#'Returns the proportions of each value in a vector as the sum of all values in the vector | |
#' | |
#'@param x Value in vector | |
prop.of.itself <- function(x){x / sum(x)} #Return proportion of vector | |
#' Create single-variable table with an option to add weights (single-choice) | |
#' | |
#' @param x Data frame with the required columns. | |
#' @param downbreak The main question to analyse by. | |
#' @param weights Pass the weight column into this argument - defaults to NULL for unweighted tables. | |
tab <- | |
function(x,downbreak, weights=NULL){ | |
db.q <- enquo(downbreak) | |
wts.q <- enquo(weights) | |
t.string <- deparse(substitute(target)) | |
if(deparse(substitute(weights))!="NULL"){ | |
x %>% | |
count(!!db.q, !!wts.q) %>% | |
mutate_at(vars(n), funs(.*!!wts.q))%>% | |
mutate_at(vars(n), funs(prop.of.itself))%>% | |
group_by(!!db.q) %>% | |
summarise(sum = sum(n)) | |
} else { | |
#If no weights are applied | |
x %>% count(!!db.q) %>% | |
mutate_at(vars(-!!db.q),funs(prop.of.itself)) | |
} | |
} | |
#' Convert a Likert scale from one scale to another | |
#' | |
#' This is used for converting data, for instance, from a 6-point scale to a 5-point scale. | |
#' The scale "dimensions" are specified in the function argument, and the function converts the numeric vector that is passed through. | |
#' | |
#' @param x Numeric vector to be passed through. | |
#' @param top.x Top value of the original scale. This would be 6 on a 0-6 scale. | |
#' @param bot.x Bottom value of the original scale. This would be 0 on a 0-6 scale. | |
#' @param top.y Top value of the new/target scale. This would be 5 on a 0-5 scale. | |
#' @param bot.y Bottom value of the new/target scale. This would be 0 on a 0-5 scale. | |
#' @keywords likert scale | |
#' @examples | |
#' data <-c(5, 4, 3, 2, 1) | |
#'likert.scaler(data,5,0,10,0) #5-point scale to 10-point scale | |
#'[1] 10 8 6 4 2 | |
likert.convert <- function(x, top.x, bot.x, top.y, bot.y){ | |
y <- ((top.y-bot.y)*(x-bot.x)/(top.x-bot.x))+bot.y | |
y | |
} | |
#' Reverse a Likert scale | |
#' | |
#' Reverse a Likert scale such that on a 0-10 scale, 10 becomes 0 and 0 becomes 10. | |
#' | |
#' @param x Numeric vector | |
#' @param top Top value of the scale for the variable, e.g. 10 for a 0-10 scale. | |
#' @param top Bottom value of the scale for the variable, e.g. 0 for a 0-10 scale. | |
#' @keywords likert scale | |
#' @examples | |
#' data <-c(5, 4, 3, 2, 6) | |
#' likert.reverse(data, 6, 0) | |
#' [1] 1 2 3 4 0 | |
likert.reverse <- function(x,top,bottom){ | |
total <-top+bottom | |
x <- total-x | |
x | |
} | |
#' Max-Min Scaling Function | |
#' | |
#' This function allows you to scale vectors or an entire data frame using the max-min scaling method, returning a vector | |
#' Suitable to be used with pipes | |
#' | |
#' @param x Pass a vector or the required columns of a data frame through this argument. | |
#' @keywords max-min | |
#' @export | |
#' @examples | |
#' rand.data <-cbind(sample(1000,234:697),sample(1000,234:697)) %>% as.data.frame() | |
#' maxmin(rand.data) | |
#' | |
#' rand.data <-sample(1000,234:677) | |
#' maxmin(rand.data) | |
maxmin <- function(x, na.rm=TRUE){ | |
if(is.vector(x)){ | |
maxs <- max(x,na.rm=na.rm) | |
mins <- min(x,na.rm=na.rm) | |
as.numeric(scale(x,center=mins,scale=maxs-mins)) | |
} else { | |
maxs <- apply(x, 2,max,na.rm=na.rm) | |
mins <- apply(x, 2,min,na.rm=na.rm) | |
as.numeric(scale(x, center = mins, scale = maxs - mins)) | |
} | |
} | |
#'Split the data into a simple training and testing set | |
#' | |
#'@param x Pass your data frame or matrix here. | |
#'@param part A numeric value between 0 and 1 to represent the proportion of the whole data you want to use as the training set. | |
#'@keywords train, test | |
#'@examples | |
#'x <-as.data.frame(matrix(1:5000,250,20)) | |
#'x.train <- split.tt(x,.7)$train | |
#'x.test <- split.tt(x,.7)$test | |
#' | |
#'dim(x.train) | |
#'dim(x.test) | |
split.tt <-function(x, part){ | |
rowz <- nrow(x) | |
samp <- sample(seq_len(rowz),floor(rowz * part)) | |
output <-list("train"=x[samp,],"test"=x[-samp,]) | |
output | |
} | |
#' Translate function using transltr.org API | |
#' | |
#' This function allows you to translate character strings through the transltr.org API. | |
#' @param what What text string to translate | |
#' @param from What language to translate from. Defaults to NULL | |
#' @param to What language to translate to. Defaults to "en" | |
#' @keywords translate | |
#' @export | |
#' @examples | |
#' trans.t(what="Blanc",from="fr",to="en") | |
trans.t <- function(what=NULL,from=NULL,to="en"){ | |
url <- paste0("http://transltr.org/api/translate?text=",URLencode(what), | |
"&to=",to,"&from=",from) | |
output<-jsonlite::fromJSON(txt=url) | |
output$translationText | |
} | |
#' Translate function using the Microsoft Translator API | |
#' | |
#' This function allows you to translate character strings through the Microsoft Translator API. | |
#' To use this function, you will need to first acquire an access token and pass it through the key argument. | |
#' @param what What text string to translate | |
#' @param from What language to translate from. Defaults to NULL | |
#' @param to What language to translate to. Defaults to "en" | |
#' @keywords translate | |
#' @export | |
#' @examples | |
#' trans.m(what="Blanc",from="fr",to="en",key="F023kljadfoilkjlkj") | |
trans.m <- function(what, from=NULL,to="en",key=MyKey){ | |
token_url <- "https://api.cognitive.microsoft.com/sts/v1.0/issueToken" | |
token_body <- list('Subscription-Key'=key) | |
resp<-httr::POST(token_url,query=token_body) | |
tok_string <- resp %>% httr::content(type="text",encoding='utf-8') | |
w.token <- paste("Bearer",tok_string) | |
transurl <- "http://api.microsofttranslator.com/V2/Http.svc/Translate" | |
tranz <- httr::GET(transurl, | |
query=list('appId'=w.token, | |
text=what,from=from, | |
to=to)) | |
tranz %>% xmlParse %>% xmlToList() | |
} | |
#' Returns sum of matched instances of pattern in string | |
str_count <- function(string,pattern){ | |
sum(str_detect(string, pattern),na.rm=TRUE) | |
} | |
#' str_count function with dual condition | |
#' Returns sum of matched instances of both pattern1 and pattern2 in string | |
#' Both pattern1 and pattern2 must be TRUE to be counted | |
str_count_2 <- function(string,pattern1,pattern2){ | |
sum(str_detect(string, pattern1) & str_detect(string, pattern2), | |
na.rm=TRUE) | |
} | |
#' Count the number of words in text string | |
#' | |
#' @param string Pass text string here. | |
#' @param pseudo Determines whether groups of special characters are matched. Defaults to FALSE (not matched) | |
#' | |
nwords <- function(string, pseudo=F){ | |
ifelse( pseudo, | |
pattern <- "\\S+", | |
pattern <- "[[:alpha:]]+" | |
) | |
str_count(string, pattern) | |
} | |
#' Split long sentences into two lines | |
#' | |
#' @param string Pass text string here. | |
#' @param words The maximum number of words allowed in the first line | |
#' @examples | |
#' string_liner("This is a lovely cup of tea",2) | |
#' string_liner("This is a lovely cup of tea",3) | |
str_next_line <- | |
function(string, words) { | |
nwords(string) -> p | |
words_list <- str_split(string, " ") %>% | |
unlist() | |
words_plus <- words + 1 | |
if (p > words_plus) { | |
material_1 <- paste0(words_list[1:words], collapse = " ") %>% unlist() | |
material_2 <- "\n" | |
material_3 <-paste0(words_list[words_plus:p], collapse = " ") %>% unlist() | |
paste(material_1, material_2, material_3, collapse = " ") %>% | |
str_replace_all("NA","") %>% | |
str_trim() | |
} else if (p == words_plus) { | |
material_1 <- paste0(words_list[1:words], collapse = " ") %>% unlist() | |
material_2 <- "\n" | |
material_3 <- paste0(words_list[p], collapse = " ") %>% unlist() | |
paste(material_1, material_2, material_3, collapse = " ") %>% | |
str_replace_all("NA","") %>% | |
str_trim() | |
} else if (p <= words){ | |
string | |
} | |
} | |
#' Returning a data frame summarising the counts and percentages of row-by-row identical values across the columns of a data frame. | |
#' Produce a data frame of all columns on whether they match values of target column | |
#' | |
#' @param df Pass a data frame containing some columns including a "target" column | |
#' @param target Main column that you want to compare with other columns - use bare / unquoted reference. | |
match_counter <- function(df,target){ | |
arguments <- as.list(match.call()) | |
targ <- deparse(substitute(target)) #variable to string | |
df %>% | |
mutate_all(funs(.==eval(parse(text=targ)))) %>% | |
colSums(na.rm = TRUE) %>% data.frame() %>% | |
rownames_to_column() -> matches | |
names(matches)[2] <- "counts" #rename column as "counts" | |
matches$counts[matches$rowname==targ] -> periods #Count number of periods | |
matches %>% mutate(prop=counts/periods) %>% #Create proportion column | |
select("Variable"=rowname,"Counts"=counts, "Percentage"=prop) | |
} | |
#' Create file name with time stamp | |
#' | |
#' @param main The main file name to be used. | |
#' @param extension The file extension to be used, e.g. ".csv" | |
#' @examples | |
#' timed_fn("Q15. ",".xlsx") | |
timed_fn <- function(main,extension){ | |
Sys.time() %>% | |
stringr::str_replace_all("[[:punct:]]","") %>% | |
stringr::str_replace_all(" ","_") -> xtime | |
paste(main,xtime,extension) | |
} | |
#' Saves or overwrites a worksheet to an Excel Workbook in a form compatible with the pipe style - using the openxlsx package. | |
#' | |
#' @param data Data to be written. | |
#' @param workbook Workbook object created with openxlsx::createWorkbook(). | |
#' @param Worksheet Character string containing name of the worksheet. | |
#' @examples | |
#' ## Accompanying functions | |
#' library(openxlsx) | |
#' wb <- createWorkbook() | |
#' saveWorkbook(wb,filename,overwrite=TRUE) | |
write_to_ws <- function(data, workbook, worksheet){ | |
if (worksheet %in% names(workbook)) { | |
writeData(workbook, worksheet, data) | |
} | |
else{ | |
addWorksheet(workbook, worksheet) | |
writeData(workbook, worksheet, data) | |
} | |
} | |
#' Clean strings so that they can be used as variable names or column names | |
#' | |
#' @param string Character string to be "cleaned". | |
#' @param treat_dups Set to FALSE to allow duplication of strings. Defaults to TRUE | |
clean_strings <- function(string,treat_dups=TRUE){ | |
new_string <- string %>% | |
gsub("'", "", .) %>% | |
gsub("\"","", .) %>% | |
gsub("%", "percent", .) %>% | |
gsub("^[ ]+","", .) %>% | |
make.names(.) %>% | |
gsub("[.]+", "_", .) %>% | |
gsub("[_]+", "_", .) %>% | |
tolower(.) %>% | |
gsub("_$", "",.) | |
dupe_count <- sapply(1:length(new_string), function(i) { | |
sum(new_string[i] == new_string[1:i]) | |
}) | |
if(treat_dups==TRUE){ | |
new_string[dupe_count > 1] <- paste(new_string[dupe_count > | |
1], dupe_count[dupe_count > 1], sep = "_") | |
return(new_string) | |
} else if(treat_dups==FALSE){ | |
return(new_string) | |
} | |
} | |
#' Replace nth occurring value in a string | |
#' A more refined replacement function than str_replace() and str_replace_all() | |
#' | |
#' @param x Pass string character | |
#' @param pattern String containing characters to match | |
#' @param replacement String containing to replace | |
#' @param n Nth term to be replaced | |
str_replace_nth <- function(x, pattern, replacement, n) { | |
g <- gregexpr(pattern, x)[[1]][n] | |
s <- scan(text = gsub("[()]", "", pattern), | |
sep = "|", | |
what = "") | |
substr(x, g, g) <- replacement[match(substr(x, g, g), s)] | |
x | |
} | |
#' Append an item to a list dynamically (pipe-optimised) | |
#' | |
#' Function directly appends item to list in Global Environment | |
#' @param x An object to append to list, e.g. vector, data frame. | |
#' @param list_x Target list to append object to. | |
#' @param name Specify a character string for the name of the list. Defaults to blank | |
#' | |
#' @example | |
#' a_list <- list(NULL) | |
#' append_to_list(iris,a_list,"iris") | |
append_to_list <- function(x, list_x, name=""){ | |
temp <- deparse(substitute(list_x)) | |
list_len <- length(list_x) | |
list_len1 <- list_len + 1 | |
if(is.null(list_x[[1]])){ # Treatment if list is empty | |
list_x[[1]] <- x | |
names(list_x)[1] <- name | |
assign(temp,list_x,envir = .GlobalEnv) | |
} else { # Treatment if list has already got at least one object | |
list_x[[list_len1]] <- x | |
names(list_x)[list_len1] <- name | |
assign(temp,list_x,envir = .GlobalEnv) | |
} | |
} | |
#' Calculate percentage change of a vector relative to a lag k | |
#' | |
#' @param x A vector to be passed through | |
#' @param lag The number of lags used, defaults to 1 | |
pc_change <-function(x,lag=1){ | |
base <- lag(x,lag) # base to be divided by | |
c(rep(NA,lag),diff(x,lag))/base | |
} | |
#' Copy a data frame to clipboard for pasting in Excel | |
#' Pipe optimised | |
#' @param expand Add number to manually expand clipboard size | |
#' @param quietly Set this to TRUE to not print data frame on console | |
copy_df <-function(x,row.names=FALSE,col.names=TRUE,expand="",quietly=FALSE,...) { | |
expand_x <- stringr::str_remove_all(expand,"-") | |
if(expand==""){ | |
write.table(x,paste0("clipboard",expand),sep="\t",row.names=row.names,col.names=col.names,...) | |
} else { | |
expand_x <- paste0("-",expand_x) | |
write.table(x,paste0("clipboard",expand_x),sep="\t",row.names=row.names,col.names=col.names,...) | |
} | |
if(quietly==FALSE) print(x) | |
} | |
#' Remove all columns which contain only NAs | |
#' Pipe optimised | |
#' @param df Data frame to be passed through | |
na_remover <- function(df){ | |
Filter(function(x)!all(is.na(x)),df) | |
} | |
#' Remove all columns which contain only zeros | |
#' Pipe optimised | |
#' @param df Data frame to be passed through | |
zero_remover <- function(df){ | |
Filter(function(x)!all(x==0),df) | |
} | |
#' Set working directory - for personal use only | |
set_wd <- function(path,where_you_at="office"){ | |
if(where_you_at=="office"){ | |
setwd(path) | |
} else if(where_you_at=="home"){ | |
setwd(stringr::str_replace_all(path,".Chan","")) | |
} | |
} | |
#' Sum rowwise, returning a dataframe with a new column created | |
#' | |
#' @param df Data frame to be passed through | |
#' @param col_name String input for the name of the new column | |
#' @param condition dplyr-based selection arguments in dplyr::select | |
#' @param na_rm Logical - whether to remove NAs when calculatng sum. Defaults to TRUE | |
sum_rowwise <- function(df,col_name="RowSum",condition,na_rm=TRUE){ | |
df %>% | |
mutate(!!sym(col_name):= | |
apply( | |
select(.,condition), | |
1,sum, na.rm=na_rm)) | |
} | |
# sum_rowwise(df_nbenefits,"n_benefits",starts_with("Q120")) | |
#' Conditional sum rowwise, returning a dataframe with a new column created | |
#' Sum up cell values if meeting criteria | |
#' | |
#' @param df Data frame to be passed through | |
#' @param col_name String input for the name of the new column | |
#' @param condition vector containing values to match in selected range in rows | |
#' @param select_helpers dplyr-based selection arguments in dplyr::select | |
#' @param na_rm Logical - whether to remove NAs when calculatng sum. Defaults to TRUE | |
sum_rowwise_c <- function(df,col_name="RowSum", | |
select_helpers, | |
condition, | |
na_rm=TRUE){ | |
df %>% | |
mutate(!!sym(col_name):= | |
apply( | |
select(.,select_helpers), | |
1,function(x)sum(x %in% condition, na.rm=na_rm))) | |
} | |
# sum_rowwise_c(df_nbenefits,"n_benefits",c(1,12,3),starts_with("Q120")) | |
#' Convenience function to apply the mean function rowwise, using select helpers from tidyverse | |
mean_rowwise <- function(df,col_name="RowSum", | |
select_helpers, | |
na_rm=TRUE){ | |
df %>% | |
mutate(!!sym(col_name):= | |
apply( | |
select(.,select_helpers), | |
1,function(x)mean(x, na.rm=na_rm))) | |
} | |
#' Convert a numeric into a quantile categorical variable, labelled by lower and upper bounds of quantiles (string) | |
#' @param dat Vector, column, or numeric variable to be passed through | |
labelled_quantile <- function(dat){ | |
pt1 <- quantile(dat, probs = seq(0, 1, by = 0.2), type = 7) | |
pt2 <- unique(as.data.frame(pt1), fromLast = TRUE) | |
paste0(pt2[1,],"_TO_",pt2[2,]) -> q1_lab | |
paste0(pt2[2,],"_TO_",pt2[3,]) -> q2_lab | |
paste0(pt2[3,],"_TO_",pt2[4,]) -> q3_lab | |
paste0(pt2[4,],"_TO_",pt2[5,]) -> q4_lab | |
paste0(pt2[5,],"_UP") -> q5_lab | |
pt3 <- rownames(pt2) | |
pt4 <- as.integer(strsplit(pt3, "%")) | |
datp <- pt4[as.integer(.bincode(dat, c(0, pt2$pt1), right=FALSE,include.lowest=TRUE))] | |
datp | |
case_when(datp==0~q1_lab, | |
datp==20~q1_lab, | |
datp==40~q2_lab, | |
datp==60~q3_lab, | |
datp==80~q4_lab, | |
datp==100~q5_lab) | |
} | |
#' CAGR Calculator | |
#' http://www.investopedia.com/terms/c/cagr.asp | |
CAGR <- function(value.beginning, value.ending, number.of.periods){ | |
((value.ending / value.beginning) ^ (1 / number.of.periods)) - 1 | |
} | |
# tibdf <- tibble(company = rep(c("ABC", "CDE"), each = 5), | |
# year = rep(2000:2004, 2), | |
# variable = rep("revenue", 10), | |
# data = c(10, 15, 12, 25, 30, 5, 8, 17, 9, 34)) | |
# | |
# tibdf %>% | |
# arrange(company,year) %>% | |
# group_by(company) %>% | |
# mutate(change=data-lag(data), | |
# CAGR=CAGR(lag(data,2),data,2)) | |
#' Capitalise all letters | |
#' @param x String, or a character vector | |
cap_all <- function(string) { | |
gsub("(^|[[:space:]])([[:alpha:]])", "\\1\\U\\2", string, perl=TRUE) | |
} | |
#' Iteratively filter different columns by the same criteria to summarise a single column by mean | |
#' To be developed for sum | |
iterative_mean <- function(df,equals,select_helpers,mean_var){ | |
df %>% select(select_helpers) %>% names() -> var_list | |
mean_var_str <- deparse(substitute(mean_var)) | |
mean_var <- enquo(mean_var) | |
lapply(1:length(var_list),function(i){ | |
df %>% | |
filter(!!sym(var_list[[i]]) %in% equals) %>% | |
summarise(!!sym(mean_var_str):=mean(!!mean_var,na.rm=TRUE)) | |
}) %>% bind_rows() -> p | |
cbind(var_list,p) | |
} | |
#' Convert numeric variable into categorical variable | |
#' Essentially a wrapper around the cut() function | |
#' @param x vector to be passed through | |
#' @param break points for the categories. Both the top and bottom breaks must be provided. | |
categorise <- function(x, breaks){ | |
x %>% | |
cut(breaks=breaks, | |
# labels=letters[1:(length(breaks)-1)], | |
include.lowest=TRUE, | |
right=TRUE) %>% | |
unlist() %>% as.character() | |
} | |
# seq(10) %>% categorise(breaks=c(0,3,10)) | |
#' Replace NAs randomly from a selected range with replacement | |
#' @param range Vector of values to replace NA with | |
replace_na_range <- function(x,range){ | |
sum_na <- sum(is.na(x)) | |
rand <- sample(range,sum_na,replace=TRUE) | |
x[is.na(x)] <- rand | |
x | |
} | |
# replace_na_range(c(1,NA,2,3,1,NA,2),c(1,2,3)) | |
#' Convert single-code column(s) into "multiple choice" formats - fill data with 1s | |
#' This function was orignally used in the context of a 'Select Any' question, where the outputs were created such that each 'cell' value represents the statement, rather than having the columns represent the statements. | |
#' tcol refers to the number of columns to create. | |
#' 'Cell' values must be in numeric. | |
#' Version with select incorporated | |
superspread <- function(df,tcol,select_helpers){ | |
df_original <- df | |
df <- dplyr::select(df,select_helpers) | |
frameNA <-matrix(NA,nrow(df),tcol) %>% as.data.frame() | |
names(frameNA) <-seq(1,tcol) | |
for (i in 1:tcol){ | |
frameNA[,i] <- as.numeric(apply(df, 1, function(x) ifelse(i %in% x, 1, 0))) | |
} | |
cbind(df_original,frameNA) | |
} | |
#' Convert single-code column(s) into "multiple choice" formats, filling data from a target column | |
superspread_fill <- function(df,tcol,select_helpers,target_col){ | |
df_original <- df | |
df <- dplyr::select(df,select_helpers) | |
dft <- dplyr::select(df_original,target_col) | |
frameNA <-matrix(NA,nrow(df),tcol) %>% as.data.frame() | |
names(frameNA) <-seq(1,tcol) | |
for (i in 1:tcol){ # For each column | |
frameNA[,i] <- as.numeric(sapply(1:nrow(df), function(x) ifelse(i %in% df[x,], | |
sapply(x,function(y) dft[y,1]), | |
0))) | |
} | |
cbind(df,frameNA) | |
} | |
# data.frame(a=c(1,2,4,5,6,7,8),b=c(244,333,434,453,123,123,435)) %>% superspread_fill(8,"a","b") | |
#' Convert single-code column(s) into "multiple choice" formats, filling data with count rather than 1s | |
superspread_count <- function(df,tcol,select_helpers){ | |
df_original <- df | |
df <- select(df,select_helpers) | |
frameNA <-matrix(NA,nrow(df),tcol) %>% as.data.frame() | |
names(frameNA) <-seq(1,tcol) | |
for (i in 1:tcol){ | |
frameNA[,i] <- as.numeric(apply(df, 1, function(x) ifelse(i %in% x, sum(i==x,na.rm = TRUE), 0))) # count if 'column value' exists in the matrix | |
} | |
names(frameNA) <- paste0("x",names(frameNA)) | |
cbind(df_original,frameNA) | |
} | |
#' Serialise a SPSS file to RDS | |
#' Old file name is retained and saved in the same location as RDS if export is unspecified | |
sav_to_rds<-function(import,export=""){ | |
p <- haven::read_sav(import) | |
export<-ifelse(export=="", | |
str_replace_all(import,".sav",".rds"), | |
export) | |
saveRDS(p,file=export) | |
} | |
# Return the n number of characters from the right | |
# Wrapper convenience function | |
str_right <- function(x,n){ | |
nx <- n-1 | |
str_sub(x,nchar(x)-nx,nchar(x)) | |
} | |
#' Function that returns TRUE/FALSE if value exists in x, but returns NA if x consists entirely of NAs | |
any_x <- function(x, value){ | |
if(all(is.na(x))){ | |
return(NA) | |
} else(any(x %in% value,na.rm=TRUE)) | |
} | |
# any_x(c(1,0,1),1) # TRUE | |
# any_x(c(1,NA,1),1) # TRUE | |
# any_x(c(0,0,NA),1) # FALSE | |
# any_x(c(NA,NA,NA),1) # NA | |
#' Return all ngram combinations with a character vector as input | |
tokenise_ngram <- function(x, n = 2, collapse = " ") { | |
unlist(lapply(NLP::ngrams(words(x), n), paste, collapse = collapse), use.names = FALSE) | |
} | |
# tokenise_ngram("a c b e f") | |
# [1] "a c" "c b" "b e" "e f" | |
#' Remove NA from lists | |
na.omit.list <-function(y) { | |
return(y[!sapply(y, function(x) | |
all(is.na(x)))]) | |
} | |
#' Return value labels as tibble | |
#' Only works if variable as value label | |
#' @param variable Enter variable name as string | |
extract_vallab <- function(x,variable){ | |
if(is.null(attr(x[[variable]],'labels'))){ | |
return (NULL) # Return NULL if no value labels found | |
} else { | |
x[[variable]] %>% | |
attr('labels') %>% | |
stack() %>% | |
dplyr::as_tibble() %>% | |
`names<-`(c("id",variable)) %>% | |
mutate_at(vars(variable),funs(as.character)) | |
} | |
} | |
#' Replace x values with corresponding values using a key | |
#' | |
#' @param x a string vector | |
#' @param table lookup table | |
#' @param index character string of the key/index column used for matching | |
#' @param column the column to return from the lookup table. Default is the second column | |
#' @example iris %>% | |
#' .$Petal.Width %>% | |
#' look_up(tibble(var=0.2,return="zero-point-two")) | |
look_up <- function(x,table,index="var",column=2){ | |
sapply(1:length(x), | |
function(i){ | |
if(is.na(x[[i]])){ | |
return("") | |
} else { | |
unlist(table[table[[index]]==x[[i]],column]) | |
} | |
}) %>% as.character() -> matched_x | |
sapply(1:length(x), | |
function(i){ | |
if(matched_x[[i]]=="character(0)"){ | |
x[[i]] | |
} else { | |
matched_x[[i]] | |
} | |
}) %>% ifelse(.=="",NA_character_,.) | |
} | |
#' Create a Data Dictionary from a data frame with Variable and Value Labels | |
#' Exported objects from SPSS | |
#' | |
#' @param x data frame | |
#' @param max_char Maximum character allowed on Variable and Value label columns. | |
data_dict <-function(x,max_char=32760){ | |
suppressWarnings( | |
lapply(1:ncol(x), | |
function(i){ | |
namex <- names(x)[[i]] | |
purrr::pluck(x,namex,purrr::attr_getter("label")) %>% as.character() -> label_var | |
purrr::pluck(x,namex,purrr::attr_getter("labels")) %>% | |
names() %>% | |
paste(collapse="; ") %>% | |
stringr::str_trunc(max_char)-> label_val | |
x[[namex]] %>% | |
unique() %>% | |
paste(collapse="; ") %>% | |
stringr::str_trunc(max_char)-> val_val | |
ifelse(is_null(label_var),"",label_var)->label_var | |
ifelse(is_null(label_val),"",label_val)->label_val | |
data.frame(var=namex,label_var,label_val,value=val_val) | |
}) %>% dplyr::bind_rows() | |
) | |
} | |
#' Set variable labels | |
#' Pipe-workflow optimised | |
#' | |
#' @param x Variable to assign variable labels to | |
#' @param variable_label String vector to be assigned as the variable label | |
#' @example | |
#'library(tidyverse) | |
#'tibble(RESPID=1:1000, | |
#' Q1=sample(c(0,1,2),1000,replace=TRUE), | |
#' Q2=sample(c(0,1),1000,replace=TRUE))-> df | |
#' df %>% | |
#' mutate_at("Q1",funs(set_varl(.,"Which of the following groups do you fall into?"))) %>% | |
#' .$Q1 | |
set_varl <- function(x,variable_label){ | |
attr(x,'label') <- variable_label | |
x | |
} | |
#' Set value labels | |
#' Pipe-workflow optimised | |
#' | |
#' | |
#' @param x Variable to assign value labels to | |
#' @param value_labels Named character vector to be assigned as value labels | |
#' @example | |
#' | |
#' library(tidyverse) | |
#' tibble(RESPID=1:1000, | |
#' Q1=sample(c(0,1,2),1000,replace=TRUE), | |
#' Q2=sample(c(0,1),1000,replace=TRUE))-> df | |
#' df %>% | |
#' mutate_at("Q2",funs(set_varl(.,"What is your answer to this yes/no question?"))) %>% | |
#' mutate_at("Q2",funs(set_vall(.,c("No"=0,"Yes"=1)))) %>% | |
#' .$Q2 %>% attributes() | |
set_vall <- function(x,value_labels){ | |
haven::labelled(x,value_labels) | |
} | |
#' Safe way to collapse rows in case they have duplicates | |
#' | |
#' @param x Variables to collapse. Designed to be use with mutate_at() or mutate_all() | |
#' @param numeric Set to TRUE to convert result to numeric value | |
collapse_rows <- function(x,numeric=TRUE){ | |
de_NA_x<- x[!is.na(x)] | |
if(length(de_NA_x)>1){ | |
stop("Error: rows cannot contain more than one non-NA value") | |
} else if(numeric==TRUE){ | |
as.numeric(paste0(de_NA_x,collapse=";")) | |
} else if(numeric==FALSE){ | |
paste0(de_NA_x,collapse=";") | |
} | |
} | |
# collapse_rows(c(NA,1)) | |
# collapse_rows(c(NA,1,2)) | |
#' Convert character variable to labelled integer variable | |
#' | |
char_to_lab <- function(x){ | |
unique_x <- unique(x) | |
gen_df <- tibble::tibble(id=1:length(unique_x), | |
var=as.character(unique_x)) | |
value_labels <- unlist(create_named_list(gen_df$var,gen_df$id)) | |
sapply(1:length(x),function(i){ | |
if(is.na(x[[i]])){ | |
NA | |
} else { | |
gen_df[gen_df$var==x[[i]],1] %>% drop_na() %>% unlist() | |
} | |
}) -> outcome # Convert current values to assigned numbers | |
return(set_vall(outcome,value_labels)) # Set value labels | |
} | |
#' Convert numeric variable to NPS variable | |
as_nps <-function(x){ | |
dplyr::case_when(x >=0 & x<=6~-100, | |
x >=7 & x<=8~0, | |
x >=9 & x<=10~100, | |
TRUE~NA_real_)-> out_x | |
labz <- c("Detractor"=-100, | |
"Passive"=0, | |
"Promoter"=100, | |
"Missing value"=NA_real_) | |
haven::labelled(out_x,labz) | |
} | |
#' Convert labelled double variable to character variable | |
#' Returns error if variable is not of class labelled double | |
#' | |
lab_to_char <- function(x){ | |
if(!haven::is.labelled(x)){ | |
stop("Variable is not a labelled double") | |
} else if(haven::is.labelled(x)){ | |
names(attr(x,'labels')) -> id | |
attr(x,'labels')-> var | |
tibble::tibble(id=id,var=var)-> key_tab | |
look_up(x,key_tab,column = 1) | |
} | |
} | |
#' Convert ordinal variables into binary variables by "boxing" | |
#' E.g. create Top Two Box variable from a 7-point agreement scale question | |
#' @param x Variable to be passed through | |
#' @param which Character string to specify which end of spectrum to take values - valid inputs are "top" and "bottom" | |
#' @param number Number to take values from | |
#' @example | |
#' box_it(sample(1:10,100,replace=TRUE)) | |
#' Converted to binary variable where 9, 10 are selected | |
box_it <-function(x,which="top",number=2){ | |
max_x <- max(x,na.rm = TRUE) | |
min_x <- min(x,na.rm = TRUE) | |
if(which=="top"){ | |
valid_range <- (max_x - number + 1):max_x | |
} else if(which=="bottom"){ | |
valid_range <- min_x:(min_x + number - 1) | |
} | |
dplyr::case_when(x %in% valid_range~1, | |
!(x %in% valid_range)~0, | |
TRUE~NA_real_) -> output | |
attr(output,'labels') <- c("Selected"=1,"Not selected"=0,"Missing value"=NA_real_) | |
range_print <- paste0(valid_range,collapse=", ") | |
message(paste0("Converted to binary variable where ",range_print," are selected")) | |
return(output) | |
} | |
#' Convert correlation matrix into a tidy data frame | |
#' Use label_table as an input | |
#' Identify matching id | |
#' | |
#' @param cor_m Correlation matrix in the form of an output from cor() | |
#' @param label_table A two-column table with one column used for matching and the other for returning labels. | |
#' @param id Use this argument to specify a string to refers to the matching / id column in the label_table. | |
#' | |
cor_to_df <- function(cor_m, label_table, id){ | |
if(ncol(label_table)!=2){ | |
stop("Matching table must have an id column and a return column.") | |
} else { | |
matched_lab <- names(label_table)[names(label_table)!=id] | |
cor_m %>% | |
as.data.frame() %>% | |
tibble::rownames_to_column("cor_matrix") %>% | |
left_join(label_table,by=c("cor_matrix"=id)) %>% | |
select(cor_matrix,matched_lab,everything())-> output_df | |
labels <- output_df[[matched_lab]] | |
names(output_df) <- c("cor_matrix",matched_lab,labels) | |
return(output_df) | |
} | |
} | |
#' Convert as percent (string) | |
#' Convert a numeric value into a string with percentage sign. | |
as_percent <- function(num, rounding = 0){ | |
paste0(round(num * 100, rounding),"%") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment