Skip to content

Instantly share code, notes, and snippets.

@martinctc
Last active March 5, 2019 15:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save martinctc/53444d4847055a870417f5b279f1b8bd to your computer and use it in GitHub Desktop.
Save martinctc/53444d4847055a870417f5b279f1b8bd to your computer and use it in GitHub Desktop.
[surveyr] Useful functions for tidy survey analysis. Under development... #R
#### Source this from github using following link ####
# devtools::source_gist("https://gist.github.com/martinctc/53444d4847055a870417f5b279f1b8bd", filename="surveyr.R")
#' Calculate weight table using a single source/target column
#' (indicating what proportions need to be multiplied to achieve weighted proportions)
#'
#' @param x Data frame containing columns to be weighted
#' @param target bare (unquoted) column name of the variable used for creating weight
#' @param weights vector containing the proportions of what each value of the target column should take
calc_weights <-
function(x, target, weights){
require(broom)
targ <- eval(substitute(target),x)
targ %>%
table() %>%
tidy() %>%
mutate(Freq2= Freq / sum(Freq)) %>%
cbind(weights) %>%
mutate(WFreq = Freq2 / weights) %>%
select("Category"=1,"Weights"=5) %>%
as.tibble()
}
#' Attach weights to data frame
#'
#' Add weights to the original data frame, to be used together with calc_weights()
#' This function is not necessary if a 'weights' column is already present in data set
#'
#' @param x Data frame to attach required weights
#' @param target Column from which the weights are calculated from.
#' @param weight_table Output produced from calc_weights().
add_weights <- function(x, target, weight_table){
targ <- enquo(target)
temp <- create_named_list(weight_table$Category, weight_table$Weights)
x %>%
mutate(WEIGHT=!!targ) %>%
mutate_at(vars(WEIGHT),funs(recode(.,!!!temp)))
}
#' Create a named list object with two vectors
#' Used for constructing arguments for add_weights().
#'
#' @param x Vector containing character strings, e.g. "A"
#' @param y Vector containing numeric values, e.g. 1, 3
create_named_list <- function(x, y){
quostr <- function(list){
options(useFancyQuotes = FALSE)
list2 <- lapply(list,function(x)toString(dQuote(x)))
list2
}
weave <- function(list1, list2, num){
paste0(list1[num],"=",list2[num])
}
len <- length(x)
temp <- sapply(1:len,weave, list1=quostr(x),list2=y)
temp <- paste(temp, collapse = ", ")
temp <- paste0("c(",temp,")")
temp2 <- eval(parse(text=temp))
temp2
}
#' Create cross table with an option to add weights (single-choice)
#'
#' @param x Data frame with the required columns.
#' @param downbreak The main question to analyse by.
#' @param crossbreak The 'banner' by which the downbreak would be split by.
#' @param weights Pass the weight column into this argument - defaults to NULL for unweighted tables.
xtab <-
function(x,downbreak,crossbreak,weights=NULL){
db.q <- enquo(downbreak)
cb.q <- enquo(crossbreak)
cb.string <- deparse(substitute(crossbreak)) #Convert to string
if(deparse(substitute(weights))!="NULL"){
#If weights are used
wts.q <- enquo(weights)
x %>%
count(!!cb.q,!!db.q,!!wts.q) %>% #Counts of all
spread_(cb.string, "n", fill=0) %>% #Using standard evaluation
mutate(Total=rowSums(select(.,-!!db.q,-!!wts.q)))%>% #Create Total column
mutate_at(vars(-!!db.q,-!!wts.q),
funs(.*!!wts.q)) %>%
mutate_at(vars(-!!db.q,-!!wts.q),
funs(prop.of.itself)) %>%
#Create weighted proportions here
select(-!!wts.q) %>%
group_by(!!db.q) %>%
summarise_all(sum)
} else {
#If no weights applied
x %>%
count(!!cb.q,!!db.q) %>%
spread_(cb.string, "n", fill=0) %>%
mutate(Total=rowSums(select(.,-!!db.q)))%>%
mutate_at(vars(-!!db.q),
funs(prop.of.itself))
}
}
#'Returns the proportions of each value in a vector as the sum of all values in the vector
#'
#'@param x Value in vector
prop.of.itself <- function(x){x / sum(x)} #Return proportion of vector
#' Create single-variable table with an option to add weights (single-choice)
#'
#' @param x Data frame with the required columns.
#' @param downbreak The main question to analyse by.
#' @param weights Pass the weight column into this argument - defaults to NULL for unweighted tables.
tab <-
function(x,downbreak, weights=NULL){
db.q <- enquo(downbreak)
wts.q <- enquo(weights)
t.string <- deparse(substitute(target))
if(deparse(substitute(weights))!="NULL"){
x %>%
count(!!db.q, !!wts.q) %>%
mutate_at(vars(n), funs(.*!!wts.q))%>%
mutate_at(vars(n), funs(prop.of.itself))%>%
group_by(!!db.q) %>%
summarise(sum = sum(n))
} else {
#If no weights are applied
x %>% count(!!db.q) %>%
mutate_at(vars(-!!db.q),funs(prop.of.itself))
}
}
#' Convert a Likert scale from one scale to another
#'
#' This is used for converting data, for instance, from a 6-point scale to a 5-point scale.
#' The scale "dimensions" are specified in the function argument, and the function converts the numeric vector that is passed through.
#'
#' @param x Numeric vector to be passed through.
#' @param top.x Top value of the original scale. This would be 6 on a 0-6 scale.
#' @param bot.x Bottom value of the original scale. This would be 0 on a 0-6 scale.
#' @param top.y Top value of the new/target scale. This would be 5 on a 0-5 scale.
#' @param bot.y Bottom value of the new/target scale. This would be 0 on a 0-5 scale.
#' @keywords likert scale
#' @examples
#' data <-c(5, 4, 3, 2, 1)
#'likert.scaler(data,5,0,10,0) #5-point scale to 10-point scale
#'[1] 10 8 6 4 2
likert.convert <- function(x, top.x, bot.x, top.y, bot.y){
y <- ((top.y-bot.y)*(x-bot.x)/(top.x-bot.x))+bot.y
y
}
#' Reverse a Likert scale
#'
#' Reverse a Likert scale such that on a 0-10 scale, 10 becomes 0 and 0 becomes 10.
#'
#' @param x Numeric vector
#' @param top Top value of the scale for the variable, e.g. 10 for a 0-10 scale.
#' @param top Bottom value of the scale for the variable, e.g. 0 for a 0-10 scale.
#' @keywords likert scale
#' @examples
#' data <-c(5, 4, 3, 2, 6)
#' likert.reverse(data, 6, 0)
#' [1] 1 2 3 4 0
likert.reverse <- function(x,top,bottom){
total <-top+bottom
x <- total-x
x
}
#' Max-Min Scaling Function
#'
#' This function allows you to scale vectors or an entire data frame using the max-min scaling method, returning a vector
#' Suitable to be used with pipes
#'
#' @param x Pass a vector or the required columns of a data frame through this argument.
#' @keywords max-min
#' @export
#' @examples
#' rand.data <-cbind(sample(1000,234:697),sample(1000,234:697)) %>% as.data.frame()
#' maxmin(rand.data)
#'
#' rand.data <-sample(1000,234:677)
#' maxmin(rand.data)
maxmin <- function(x, na.rm=TRUE){
if(is.vector(x)){
maxs <- max(x,na.rm=na.rm)
mins <- min(x,na.rm=na.rm)
as.numeric(scale(x,center=mins,scale=maxs-mins))
} else {
maxs <- apply(x, 2,max,na.rm=na.rm)
mins <- apply(x, 2,min,na.rm=na.rm)
as.numeric(scale(x, center = mins, scale = maxs - mins))
}
}
#'Split the data into a simple training and testing set
#'
#'@param x Pass your data frame or matrix here.
#'@param part A numeric value between 0 and 1 to represent the proportion of the whole data you want to use as the training set.
#'@keywords train, test
#'@examples
#'x <-as.data.frame(matrix(1:5000,250,20))
#'x.train <- split.tt(x,.7)$train
#'x.test <- split.tt(x,.7)$test
#'
#'dim(x.train)
#'dim(x.test)
split.tt <-function(x, part){
rowz <- nrow(x)
samp <- sample(seq_len(rowz),floor(rowz * part))
output <-list("train"=x[samp,],"test"=x[-samp,])
output
}
#' Translate function using transltr.org API
#'
#' This function allows you to translate character strings through the transltr.org API.
#' @param what What text string to translate
#' @param from What language to translate from. Defaults to NULL
#' @param to What language to translate to. Defaults to "en"
#' @keywords translate
#' @export
#' @examples
#' trans.t(what="Blanc",from="fr",to="en")
trans.t <- function(what=NULL,from=NULL,to="en"){
url <- paste0("http://transltr.org/api/translate?text=",URLencode(what),
"&to=",to,"&from=",from)
output<-jsonlite::fromJSON(txt=url)
output$translationText
}
#' Translate function using the Microsoft Translator API
#'
#' This function allows you to translate character strings through the Microsoft Translator API.
#' To use this function, you will need to first acquire an access token and pass it through the key argument.
#' @param what What text string to translate
#' @param from What language to translate from. Defaults to NULL
#' @param to What language to translate to. Defaults to "en"
#' @keywords translate
#' @export
#' @examples
#' trans.m(what="Blanc",from="fr",to="en",key="F023kljadfoilkjlkj")
trans.m <- function(what, from=NULL,to="en",key=MyKey){
token_url <- "https://api.cognitive.microsoft.com/sts/v1.0/issueToken"
token_body <- list('Subscription-Key'=key)
resp<-httr::POST(token_url,query=token_body)
tok_string <- resp %>% httr::content(type="text",encoding='utf-8')
w.token <- paste("Bearer",tok_string)
transurl <- "http://api.microsofttranslator.com/V2/Http.svc/Translate"
tranz <- httr::GET(transurl,
query=list('appId'=w.token,
text=what,from=from,
to=to))
tranz %>% xmlParse %>% xmlToList()
}
#' Returns sum of matched instances of pattern in string
str_count <- function(string,pattern){
sum(str_detect(string, pattern),na.rm=TRUE)
}
#' str_count function with dual condition
#' Returns sum of matched instances of both pattern1 and pattern2 in string
#' Both pattern1 and pattern2 must be TRUE to be counted
str_count_2 <- function(string,pattern1,pattern2){
sum(str_detect(string, pattern1) & str_detect(string, pattern2),
na.rm=TRUE)
}
#' Count the number of words in text string
#'
#' @param string Pass text string here.
#' @param pseudo Determines whether groups of special characters are matched. Defaults to FALSE (not matched)
#'
nwords <- function(string, pseudo=F){
ifelse( pseudo,
pattern <- "\\S+",
pattern <- "[[:alpha:]]+"
)
str_count(string, pattern)
}
#' Split long sentences into two lines
#'
#' @param string Pass text string here.
#' @param words The maximum number of words allowed in the first line
#' @examples
#' string_liner("This is a lovely cup of tea",2)
#' string_liner("This is a lovely cup of tea",3)
str_next_line <-
function(string, words) {
nwords(string) -> p
words_list <- str_split(string, " ") %>%
unlist()
words_plus <- words + 1
if (p > words_plus) {
material_1 <- paste0(words_list[1:words], collapse = " ") %>% unlist()
material_2 <- "\n"
material_3 <-paste0(words_list[words_plus:p], collapse = " ") %>% unlist()
paste(material_1, material_2, material_3, collapse = " ") %>%
str_replace_all("NA","") %>%
str_trim()
} else if (p == words_plus) {
material_1 <- paste0(words_list[1:words], collapse = " ") %>% unlist()
material_2 <- "\n"
material_3 <- paste0(words_list[p], collapse = " ") %>% unlist()
paste(material_1, material_2, material_3, collapse = " ") %>%
str_replace_all("NA","") %>%
str_trim()
} else if (p <= words){
string
}
}
#' Returning a data frame summarising the counts and percentages of row-by-row identical values across the columns of a data frame.
#' Produce a data frame of all columns on whether they match values of target column
#'
#' @param df Pass a data frame containing some columns including a "target" column
#' @param target Main column that you want to compare with other columns - use bare / unquoted reference.
match_counter <- function(df,target){
arguments <- as.list(match.call())
targ <- deparse(substitute(target)) #variable to string
df %>%
mutate_all(funs(.==eval(parse(text=targ)))) %>%
colSums(na.rm = TRUE) %>% data.frame() %>%
rownames_to_column() -> matches
names(matches)[2] <- "counts" #rename column as "counts"
matches$counts[matches$rowname==targ] -> periods #Count number of periods
matches %>% mutate(prop=counts/periods) %>% #Create proportion column
select("Variable"=rowname,"Counts"=counts, "Percentage"=prop)
}
#' Create file name with time stamp
#'
#' @param main The main file name to be used.
#' @param extension The file extension to be used, e.g. ".csv"
#' @examples
#' timed_fn("Q15. ",".xlsx")
timed_fn <- function(main,extension){
Sys.time() %>%
stringr::str_replace_all("[[:punct:]]","") %>%
stringr::str_replace_all(" ","_") -> xtime
paste(main,xtime,extension)
}
#' Saves or overwrites a worksheet to an Excel Workbook in a form compatible with the pipe style - using the openxlsx package.
#'
#' @param data Data to be written.
#' @param workbook Workbook object created with openxlsx::createWorkbook().
#' @param Worksheet Character string containing name of the worksheet.
#' @examples
#' ## Accompanying functions
#' library(openxlsx)
#' wb <- createWorkbook()
#' saveWorkbook(wb,filename,overwrite=TRUE)
write_to_ws <- function(data, workbook, worksheet){
if (worksheet %in% names(workbook)) {
writeData(workbook, worksheet, data)
}
else{
addWorksheet(workbook, worksheet)
writeData(workbook, worksheet, data)
}
}
#' Clean strings so that they can be used as variable names or column names
#'
#' @param string Character string to be "cleaned".
#' @param treat_dups Set to FALSE to allow duplication of strings. Defaults to TRUE
clean_strings <- function(string,treat_dups=TRUE){
new_string <- string %>%
gsub("'", "", .) %>%
gsub("\"","", .) %>%
gsub("%", "percent", .) %>%
gsub("^[ ]+","", .) %>%
make.names(.) %>%
gsub("[.]+", "_", .) %>%
gsub("[_]+", "_", .) %>%
tolower(.) %>%
gsub("_$", "",.)
dupe_count <- sapply(1:length(new_string), function(i) {
sum(new_string[i] == new_string[1:i])
})
if(treat_dups==TRUE){
new_string[dupe_count > 1] <- paste(new_string[dupe_count >
1], dupe_count[dupe_count > 1], sep = "_")
return(new_string)
} else if(treat_dups==FALSE){
return(new_string)
}
}
#' Replace nth occurring value in a string
#' A more refined replacement function than str_replace() and str_replace_all()
#'
#' @param x Pass string character
#' @param pattern String containing characters to match
#' @param replacement String containing to replace
#' @param n Nth term to be replaced
str_replace_nth <- function(x, pattern, replacement, n) {
g <- gregexpr(pattern, x)[[1]][n]
s <- scan(text = gsub("[()]", "", pattern),
sep = "|",
what = "")
substr(x, g, g) <- replacement[match(substr(x, g, g), s)]
x
}
#' Append an item to a list dynamically (pipe-optimised)
#'
#' Function directly appends item to list in Global Environment
#' @param x An object to append to list, e.g. vector, data frame.
#' @param list_x Target list to append object to.
#' @param name Specify a character string for the name of the list. Defaults to blank
#'
#' @example
#' a_list <- list(NULL)
#' append_to_list(iris,a_list,"iris")
append_to_list <- function(x, list_x, name=""){
temp <- deparse(substitute(list_x))
list_len <- length(list_x)
list_len1 <- list_len + 1
if(is.null(list_x[[1]])){ # Treatment if list is empty
list_x[[1]] <- x
names(list_x)[1] <- name
assign(temp,list_x,envir = .GlobalEnv)
} else { # Treatment if list has already got at least one object
list_x[[list_len1]] <- x
names(list_x)[list_len1] <- name
assign(temp,list_x,envir = .GlobalEnv)
}
}
#' Calculate percentage change of a vector relative to a lag k
#'
#' @param x A vector to be passed through
#' @param lag The number of lags used, defaults to 1
pc_change <-function(x,lag=1){
base <- lag(x,lag) # base to be divided by
c(rep(NA,lag),diff(x,lag))/base
}
#' Copy a data frame to clipboard for pasting in Excel
#' Pipe optimised
#' @param expand Add number to manually expand clipboard size
#' @param quietly Set this to TRUE to not print data frame on console
copy_df <-function(x,row.names=FALSE,col.names=TRUE,expand="",quietly=FALSE,...) {
expand_x <- stringr::str_remove_all(expand,"-")
if(expand==""){
write.table(x,paste0("clipboard",expand),sep="\t",row.names=row.names,col.names=col.names,...)
} else {
expand_x <- paste0("-",expand_x)
write.table(x,paste0("clipboard",expand_x),sep="\t",row.names=row.names,col.names=col.names,...)
}
if(quietly==FALSE) print(x)
}
#' Remove all columns which contain only NAs
#' Pipe optimised
#' @param df Data frame to be passed through
na_remover <- function(df){
Filter(function(x)!all(is.na(x)),df)
}
#' Remove all columns which contain only zeros
#' Pipe optimised
#' @param df Data frame to be passed through
zero_remover <- function(df){
Filter(function(x)!all(x==0),df)
}
#' Set working directory - for personal use only
set_wd <- function(path,where_you_at="office"){
if(where_you_at=="office"){
setwd(path)
} else if(where_you_at=="home"){
setwd(stringr::str_replace_all(path,".Chan",""))
}
}
#' Sum rowwise, returning a dataframe with a new column created
#'
#' @param df Data frame to be passed through
#' @param col_name String input for the name of the new column
#' @param condition dplyr-based selection arguments in dplyr::select
#' @param na_rm Logical - whether to remove NAs when calculatng sum. Defaults to TRUE
sum_rowwise <- function(df,col_name="RowSum",condition,na_rm=TRUE){
df %>%
mutate(!!sym(col_name):=
apply(
select(.,condition),
1,sum, na.rm=na_rm))
}
# sum_rowwise(df_nbenefits,"n_benefits",starts_with("Q120"))
#' Conditional sum rowwise, returning a dataframe with a new column created
#' Sum up cell values if meeting criteria
#'
#' @param df Data frame to be passed through
#' @param col_name String input for the name of the new column
#' @param condition vector containing values to match in selected range in rows
#' @param select_helpers dplyr-based selection arguments in dplyr::select
#' @param na_rm Logical - whether to remove NAs when calculatng sum. Defaults to TRUE
sum_rowwise_c <- function(df,col_name="RowSum",
select_helpers,
condition,
na_rm=TRUE){
df %>%
mutate(!!sym(col_name):=
apply(
select(.,select_helpers),
1,function(x)sum(x %in% condition, na.rm=na_rm)))
}
# sum_rowwise_c(df_nbenefits,"n_benefits",c(1,12,3),starts_with("Q120"))
#' Convenience function to apply the mean function rowwise, using select helpers from tidyverse
mean_rowwise <- function(df,col_name="RowSum",
select_helpers,
na_rm=TRUE){
df %>%
mutate(!!sym(col_name):=
apply(
select(.,select_helpers),
1,function(x)mean(x, na.rm=na_rm)))
}
#' Convert a numeric into a quantile categorical variable, labelled by lower and upper bounds of quantiles (string)
#' @param dat Vector, column, or numeric variable to be passed through
labelled_quantile <- function(dat){
pt1 <- quantile(dat, probs = seq(0, 1, by = 0.2), type = 7)
pt2 <- unique(as.data.frame(pt1), fromLast = TRUE)
paste0(pt2[1,],"_TO_",pt2[2,]) -> q1_lab
paste0(pt2[2,],"_TO_",pt2[3,]) -> q2_lab
paste0(pt2[3,],"_TO_",pt2[4,]) -> q3_lab
paste0(pt2[4,],"_TO_",pt2[5,]) -> q4_lab
paste0(pt2[5,],"_UP") -> q5_lab
pt3 <- rownames(pt2)
pt4 <- as.integer(strsplit(pt3, "%"))
datp <- pt4[as.integer(.bincode(dat, c(0, pt2$pt1), right=FALSE,include.lowest=TRUE))]
datp
case_when(datp==0~q1_lab,
datp==20~q1_lab,
datp==40~q2_lab,
datp==60~q3_lab,
datp==80~q4_lab,
datp==100~q5_lab)
}
#' CAGR Calculator
#' http://www.investopedia.com/terms/c/cagr.asp
CAGR <- function(value.beginning, value.ending, number.of.periods){
((value.ending / value.beginning) ^ (1 / number.of.periods)) - 1
}
# tibdf <- tibble(company = rep(c("ABC", "CDE"), each = 5),
# year = rep(2000:2004, 2),
# variable = rep("revenue", 10),
# data = c(10, 15, 12, 25, 30, 5, 8, 17, 9, 34))
#
# tibdf %>%
# arrange(company,year) %>%
# group_by(company) %>%
# mutate(change=data-lag(data),
# CAGR=CAGR(lag(data,2),data,2))
#' Capitalise all letters
#' @param x String, or a character vector
cap_all <- function(string) {
gsub("(^|[[:space:]])([[:alpha:]])", "\\1\\U\\2", string, perl=TRUE)
}
#' Iteratively filter different columns by the same criteria to summarise a single column by mean
#' To be developed for sum
iterative_mean <- function(df,equals,select_helpers,mean_var){
df %>% select(select_helpers) %>% names() -> var_list
mean_var_str <- deparse(substitute(mean_var))
mean_var <- enquo(mean_var)
lapply(1:length(var_list),function(i){
df %>%
filter(!!sym(var_list[[i]]) %in% equals) %>%
summarise(!!sym(mean_var_str):=mean(!!mean_var,na.rm=TRUE))
}) %>% bind_rows() -> p
cbind(var_list,p)
}
#' Convert numeric variable into categorical variable
#' Essentially a wrapper around the cut() function
#' @param x vector to be passed through
#' @param break points for the categories. Both the top and bottom breaks must be provided.
categorise <- function(x, breaks){
x %>%
cut(breaks=breaks,
# labels=letters[1:(length(breaks)-1)],
include.lowest=TRUE,
right=TRUE) %>%
unlist() %>% as.character()
}
# seq(10) %>% categorise(breaks=c(0,3,10))
#' Replace NAs randomly from a selected range with replacement
#' @param range Vector of values to replace NA with
replace_na_range <- function(x,range){
sum_na <- sum(is.na(x))
rand <- sample(range,sum_na,replace=TRUE)
x[is.na(x)] <- rand
x
}
# replace_na_range(c(1,NA,2,3,1,NA,2),c(1,2,3))
#' Convert single-code column(s) into "multiple choice" formats - fill data with 1s
#' This function was orignally used in the context of a 'Select Any' question, where the outputs were created such that each 'cell' value represents the statement, rather than having the columns represent the statements.
#' tcol refers to the number of columns to create.
#' 'Cell' values must be in numeric.
#' Version with select incorporated
superspread <- function(df,tcol,select_helpers){
df_original <- df
df <- dplyr::select(df,select_helpers)
frameNA <-matrix(NA,nrow(df),tcol) %>% as.data.frame()
names(frameNA) <-seq(1,tcol)
for (i in 1:tcol){
frameNA[,i] <- as.numeric(apply(df, 1, function(x) ifelse(i %in% x, 1, 0)))
}
cbind(df_original,frameNA)
}
#' Convert single-code column(s) into "multiple choice" formats, filling data from a target column
superspread_fill <- function(df,tcol,select_helpers,target_col){
df_original <- df
df <- dplyr::select(df,select_helpers)
dft <- dplyr::select(df_original,target_col)
frameNA <-matrix(NA,nrow(df),tcol) %>% as.data.frame()
names(frameNA) <-seq(1,tcol)
for (i in 1:tcol){ # For each column
frameNA[,i] <- as.numeric(sapply(1:nrow(df), function(x) ifelse(i %in% df[x,],
sapply(x,function(y) dft[y,1]),
0)))
}
cbind(df,frameNA)
}
# data.frame(a=c(1,2,4,5,6,7,8),b=c(244,333,434,453,123,123,435)) %>% superspread_fill(8,"a","b")
#' Convert single-code column(s) into "multiple choice" formats, filling data with count rather than 1s
superspread_count <- function(df,tcol,select_helpers){
df_original <- df
df <- select(df,select_helpers)
frameNA <-matrix(NA,nrow(df),tcol) %>% as.data.frame()
names(frameNA) <-seq(1,tcol)
for (i in 1:tcol){
frameNA[,i] <- as.numeric(apply(df, 1, function(x) ifelse(i %in% x, sum(i==x,na.rm = TRUE), 0))) # count if 'column value' exists in the matrix
}
names(frameNA) <- paste0("x",names(frameNA))
cbind(df_original,frameNA)
}
#' Serialise a SPSS file to RDS
#' Old file name is retained and saved in the same location as RDS if export is unspecified
sav_to_rds<-function(import,export=""){
p <- haven::read_sav(import)
export<-ifelse(export=="",
str_replace_all(import,".sav",".rds"),
export)
saveRDS(p,file=export)
}
# Return the n number of characters from the right
# Wrapper convenience function
str_right <- function(x,n){
nx <- n-1
str_sub(x,nchar(x)-nx,nchar(x))
}
#' Function that returns TRUE/FALSE if value exists in x, but returns NA if x consists entirely of NAs
any_x <- function(x, value){
if(all(is.na(x))){
return(NA)
} else(any(x %in% value,na.rm=TRUE))
}
# any_x(c(1,0,1),1) # TRUE
# any_x(c(1,NA,1),1) # TRUE
# any_x(c(0,0,NA),1) # FALSE
# any_x(c(NA,NA,NA),1) # NA
#' Return all ngram combinations with a character vector as input
tokenise_ngram <- function(x, n = 2, collapse = " ") {
unlist(lapply(NLP::ngrams(words(x), n), paste, collapse = collapse), use.names = FALSE)
}
# tokenise_ngram("a c b e f")
# [1] "a c" "c b" "b e" "e f"
#' Remove NA from lists
na.omit.list <-function(y) {
return(y[!sapply(y, function(x)
all(is.na(x)))])
}
#' Return value labels as tibble
#' Only works if variable as value label
#' @param variable Enter variable name as string
extract_vallab <- function(x,variable){
if(is.null(attr(x[[variable]],'labels'))){
return (NULL) # Return NULL if no value labels found
} else {
x[[variable]] %>%
attr('labels') %>%
stack() %>%
dplyr::as_tibble() %>%
`names<-`(c("id",variable)) %>%
mutate_at(vars(variable),funs(as.character))
}
}
#' Replace x values with corresponding values using a key
#'
#' @param x a string vector
#' @param table lookup table
#' @param index character string of the key/index column used for matching
#' @param column the column to return from the lookup table. Default is the second column
#' @example iris %>%
#' .$Petal.Width %>%
#' look_up(tibble(var=0.2,return="zero-point-two"))
look_up <- function(x,table,index="var",column=2){
sapply(1:length(x),
function(i){
if(is.na(x[[i]])){
return("")
} else {
unlist(table[table[[index]]==x[[i]],column])
}
}) %>% as.character() -> matched_x
sapply(1:length(x),
function(i){
if(matched_x[[i]]=="character(0)"){
x[[i]]
} else {
matched_x[[i]]
}
}) %>% ifelse(.=="",NA_character_,.)
}
#' Create a Data Dictionary from a data frame with Variable and Value Labels
#' Exported objects from SPSS
#'
#' @param x data frame
#' @param max_char Maximum character allowed on Variable and Value label columns.
data_dict <-function(x,max_char=32760){
suppressWarnings(
lapply(1:ncol(x),
function(i){
namex <- names(x)[[i]]
purrr::pluck(x,namex,purrr::attr_getter("label")) %>% as.character() -> label_var
purrr::pluck(x,namex,purrr::attr_getter("labels")) %>%
names() %>%
paste(collapse="; ") %>%
stringr::str_trunc(max_char)-> label_val
x[[namex]] %>%
unique() %>%
paste(collapse="; ") %>%
stringr::str_trunc(max_char)-> val_val
ifelse(is_null(label_var),"",label_var)->label_var
ifelse(is_null(label_val),"",label_val)->label_val
data.frame(var=namex,label_var,label_val,value=val_val)
}) %>% dplyr::bind_rows()
)
}
#' Set variable labels
#' Pipe-workflow optimised
#'
#' @param x Variable to assign variable labels to
#' @param variable_label String vector to be assigned as the variable label
#' @example
#'library(tidyverse)
#'tibble(RESPID=1:1000,
#' Q1=sample(c(0,1,2),1000,replace=TRUE),
#' Q2=sample(c(0,1),1000,replace=TRUE))-> df
#' df %>%
#' mutate_at("Q1",funs(set_varl(.,"Which of the following groups do you fall into?"))) %>%
#' .$Q1
set_varl <- function(x,variable_label){
attr(x,'label') <- variable_label
x
}
#' Set value labels
#' Pipe-workflow optimised
#'
#'
#' @param x Variable to assign value labels to
#' @param value_labels Named character vector to be assigned as value labels
#' @example
#'
#' library(tidyverse)
#' tibble(RESPID=1:1000,
#' Q1=sample(c(0,1,2),1000,replace=TRUE),
#' Q2=sample(c(0,1),1000,replace=TRUE))-> df
#' df %>%
#' mutate_at("Q2",funs(set_varl(.,"What is your answer to this yes/no question?"))) %>%
#' mutate_at("Q2",funs(set_vall(.,c("No"=0,"Yes"=1)))) %>%
#' .$Q2 %>% attributes()
set_vall <- function(x,value_labels){
haven::labelled(x,value_labels)
}
#' Safe way to collapse rows in case they have duplicates
#'
#' @param x Variables to collapse. Designed to be use with mutate_at() or mutate_all()
#' @param numeric Set to TRUE to convert result to numeric value
collapse_rows <- function(x,numeric=TRUE){
de_NA_x<- x[!is.na(x)]
if(length(de_NA_x)>1){
stop("Error: rows cannot contain more than one non-NA value")
} else if(numeric==TRUE){
as.numeric(paste0(de_NA_x,collapse=";"))
} else if(numeric==FALSE){
paste0(de_NA_x,collapse=";")
}
}
# collapse_rows(c(NA,1))
# collapse_rows(c(NA,1,2))
#' Convert character variable to labelled integer variable
#'
char_to_lab <- function(x){
unique_x <- unique(x)
gen_df <- tibble::tibble(id=1:length(unique_x),
var=as.character(unique_x))
value_labels <- unlist(create_named_list(gen_df$var,gen_df$id))
sapply(1:length(x),function(i){
if(is.na(x[[i]])){
NA
} else {
gen_df[gen_df$var==x[[i]],1] %>% drop_na() %>% unlist()
}
}) -> outcome # Convert current values to assigned numbers
return(set_vall(outcome,value_labels)) # Set value labels
}
#' Convert numeric variable to NPS variable
as_nps <-function(x){
dplyr::case_when(x >=0 & x<=6~-100,
x >=7 & x<=8~0,
x >=9 & x<=10~100,
TRUE~NA_real_)-> out_x
labz <- c("Detractor"=-100,
"Passive"=0,
"Promoter"=100,
"Missing value"=NA_real_)
haven::labelled(out_x,labz)
}
#' Convert labelled double variable to character variable
#' Returns error if variable is not of class labelled double
#'
lab_to_char <- function(x){
if(!haven::is.labelled(x)){
stop("Variable is not a labelled double")
} else if(haven::is.labelled(x)){
names(attr(x,'labels')) -> id
attr(x,'labels')-> var
tibble::tibble(id=id,var=var)-> key_tab
look_up(x,key_tab,column = 1)
}
}
#' Convert ordinal variables into binary variables by "boxing"
#' E.g. create Top Two Box variable from a 7-point agreement scale question
#' @param x Variable to be passed through
#' @param which Character string to specify which end of spectrum to take values - valid inputs are "top" and "bottom"
#' @param number Number to take values from
#' @example
#' box_it(sample(1:10,100,replace=TRUE))
#' Converted to binary variable where 9, 10 are selected
box_it <-function(x,which="top",number=2){
max_x <- max(x,na.rm = TRUE)
min_x <- min(x,na.rm = TRUE)
if(which=="top"){
valid_range <- (max_x - number + 1):max_x
} else if(which=="bottom"){
valid_range <- min_x:(min_x + number - 1)
}
dplyr::case_when(x %in% valid_range~1,
!(x %in% valid_range)~0,
TRUE~NA_real_) -> output
attr(output,'labels') <- c("Selected"=1,"Not selected"=0,"Missing value"=NA_real_)
range_print <- paste0(valid_range,collapse=", ")
message(paste0("Converted to binary variable where ",range_print," are selected"))
return(output)
}
#' Convert correlation matrix into a tidy data frame
#' Use label_table as an input
#' Identify matching id
#'
#' @param cor_m Correlation matrix in the form of an output from cor()
#' @param label_table A two-column table with one column used for matching and the other for returning labels.
#' @param id Use this argument to specify a string to refers to the matching / id column in the label_table.
#'
cor_to_df <- function(cor_m, label_table, id){
if(ncol(label_table)!=2){
stop("Matching table must have an id column and a return column.")
} else {
matched_lab <- names(label_table)[names(label_table)!=id]
cor_m %>%
as.data.frame() %>%
tibble::rownames_to_column("cor_matrix") %>%
left_join(label_table,by=c("cor_matrix"=id)) %>%
select(cor_matrix,matched_lab,everything())-> output_df
labels <- output_df[[matched_lab]]
names(output_df) <- c("cor_matrix",matched_lab,labels)
return(output_df)
}
}
#' Convert as percent (string)
#' Convert a numeric value into a string with percentage sign.
as_percent <- function(num, rounding = 0){
paste0(round(num * 100, rounding),"%")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment