martinctc/surveyr.R

## surveyr.R
#### Source this from github using following link ####
# devtools::source_gist("https://gist.github.com/martinctc/53444d4847055a870417f5b279f1b8bd", filename="surveyr.R")

#' Calculate weight table using a single source/target column
#' (indicating what proportions need to be multiplied to achieve weighted proportions)
#'
#' @param x Data frame containing columns to be weighted
#' @param target bare (unquoted) column name of the variable used for creating weight
#' @param weights vector containing the proportions of what each value of the target column should take

calc_weights <-
  function(x, target, weights){
    require(broom)
    targ <- eval(substitute(target),x)

    targ %>%
      table() %>%
      tidy() %>%
      mutate(Freq2= Freq / sum(Freq)) %>%
      cbind(weights) %>%
      mutate(WFreq = Freq2 / weights) %>%
      select("Category"=1,"Weights"=5) %>%
      as.tibble()
  }


#' Attach weights to data frame
#'
#' Add weights to the original data frame, to be used together with calc_weights()
#' This function is not necessary if a 'weights' column is already present in data set
#'
#' @param x Data frame to attach required weights
#' @param target Column from which the weights are calculated from.
#' @param weight_table Output produced from calc_weights().

add_weights <- function(x, target, weight_table){

  targ <- enquo(target)
  temp <- create_named_list(weight_table$Category, weight_table$Weights)

  x %>%
    mutate(WEIGHT=!!targ) %>%
    mutate_at(vars(WEIGHT),funs(recode(.,!!!temp)))
}

#' Create a named list object with two vectors
#' Used for constructing arguments for add_weights().
#'
#' @param x Vector containing character strings, e.g. "A"
#' @param y Vector containing numeric values, e.g. 1, 3

create_named_list <- function(x, y){
  quostr <- function(list){
    options(useFancyQuotes = FALSE)
    list2 <- lapply(list,function(x)toString(dQuote(x)))
    list2
  }

  weave <- function(list1, list2, num){
    paste0(list1[num],"=",list2[num])
  }

  len <- length(x)

  temp <- sapply(1:len,weave, list1=quostr(x),list2=y)
  temp <- paste(temp, collapse = ", ")
  temp <- paste0("c(",temp,")")
  temp2 <- eval(parse(text=temp))
  temp2
}


#' Create cross table with an option to add weights (single-choice)
#'
#' @param x Data frame with the required columns.
#' @param downbreak The main question to analyse by.
#' @param crossbreak The 'banner' by which the downbreak would be split by.
#' @param weights Pass the weight column into this argument - defaults to NULL for unweighted tables.

xtab <-
  function(x,downbreak,crossbreak,weights=NULL){
    db.q <- enquo(downbreak)
    cb.q <- enquo(crossbreak)
    cb.string <- deparse(substitute(crossbreak)) #Convert to string


    if(deparse(substitute(weights))!="NULL"){
      #If weights are used

      wts.q <- enquo(weights)
      x %>%
        count(!!cb.q,!!db.q,!!wts.q) %>% #Counts of all
        spread_(cb.string, "n", fill=0) %>% #Using standard evaluation
        mutate(Total=rowSums(select(.,-!!db.q,-!!wts.q)))%>% #Create Total column
        mutate_at(vars(-!!db.q,-!!wts.q),
                  funs(.*!!wts.q)) %>%
        mutate_at(vars(-!!db.q,-!!wts.q),
                  funs(prop.of.itself)) %>%
        #Create weighted proportions here
        select(-!!wts.q) %>%
        group_by(!!db.q) %>%
        summarise_all(sum)

    } else {

      #If no weights applied
      x %>%
        count(!!cb.q,!!db.q) %>%
        spread_(cb.string, "n", fill=0) %>%
        mutate(Total=rowSums(select(.,-!!db.q)))%>%
        mutate_at(vars(-!!db.q),
                  funs(prop.of.itself))
    }
  }

#'Returns the proportions of each value in a vector as the sum of all values in the vector
#'
#'@param x Value in vector

prop.of.itself <- function(x){x / sum(x)} #Return proportion of vector


#' Create single-variable table with an option to add weights (single-choice)
#'
#' @param x Data frame with the required columns.
#' @param downbreak The main question to analyse by.
#' @param weights Pass the weight column into this argument - defaults to NULL for unweighted tables.

tab <-
  function(x,downbreak, weights=NULL){
    db.q <- enquo(downbreak)
    wts.q <- enquo(weights)
    t.string <- deparse(substitute(target))

    if(deparse(substitute(weights))!="NULL"){
      x %>%
        count(!!db.q, !!wts.q) %>%
        mutate_at(vars(n), funs(.*!!wts.q))%>%
        mutate_at(vars(n), funs(prop.of.itself))%>%
        group_by(!!db.q) %>%
        summarise(sum = sum(n))

    } else {

      #If no weights are applied
      x %>% count(!!db.q) %>%
        mutate_at(vars(-!!db.q),funs(prop.of.itself))

    }
  }

#' Convert a Likert scale from one scale to another
#'
#' This is used for converting data, for instance, from a 6-point scale to a 5-point scale.
#' The scale "dimensions" are specified in the function argument, and the function converts the numeric vector that is passed through.
#'
#' @param x Numeric vector to be passed through.
#' @param top.x Top value of the original scale. This would be 6 on a 0-6 scale.
#' @param bot.x Bottom value of the original scale. This would be 0 on a 0-6 scale.
#' @param top.y Top value of the new/target scale. This would be 5 on a 0-5 scale.
#' @param bot.y Bottom value of the new/target scale. This would be 0 on a 0-5 scale.
#' @keywords likert scale
#' @examples
#' data <-c(5, 4, 3, 2, 1)
#'likert.scaler(data,5,0,10,0) #5-point scale to 10-point scale
#'[1] 10  8  6  4  2


likert.convert <- function(x, top.x, bot.x, top.y, bot.y){
    y <- ((top.y-bot.y)*(x-bot.x)/(top.x-bot.x))+bot.y
    y
}

#' Reverse a Likert scale
#'
#' Reverse a Likert scale such that on a 0-10 scale, 10 becomes 0 and 0 becomes 10.
#'
#' @param x Numeric vector
#' @param top Top value of the scale for the variable, e.g. 10 for a 0-10 scale.
#' @param top Bottom value of the scale for the variable, e.g. 0 for a 0-10 scale.
#' @keywords likert scale
#' @examples
#' data <-c(5, 4, 3, 2, 6)
#' likert.reverse(data, 6, 0)
#' [1] 1 2 3 4 0

likert.reverse <- function(x,top,bottom){
  total <-top+bottom
  x <- total-x
  x
}

#' Max-Min Scaling Function
#'
#' This function allows you to scale vectors or an entire data frame using the max-min scaling method, returning a vector
#' Suitable to be used with pipes
#'
#' @param x Pass a vector or the required columns of a data frame through this argument.
#' @keywords max-min
#' @export
#' @examples
#' rand.data <-cbind(sample(1000,234:697),sample(1000,234:697)) %>% as.data.frame()
#' maxmin(rand.data)
#'
#' rand.data <-sample(1000,234:677)
#' maxmin(rand.data)

maxmin <- function(x, na.rm=TRUE){
  if(is.vector(x)){
    maxs <- max(x,na.rm=na.rm)
    mins <- min(x,na.rm=na.rm)
    as.numeric(scale(x,center=mins,scale=maxs-mins))
  } else {
    maxs <- apply(x, 2,max,na.rm=na.rm)
    mins <- apply(x, 2,min,na.rm=na.rm)
    as.numeric(scale(x, center = mins, scale = maxs - mins))
  }
}

#'Split the data into a simple training and testing set
#'
#'@param x Pass your data frame or matrix here.
#'@param part A numeric value between 0 and 1 to represent the proportion of the whole data you want to use as the training set.
#'@keywords train, test
#'@examples
#'x <-as.data.frame(matrix(1:5000,250,20))
#'x.train <- split.tt(x,.7)$train
#'x.test <- split.tt(x,.7)$test
#'
#'dim(x.train)
#'dim(x.test)


split.tt <-function(x, part){
  rowz <- nrow(x)
  samp <- sample(seq_len(rowz),floor(rowz * part))
  output <-list("train"=x[samp,],"test"=x[-samp,])
  output
}

#' Translate function using transltr.org API
#'
#' This function allows you to translate character strings through the transltr.org API.
#' @param what What text string to translate
#' @param from What language to translate from. Defaults to NULL
#' @param to What language to translate to. Defaults to "en"
#' @keywords translate
#' @export
#' @examples
#' trans.t(what="Blanc",from="fr",to="en")

trans.t <- function(what=NULL,from=NULL,to="en"){
  url <- paste0("http://transltr.org/api/translate?text=",URLencode(what),
                "&to=",to,"&from=",from)
  output<-jsonlite::fromJSON(txt=url)
  output$translationText
}

#' Translate function using the Microsoft Translator API
#'
#' This function allows you to translate character strings through the Microsoft Translator API.
#' To use this function, you will need to first acquire an access token and pass it through the key argument.
#' @param what What text string to translate
#' @param from What language to translate from. Defaults to NULL
#' @param to What language to translate to. Defaults to "en"
#' @keywords translate
#' @export
#' @examples
#' trans.m(what="Blanc",from="fr",to="en",key="F023kljadfoilkjlkj")

trans.m <- function(what, from=NULL,to="en",key=MyKey){
  token_url <- "https://api.cognitive.microsoft.com/sts/v1.0/issueToken"
  token_body <- list('Subscription-Key'=key)
  resp<-httr::POST(token_url,query=token_body)
  tok_string <- resp %>% httr::content(type="text",encoding='utf-8')
  w.token <- paste("Bearer",tok_string)

  transurl <- "http://api.microsofttranslator.com/V2/Http.svc/Translate"
  tranz <- httr::GET(transurl,
                     query=list('appId'=w.token,
                                text=what,from=from,
                                to=to))
  tranz %>% xmlParse %>% xmlToList()
}

#' Returns sum of matched instances of pattern in string
str_count <- function(string,pattern){
  sum(str_detect(string, pattern),na.rm=TRUE)
}

#' str_count function with dual condition
#' Returns sum of matched instances of both pattern1 and pattern2 in string
#' Both pattern1 and pattern2 must be TRUE to be counted
str_count_2 <- function(string,pattern1,pattern2){
  sum(str_detect(string, pattern1) & str_detect(string, pattern2),
      na.rm=TRUE)
}

#' Count the number of words in text string
#'
#' @param string Pass text string here.
#' @param pseudo Determines whether groups of special characters are matched. Defaults to FALSE (not matched)
#'
nwords <- function(string, pseudo=F){
  ifelse( pseudo,
          pattern <- "\\S+",
          pattern <- "[[:alpha:]]+"
  )
  str_count(string, pattern)
}

#' Split long sentences into two lines
#'
#' @param string Pass text string here.
#' @param words The maximum number of words allowed in the first line
#' @examples
#' string_liner("This is a lovely cup of tea",2)
#' string_liner("This is a lovely cup of tea",3)

str_next_line <-
  function(string, words) {
    nwords(string) -> p
    words_list <- str_split(string, " ") %>%
      unlist()
    words_plus <- words + 1

    if (p > words_plus) {

      material_1 <- paste0(words_list[1:words], collapse = " ") %>% unlist()
      material_2 <- "\n"
      material_3 <-paste0(words_list[words_plus:p], collapse = " ") %>% unlist()
      paste(material_1, material_2, material_3, collapse = " ") %>%
        str_replace_all("NA","") %>%
        str_trim()

    } else if (p == words_plus) {

      material_1 <- paste0(words_list[1:words], collapse = " ") %>% unlist()
      material_2 <- "\n"
      material_3 <- paste0(words_list[p], collapse = " ") %>% unlist()
      paste(material_1, material_2, material_3, collapse = " ") %>%
        str_replace_all("NA","") %>%
        str_trim()

    } else if (p <= words){
      string
    }
  }

#' Returning a data frame summarising the counts and percentages of row-by-row identical values across the columns of a data frame.
#' Produce a data frame of all columns on whether they match values of target column
#'
#' @param df Pass a data frame containing some columns including a "target" column
#' @param target  Main column that you want to compare with other columns - use bare / unquoted reference.

 match_counter <- function(df,target){

  arguments <- as.list(match.call())
  targ <- deparse(substitute(target)) #variable to string


  df %>%
    mutate_all(funs(.==eval(parse(text=targ)))) %>%
    colSums(na.rm = TRUE) %>% data.frame() %>%
    rownames_to_column() -> matches

  names(matches)[2] <- "counts" #rename column as "counts"
  matches$counts[matches$rowname==targ] -> periods #Count number of periods
  matches %>% mutate(prop=counts/periods) %>% #Create proportion column
    select("Variable"=rowname,"Counts"=counts, "Percentage"=prop)
}

#' Create file name with time stamp
#'
#'  @param main The main file name to be used.
#'  @param extension The file extension to be used, e.g. ".csv"
#'  @examples
#'  timed_fn("Q15. ",".xlsx")


 timed_fn <- function(main,extension){
   Sys.time() %>%
     stringr::str_replace_all("[[:punct:]]","") %>%
     stringr::str_replace_all(" ","_") -> xtime

   paste(main,xtime,extension)
 }

 #' Saves or overwrites a worksheet to an Excel Workbook in a form compatible with the pipe style - using the openxlsx package.
 #'
 #' @param data Data to be written.
 #' @param workbook Workbook object created with openxlsx::createWorkbook().
 #' @param Worksheet Character string containing name of the worksheet.
 #' @examples
 #' ## Accompanying functions
 #' library(openxlsx)
 #' wb <- createWorkbook()
 #' saveWorkbook(wb,filename,overwrite=TRUE)


 write_to_ws <- function(data, workbook, worksheet){
   if (worksheet %in% names(workbook)) {
     writeData(workbook, worksheet, data)
   }
   else{
     addWorksheet(workbook, worksheet)
     writeData(workbook, worksheet, data)
   }
 }

#' Clean strings so that they can be used as variable names or column names
#'
#' @param string Character string to be "cleaned".
#' @param treat_dups Set to FALSE to allow duplication of strings. Defaults to TRUE

clean_strings <- function(string,treat_dups=TRUE){
  new_string <- string %>%
    gsub("'", "", .) %>%
    gsub("\"","", .) %>%
    gsub("%", "percent", .) %>%
    gsub("^[ ]+","", .) %>%
    make.names(.) %>%
    gsub("[.]+", "_", .) %>%
    gsub("[_]+", "_", .) %>%
    tolower(.) %>%
    gsub("_$", "",.)

  dupe_count <- sapply(1:length(new_string), function(i) {
    sum(new_string[i] == new_string[1:i])
  })

  if(treat_dups==TRUE){
    new_string[dupe_count > 1] <- paste(new_string[dupe_count >
                                                     1], dupe_count[dupe_count > 1], sep = "_")
    return(new_string)
  } else if(treat_dups==FALSE){
    return(new_string)
  }
}


#' Replace nth occurring value in a string
#' A more refined replacement function than str_replace() and str_replace_all()
#'
#' @param x Pass string character
#' @param pattern String containing characters to match
#' @param replacement String containing to replace
#' @param n Nth term to be replaced

 str_replace_nth <- function(x, pattern, replacement, n) {
   g <- gregexpr(pattern, x)[[1]][n]
   s <- scan(text = gsub("[()]", "", pattern),
             sep = "|",
             what = "")
   substr(x, g, g) <- replacement[match(substr(x, g, g), s)]
   x
 }

 #' Append an item to a list dynamically (pipe-optimised)
 #'
 #' Function directly appends item to list in Global Environment
 #' @param x An object to append to list, e.g. vector, data frame.
 #' @param list_x Target list to append object to.
 #' @param name Specify a character string for the name of the list. Defaults to blank
 #'
 #' @example
 #' a_list <- list(NULL)
 #' append_to_list(iris,a_list,"iris")
 append_to_list <- function(x, list_x, name=""){

   temp <- deparse(substitute(list_x))

   list_len <- length(list_x)
   list_len1 <- list_len + 1

   if(is.null(list_x[[1]])){ # Treatment if list is empty
     list_x[[1]] <- x
     names(list_x)[1] <- name
     assign(temp,list_x,envir = .GlobalEnv)
   } else {  # Treatment if list has already got at least one object
     list_x[[list_len1]] <- x
     names(list_x)[list_len1] <- name
     assign(temp,list_x,envir = .GlobalEnv)
   }
 }

#' Calculate percentage change of a vector relative to a lag k
#'
#' @param x A vector to be passed through
#' @param lag The number of lags used, defaults to 1

pc_change <-function(x,lag=1){
  base <- lag(x,lag) # base to be divided by
  c(rep(NA,lag),diff(x,lag))/base
}

#' Copy a data frame to clipboard for pasting in Excel
#' Pipe optimised
#' @param expand Add number to manually expand clipboard size
#' @param quietly Set this to TRUE to not print data frame on console

copy_df <-function(x,row.names=FALSE,col.names=TRUE,expand="",quietly=FALSE,...) {
  expand_x <- stringr::str_remove_all(expand,"-")
  if(expand==""){
    write.table(x,paste0("clipboard",expand),sep="\t",row.names=row.names,col.names=col.names,...)
  } else {
    expand_x <- paste0("-",expand_x)
    write.table(x,paste0("clipboard",expand_x),sep="\t",row.names=row.names,col.names=col.names,...)
  }
  if(quietly==FALSE) print(x)
}

#' Remove all columns which contain only NAs
#' Pipe optimised
#' @param df Data frame to be passed through
na_remover <- function(df){
  Filter(function(x)!all(is.na(x)),df)
}

#' Remove all columns which contain only zeros
#' Pipe optimised
#' @param df Data frame to be passed through
zero_remover <- function(df){
  Filter(function(x)!all(x==0),df)
}

#' Set working directory - for personal use only
set_wd <- function(path,where_you_at="office"){
  if(where_you_at=="office"){
    setwd(path)
  } else if(where_you_at=="home"){
    setwd(stringr::str_replace_all(path,".Chan",""))
  }
}

#' Sum rowwise, returning a dataframe with a new column created
#'
#' @param df Data frame to be passed through
#' @param col_name String input for the name of the new column
#' @param condition dplyr-based selection arguments in dplyr::select
#' @param na_rm Logical - whether to remove NAs when calculatng sum. Defaults to TRUE
sum_rowwise <- function(df,col_name="RowSum",condition,na_rm=TRUE){
  df %>%
    mutate(!!sym(col_name):=
             apply(
               select(.,condition),
               1,sum, na.rm=na_rm))
}

# sum_rowwise(df_nbenefits,"n_benefits",starts_with("Q120"))

#' Conditional sum rowwise, returning a dataframe with a new column created
#' Sum up cell values if meeting criteria
#'
#' @param df Data frame to be passed through
#' @param col_name String input for the name of the new column
#' @param condition vector containing values to match in selected range in rows
#' @param select_helpers dplyr-based selection arguments in dplyr::select
#' @param na_rm Logical - whether to remove NAs when calculatng sum. Defaults to TRUE

sum_rowwise_c <- function(df,col_name="RowSum",
                            select_helpers,
                            condition,
                            na_rm=TRUE){
  df %>%
    mutate(!!sym(col_name):=
             apply(
               select(.,select_helpers),
               1,function(x)sum(x %in% condition, na.rm=na_rm)))
}
# sum_rowwise_c(df_nbenefits,"n_benefits",c(1,12,3),starts_with("Q120"))

#' Convenience function to apply the mean function rowwise, using select helpers from tidyverse
mean_rowwise <- function(df,col_name="RowSum",
                          select_helpers,
                          na_rm=TRUE){
  df %>%
    mutate(!!sym(col_name):=
             apply(
               select(.,select_helpers),
               1,function(x)mean(x, na.rm=na_rm)))
}


#' Convert a numeric into a quantile categorical variable, labelled by lower and upper bounds of quantiles (string)
#' @param dat Vector, column, or numeric variable to be passed through
labelled_quantile <- function(dat){
  pt1 <- quantile(dat, probs = seq(0, 1, by = 0.2), type = 7)
  pt2 <- unique(as.data.frame(pt1), fromLast = TRUE)

  paste0(pt2[1,],"_TO_",pt2[2,]) -> q1_lab
  paste0(pt2[2,],"_TO_",pt2[3,]) -> q2_lab
  paste0(pt2[3,],"_TO_",pt2[4,]) -> q3_lab
  paste0(pt2[4,],"_TO_",pt2[5,]) -> q4_lab
  paste0(pt2[5,],"_UP") -> q5_lab

  pt3 <- rownames(pt2)
  pt4 <- as.integer(strsplit(pt3, "%"))
  datp <- pt4[as.integer(.bincode(dat, c(0, pt2$pt1), right=FALSE,include.lowest=TRUE))]
  datp

  case_when(datp==0~q1_lab,
            datp==20~q1_lab,
            datp==40~q2_lab,
            datp==60~q3_lab,
            datp==80~q4_lab,
            datp==100~q5_lab)
}

#' CAGR Calculator
#' http://www.investopedia.com/terms/c/cagr.asp
CAGR <- function(value.beginning, value.ending, number.of.periods){

  ((value.ending / value.beginning) ^ (1 / number.of.periods)) - 1

}

# tibdf <- tibble(company = rep(c("ABC", "CDE"), each = 5),
#                 year = rep(2000:2004, 2),
#                 variable = rep("revenue", 10),
#                 data = c(10, 15, 12, 25, 30, 5, 8, 17, 9, 34))
#
# tibdf %>%
#   arrange(company,year) %>%
#   group_by(company) %>%
#   mutate(change=data-lag(data),
#          CAGR=CAGR(lag(data,2),data,2))

#' Capitalise all letters
#' @param x String, or a character vector
cap_all <- function(string) {
	gsub("(^|[[:space:]])([[:alpha:]])", "\\1\\U\\2", string, perl=TRUE)
}

#' Iteratively filter different columns by the same criteria to summarise a single column by mean
#' To be developed for sum
iterative_mean <- function(df,equals,select_helpers,mean_var){
  df %>%  select(select_helpers) %>% names() -> var_list

  mean_var_str <- deparse(substitute(mean_var))
  mean_var <- enquo(mean_var)

  lapply(1:length(var_list),function(i){
    df %>%
      filter(!!sym(var_list[[i]]) %in% equals) %>%
      summarise(!!sym(mean_var_str):=mean(!!mean_var,na.rm=TRUE))

  }) %>% bind_rows() -> p

  cbind(var_list,p)
}

#' Convert numeric variable into categorical variable
#' Essentially a wrapper around the cut() function
#' @param x vector to be passed through
#' @param break points for the categories. Both the top and bottom breaks must be provided.
categorise <- function(x, breaks){
  x %>%
    cut(breaks=breaks,
        # labels=letters[1:(length(breaks)-1)],
        include.lowest=TRUE,
        right=TRUE) %>%
    unlist() %>% as.character()
  }

# seq(10) %>% categorise(breaks=c(0,3,10))


#' Replace NAs randomly from a selected range with replacement
#' @param range Vector of values to replace NA with
 replace_na_range <- function(x,range){
  sum_na <- sum(is.na(x))
  rand <- sample(range,sum_na,replace=TRUE)
  x[is.na(x)] <- rand
  x
}
# replace_na_range(c(1,NA,2,3,1,NA,2),c(1,2,3))

#' Convert single-code column(s) into "multiple choice" formats - fill data with 1s
#' This function was orignally used in the context of a 'Select Any' question, where the outputs were created such that each 'cell' value represents the statement, rather than having the columns represent the statements.
#' tcol refers to the number of columns to create.
#' 'Cell' values must be in numeric.
#' Version with select incorporated
superspread <- function(df,tcol,select_helpers){
  df_original <- df
  df <- dplyr::select(df,select_helpers)
  frameNA <-matrix(NA,nrow(df),tcol) %>% as.data.frame()
  names(frameNA) <-seq(1,tcol)
  for (i in 1:tcol){
    frameNA[,i] <- as.numeric(apply(df, 1, function(x) ifelse(i %in% x, 1, 0)))
  }
  cbind(df_original,frameNA)
}

#' Convert single-code column(s) into "multiple choice" formats, filling data from a target column
superspread_fill <- function(df,tcol,select_helpers,target_col){
  df_original <- df
  df <- dplyr::select(df,select_helpers)
  dft <- dplyr::select(df_original,target_col)

  frameNA <-matrix(NA,nrow(df),tcol) %>% as.data.frame()
  names(frameNA) <-seq(1,tcol)

  for (i in 1:tcol){ # For each column
    frameNA[,i] <- as.numeric(sapply(1:nrow(df), function(x) ifelse(i %in% df[x,],
                                                                    sapply(x,function(y) dft[y,1]),
                                                                    0)))
  }
  cbind(df,frameNA)
}
# data.frame(a=c(1,2,4,5,6,7,8),b=c(244,333,434,453,123,123,435)) %>% superspread_fill(8,"a","b")

#' Convert single-code column(s) into "multiple choice" formats, filling data with count rather than 1s
superspread_count <- function(df,tcol,select_helpers){
  df_original <- df
  df <- select(df,select_helpers)

  frameNA <-matrix(NA,nrow(df),tcol) %>% as.data.frame()
  names(frameNA) <-seq(1,tcol)
  for (i in 1:tcol){
    frameNA[,i] <- as.numeric(apply(df, 1, function(x) ifelse(i %in% x, sum(i==x,na.rm = TRUE), 0))) # count if 'column value' exists in the matrix
  }

  names(frameNA) <- paste0("x",names(frameNA))

  cbind(df_original,frameNA)
}

#' Serialise a SPSS file to RDS
#' Old file name is retained and saved in the same location as RDS if export is unspecified
sav_to_rds<-function(import,export=""){
  p <- haven::read_sav(import)
  export<-ifelse(export=="",
                 str_replace_all(import,".sav",".rds"),
                 export)
  saveRDS(p,file=export)
}

# Return the n number of characters from the right
# Wrapper convenience function
str_right <- function(x,n){
  nx <- n-1
  str_sub(x,nchar(x)-nx,nchar(x))
}

#' Function that returns TRUE/FALSE if value exists in x, but returns NA if x consists entirely of NAs
any_x <- function(x, value){
  if(all(is.na(x))){
    return(NA)
  } else(any(x %in% value,na.rm=TRUE))
}

# any_x(c(1,0,1),1) # TRUE
# any_x(c(1,NA,1),1) # TRUE
# any_x(c(0,0,NA),1) # FALSE
# any_x(c(NA,NA,NA),1) # NA

#' Return all ngram combinations with a character vector as input
tokenise_ngram <- function(x, n = 2, collapse = " ") {
    unlist(lapply(NLP::ngrams(words(x), n), paste, collapse = collapse), use.names = FALSE)
}

# tokenise_ngram("a c b e f")
# [1] "a c" "c b" "b e" "e f"

#' Remove NA from lists
na.omit.list <-function(y) {
    return(y[!sapply(y, function(x)
      all(is.na(x)))])
  }

#' Return value labels as tibble
#' Only works if variable as value label
#' @param variable Enter variable name as string
extract_vallab <- function(x,variable){
  if(is.null(attr(x[[variable]],'labels'))){
    return (NULL) # Return NULL if no value labels found
  } else {
    x[[variable]] %>%
      attr('labels') %>%
      stack() %>%
      dplyr::as_tibble() %>%
      `names<-`(c("id",variable)) %>%
      mutate_at(vars(variable),funs(as.character))
  }
}

#' Replace x values with corresponding values using a key
#'
#' @param x a string vector
#' @param table lookup table
#' @param index character string of the key/index column used for matching
#' @param column the column to return from the lookup table. Default is the second column
#' @example iris %>%
#' .$Petal.Width %>%
#' look_up(tibble(var=0.2,return="zero-point-two"))

look_up <- function(x,table,index="var",column=2){
  sapply(1:length(x),
         function(i){
           if(is.na(x[[i]])){
             return("")
           } else {
             unlist(table[table[[index]]==x[[i]],column])
           }
         }) %>% as.character() -> matched_x

  sapply(1:length(x),
         function(i){
           if(matched_x[[i]]=="character(0)"){
             x[[i]]
           } else {
             matched_x[[i]]
           }
         }) %>% ifelse(.=="",NA_character_,.)
}

#' Create a Data Dictionary from a data frame with Variable and Value Labels
#' Exported objects from SPSS
#'
#' @param x data frame
#' @param max_char Maximum character allowed on Variable and Value label columns.

data_dict <-function(x,max_char=32760){
  suppressWarnings(
    lapply(1:ncol(x),
           function(i){
             namex <- names(x)[[i]]
             purrr::pluck(x,namex,purrr::attr_getter("label")) %>% as.character() -> label_var
             purrr::pluck(x,namex,purrr::attr_getter("labels")) %>%
               names() %>%
               paste(collapse="; ") %>%
               stringr::str_trunc(max_char)-> label_val
             x[[namex]] %>%
               unique() %>%
               paste(collapse="; ") %>%
               stringr::str_trunc(max_char)-> val_val


             ifelse(is_null(label_var),"",label_var)->label_var
             ifelse(is_null(label_val),"",label_val)->label_val

             data.frame(var=namex,label_var,label_val,value=val_val)
           }) %>% dplyr::bind_rows()
  )
}

#' Set variable labels
#' Pipe-workflow optimised
#'
#' @param x Variable to assign variable labels to
#' @param variable_label String vector to be assigned as the variable label
#' @example
#'library(tidyverse)
#'tibble(RESPID=1:1000,
#'        Q1=sample(c(0,1,2),1000,replace=TRUE),
#'        Q2=sample(c(0,1),1000,replace=TRUE))-> df
#' df %>%
#'   mutate_at("Q1",funs(set_varl(.,"Which of the following groups do you fall into?"))) %>%
#'   .$Q1

set_varl <- function(x,variable_label){
  attr(x,'label') <- variable_label
  x
}


#' Set value labels
#' Pipe-workflow optimised
#'
#'
#' @param x Variable to assign value labels to
#' @param value_labels Named character vector to be assigned as value labels
#' @example
#'
#' library(tidyverse)
#' tibble(RESPID=1:1000,
#'        Q1=sample(c(0,1,2),1000,replace=TRUE),
#'        Q2=sample(c(0,1),1000,replace=TRUE))-> df
#' df %>%
#'   mutate_at("Q2",funs(set_varl(.,"What is your answer to this yes/no question?"))) %>%
#'   mutate_at("Q2",funs(set_vall(.,c("No"=0,"Yes"=1)))) %>%
#'   .$Q2 %>% attributes()

set_vall <- function(x,value_labels){
  haven::labelled(x,value_labels)
}

#' Safe way to collapse rows in case they have duplicates
#'
#' @param x Variables to collapse. Designed to be use with mutate_at() or mutate_all()
#' @param numeric Set to TRUE to convert result to numeric value
collapse_rows <- function(x,numeric=TRUE){
  de_NA_x<- x[!is.na(x)]
  if(length(de_NA_x)>1){
   stop("Error: rows cannot contain more than one non-NA value")
  } else if(numeric==TRUE){
    as.numeric(paste0(de_NA_x,collapse=";"))
  } else if(numeric==FALSE){
    paste0(de_NA_x,collapse=";")
  }
}

# collapse_rows(c(NA,1))
# collapse_rows(c(NA,1,2))


#' Convert character variable to labelled integer variable
#'
char_to_lab <- function(x){
  unique_x <- unique(x)

  gen_df <- tibble::tibble(id=1:length(unique_x),
             var=as.character(unique_x))

  value_labels <- unlist(create_named_list(gen_df$var,gen_df$id))

  sapply(1:length(x),function(i){
    if(is.na(x[[i]])){
      NA
    } else {
      gen_df[gen_df$var==x[[i]],1] %>% drop_na() %>% unlist()
    }
  }) -> outcome # Convert current values to assigned numbers

  return(set_vall(outcome,value_labels)) # Set value labels
}


#' Convert numeric variable to NPS variable
as_nps <-function(x){
  dplyr::case_when(x >=0 & x<=6~-100,
                   x >=7 & x<=8~0,
                   x >=9 & x<=10~100,
                   TRUE~NA_real_)-> out_x

  labz <- c("Detractor"=-100,
            "Passive"=0,
            "Promoter"=100,
            "Missing value"=NA_real_)

  haven::labelled(out_x,labz)
}

#' Convert labelled double variable to character variable
#' Returns error if variable is not of class labelled double
#'
lab_to_char <- function(x){
  if(!haven::is.labelled(x)){
    stop("Variable is not a labelled double")
  } else if(haven::is.labelled(x)){
    names(attr(x,'labels')) -> id
    attr(x,'labels')-> var

    tibble::tibble(id=id,var=var)-> key_tab

    look_up(x,key_tab,column = 1)
  }
}

#' Convert ordinal variables into binary variables by "boxing"
#' E.g. create Top Two Box variable from a 7-point agreement scale question
#' @param x Variable to be passed through
#' @param which Character string to specify which end of spectrum to take values - valid inputs are "top" and "bottom"
#' @param number Number to take values from
#' @example
#' box_it(sample(1:10,100,replace=TRUE))
#' Converted to binary variable where 9, 10 are selected

box_it <-function(x,which="top",number=2){
  max_x <- max(x,na.rm = TRUE)
  min_x <- min(x,na.rm = TRUE)

  if(which=="top"){
    valid_range <- (max_x - number + 1):max_x
  } else if(which=="bottom"){
    valid_range <- min_x:(min_x + number - 1)
  }

  dplyr::case_when(x %in% valid_range~1,
                   !(x %in% valid_range)~0,
                   TRUE~NA_real_) -> output

  attr(output,'labels') <- c("Selected"=1,"Not selected"=0,"Missing value"=NA_real_)

  range_print <- paste0(valid_range,collapse=", ")
  message(paste0("Converted to binary variable where ",range_print," are selected"))
  return(output)
}


#' Convert correlation matrix into a tidy data frame
#' Use label_table as an input
#' Identify matching id
#'
#' @param cor_m Correlation matrix in the form of an output from cor()
#' @param label_table A two-column table with one column used for matching and the other for returning labels.
#' @param id Use this argument to specify a string to refers to the matching / id column in the label_table.
#'
cor_to_df <- function(cor_m, label_table, id){
  if(ncol(label_table)!=2){
    stop("Matching table must have an id column and a return column.")
  } else {
    matched_lab <- names(label_table)[names(label_table)!=id]

    cor_m %>%
      as.data.frame() %>%
      tibble::rownames_to_column("cor_matrix") %>%
      left_join(label_table,by=c("cor_matrix"=id)) %>%
      select(cor_matrix,matched_lab,everything())-> output_df

    labels <- output_df[[matched_lab]]

    names(output_df) <- c("cor_matrix",matched_lab,labels)
    return(output_df)
  }
}


#' Convert as percent (string)
#' Convert a numeric value into a string with percentage sign.
as_percent <- function(num, rounding = 0){
  paste0(round(num * 100, rounding),"%")
}