# =============================================
# genericR_v1.R: many useful utility functions
# for R
#
# by Nick Matzke
# Copyright 2011-infinity
# matzkeATberkeley.edu
# January 2011
#
# Please link/cite if you use this; email me if you have
# thoughts/improvements/corrections.
#
##############################################################
#
# Free to use/redistribute under:
# Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the above license, linked here:
#
# http://creativecommons.org/licenses/by-nc-sa/3.0/
#
# Summary:
#
# You are free:
#
# * to Share — to copy, distribute and transmit the work
# * to Remix — to adapt the work
#
# Under the following conditions:
#
# * Attribution — You must attribute the work in the manner
# specified by the author or licensor (but not in any way that
# suggests that they endorse you or your use of the work).
# * Noncommercial — You may not use this work for commercial purposes.
#
# * Share Alike — If you alter, transform, or build upon this work,
# you may distribute the resulting work only under the same or
# similar license to this one.
#
# http://creativecommons.org/licenses/by-nc-sa/3.0/
#
###################################################################
# Generic utility functions for R
# Load with:
# sourcedir = '/drives/Dropbox/_njm/'
# source3 = '_genericR_v1.R'
# source(paste(sourcedir, source3, sep=""))
# NOTE: MANY OF THESE FUNCTIONS ARE BEING MOVED TO:
#sourceall(path="/Dropbox/_njm/__packages/BioGeoBEARS_setup/")
# for e.g. calc_loglike
# sourcedir = '/Dropbox/_njm/'
# source3 = '_R_tree_functions_v1.R'
# source(paste(sourcedir, source3, sep=""))
# source all .R files in a directory, except "compile" and "package" files
sourceall <- function(path=".", pattern="\\.R", ...)
{
Rfiles = list.files(path=path, pattern=pattern, ...)
# Files to remove
Rfiles_remove_TF1 = grepl("compile", Rfiles)
Rfiles_remove_TF2 = grepl("package", Rfiles)
Rfiles_remove_TF = (Rfiles_remove_TF1 + Rfiles_remove_TF2) >= 1
Rfiles = Rfiles[Rfiles_remove_TF == FALSE]
cat("\nSourcing Rfiles in ", path, "...\n", sep="")
for (Rfile in Rfiles)
{
cat("Sourcing Rfile: ", Rfile, "\n", sep="")
fullfn = slashslash(paste(path, Rfile, sep=""))
source(fullfn, chdir=TRUE)
}
cat("\nDone sourcing Rfiles in ", path, "...\n", sep="")
return(path)
}
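# A quick usage sketch (hypothetical directory -- adjust to your own setup):
# sourceall(path="/Dropbox/_njm/__packages/BioGeoBEARS_setup/")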
# Get a list of directories
# http://stackoverflow.com/questions/4749783/how-to-obtain-a-list-of-directories-within-a-directory-like-list-files-but-i
list_dirs <- function(path=".", pattern=NULL, all.dirs=FALSE, full.names=FALSE, ignore.case=FALSE)
{
# list with full names so file.info() can resolve the paths
all <- list.files(path, pattern, all.files=all.dirs, full.names=TRUE, recursive=FALSE, ignore.case=ignore.case)
dirs <- all[file.info(all)$isdir]
if (full.names) dirs else basename(dirs)
}
#######################################################
# slashslash:
#######################################################
#' Remove double slash (slash a slash)
#'
#' Shortcut for: \code{gsub(pattern="//", replacement="/", x=tmpstr)}
#'
#' This function is useful for removing double slashes that can
#' appear in full pathnames due to inconsistencies in trailing
#' slashes in working directories etc.
#'
#' @param tmpstr a path that you want to remove double slashes from
#' @return outstr a string of the fixed path
#' @export
#' @seealso \code{\link[base]{getwd}}, \code{\link[base]{setwd}}, \code{\link[base]{gsub}}
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @examples
#' tmpstr = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/modiscloud/extdata/2002_raw_MYD//MYD03.A2002185.0645.005.2009192031332.hdf"
#'
#' outstr = slashslash(tmpstr)
#' outstr
#'
slashslash <- function(tmpstr)
{
outstr = gsub(pattern="//", replacement="/", x=tmpstr)
return(outstr)
}
##################################################################
# abbreviations
##################################################################
# Abbreviation of paste; default separation is "" instead of " "
p <- function(x, sep="", collapse=NULL)
{
return(paste(x, sep=sep, collapse=collapse))
}
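# Example: with collapse, p() glues a vector into one string:
# p(c("tree", "37", ".newick"), collapse="")   # returns "tree37.newick"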
# Return a row with lots of standard summary stats
summstats <- function(values, db="", descrip="")
{
# Calculate a variety of standard estimates
num_estimates = length(values)
minval = min(values, na.rm=TRUE)
maxval = max(values, na.rm=TRUE)
rangewidth = maxval - minval
q025 = quantile(values, 0.025, na.rm=TRUE)
q975 = quantile(values, 0.975, na.rm=TRUE)
HPD95width = q975 - q025
meanTT = mean(values, na.rm=TRUE)
sdTT = sd(values, na.rm=TRUE)
varTT = var(values, na.rm=TRUE)
sd95_2tailed = 1.96*sdTT
CIupper = meanTT + sd95_2tailed
CIlower = meanTT - sd95_2tailed
CIwidth = CIupper - CIlower
tmprow = c(db, descrip, num_estimates, meanTT, sdTT, varTT, sd95_2tailed, CIlower, CIupper, CIwidth, q025, q975, HPD95width, minval, maxval, rangewidth)
return(tmprow)
}
# Return the names of the columns in summstats
summstats_colnames <- function()
{
tmpnames = c("db", "timerange", "num_estimates", "meanTT", "sdTT", "varTT", "sd95_2tailed", "CIlower", "CIupper", "CIwidth", "q025", "q975", "HPD95width", "minval", "maxval", "rangewidth")
return(tmpnames)
}
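# A minimal sketch of how the two functions pair up ('vals' is any numeric vector):
# vals = rnorm(100)
# row1 = summstats(vals, db="mydb", descrip="normals")
# tbl = rbind(row1)
# colnames(tbl) = summstats_colnames()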
# Print the first numlines part of the item to screen
hp <- function(item, numlines=15)
{
txt = capture.output(print(item))
# don't try to print more lines than the object has
numlines = min(numlines, length(txt))
cat("\n")
cat("hp() -- 'head print' of first ", numlines, " lines of item:\n\n")
for (i in 1:numlines)
{
cat(txt[i], "\n", sep="")
}
return()
}
#######################################################
# Common manipulations of R objects
#######################################################
unlist_df_factor <- function(df)
{
outdf <- data.frame(lapply(df, function(x) factor(unlist(x))))
return(outdf)
}
# length, like in Python
len <- function(x)
{
return(length(x))
}
# count the number of distinct values
count_uniq <- function(x)
{
length(unique(x))
}
count_zeros <- function(x)
{
sum(x==0)
}
# Return the number of characters in tmpstr that match char_to_count
count_chars <- function(char_to_count, tmpstr)
{
tmpchars = strsplit(tmpstr, "")[[1]]
matches = tmpchars == char_to_count
number_that_match = sum(matches)
return(number_that_match)
}
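# Example: count_chars("a", "banana") returns 3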
# Return current date and time in a format suitable for labeling a file
datetime <- function()
{
txt = as.character(Sys.time())
# e.g. "2012-03-18 19:51:32"
txt2 = gsub(" ", "_", txt)
txt3 = gsub("\\:", "", txt2)
return(txt3)
}
# Convert a number of seconds since the beginning of 1970
# (numeric, or derived from POSIXct times)
# to a POSIXlt date object.
#
# Note: R assumes the seconds are in your computer's local timezone,
# and if your output timezone is GMT, the time will be altered.
#
# I.e., if your computer is in PST, the UTC/GMT time that results
# will be +2 more hours; so -2 is the correct adjustment, even though
# in real life, PST = UTC-8 (weird, I know).
datetime_from_secs <- function(POSIX_ct_dates, local_timezone_shift_from_UTC=-2)
{
cat("\n\nNote: datetime_from_secs() is assuming that the input seconds are\nfrom a timezone at UTC", as.character(local_timezone_shift_from_UTC), ".\n", sep="")
# Convert, and correct for the addition R makes to convert PST to UTC
POSIX_lt_dates = as.POSIXlt(POSIX_ct_dates+(local_timezone_shift_from_UTC*60*60), tz="UTC", origin="1970-01-01 00:00.00 UTC")
return(POSIX_lt_dates)
}
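# A worked sketch (assumes the input seconds came from a machine at UTC-2,
# per the default shift):
# datetime_from_secs(0, local_timezone_shift_from_UTC=-2)
# # returns "1969-12-31 22:00:00 UTC"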
# 2014-01-30_NJM:
# moving to BioGeoBEARS readwrite
# for process_DIVA_output
# Trim surrounding spaces, AND remove any tabs
trim2 <- function(tmpstr)
{
require(gdata) # for trim
# Convert any tabs to space; leading/trailing tabs will be removed by trim
tmpstr = gsub(pattern="\t", replacement=" ", tmpstr)
tmpstr = trim(tmpstr)
return(tmpstr)
}
# Get fns matching text_to_match, from either a directory or a fns list
get_fns_matching_txt <- function(tmpdir=NULL, fns=NULL, text_to_match = NULL, returnfullnames=TRUE)
{
# If tmpdir is NOT null, get those files from list.files.
if (is.null(tmpdir) == FALSE)
{
fns = list.files(tmpdir, full.names=returnfullnames)
# Remove //, change to /
fns = slashslash(fns)
fns_without_paths = list.files(tmpdir, full.names=FALSE)
} else {
# fns were passed in directly; match on them as-is
fns_without_paths = fns
}
# Return true/false for matched text
TF = grepl(pattern=text_to_match, x=fns_without_paths)
matching_fns = fns[TF]
return(matching_fns)
}
# Get fns with suffix, from either directory or fns list
# Do NOT use dot in the suffix
get_fns_with_suffix <- function(tmpdir=NULL, fns=NULL, suffix_to_match = "newick")
{
# If tmpdir is NOT null, get those files from list.files.
if (is.null(tmpdir) == FALSE)
{
fns = list.files(tmpdir)
}
suffixes = get_fn_suffixes(fns=fns)
nums = match_item_in_list2(item=suffix_to_match, list2=suffixes, return_indices=TRUE)
matching_fns = fns[nums]
return(matching_fns)
}
# Get the suffixes of a list of filenames
get_fn_suffixes <- function(tmpdir=NULL, fns=NULL)
{
if (is.null(tmpdir) == FALSE)
{
fns = list.files(tmpdir)
}
# Split the filenames on "."
splitfns = strsplit(fns, "\\.")
# For each item, return last
suffixes = mapply(return_last_nosingles, splitfns)
return(suffixes)
}
# Return last item
return_last <- function(tmplist)
{
return(tmplist[length(tmplist)])
}
# Return last item, NULL if only one item
return_last_nosingles <- function(tmplist)
{
if (length(tmplist) == 1)
{
return(NULL)
} else {
return(tmplist[length(tmplist)])
}
}
##################################
# head/tail lines to file
# Instead of grep, use sed
##################################
# http://www.unix.com/shell-programming-scripting/14477-how-extract-range-lines-file.html
# eliminate all spaces, tabs, and other whitespace;
# don't allow "" in the result
strsplit_nowhitespace <- function(tmpstr)
{
# split on any whitespace
words = strsplit(tmpstr, "[[:space:]]+")[[1]]
# remove any blanks that remain
words = words[words != ""]
return(words)
}
defaults='
numlines_str = " 75091 combined_75000_trees.trees"
# numlines
# numlines()
'
linecount <- function(fn)
{
# SLLLLLOOWW as this does a full word count
# cmdstr = paste("wc -l ", fn, sep="")
# numlines_str = system(cmdstr, intern=TRUE)
# words = strsplit_nowhitespace(numlines_str)
#print(words)
# numlines = as.numeric(words[1])
# Faster line count with grep rather than wc
# http://www.dzone.com/snippets/faster-line-count-grep-rather
# (caution: grep treats '\n' as a literal "n", so this counts lines
# containing an "n"; use get_numlines() / wc -l for an exact count)
cmdstr = paste("grep -c '\n' ", fn, sep="")
numlines_str = system(cmdstr, intern=TRUE)
numlines = as.numeric(numlines_str)
# Starting at 5:20:
# Using these commands:
# http://www.theunixschool.com/2010/06/5-different-ways-to-count-total-lines.html
# cat /Users/nickm/Downloads/AT1_MrB_simple3.time.trees.txt | wc -l
# sed -n '$=' "/Users/nickm/Downloads/AT1_MrB_simple3.time.trees.txt"
# grep -c '\n' /Users/nickm/Downloads/AT1_MrB_simple3.time.trees.txt
# awk 'END {print NR}' /Users/nickm/Downloads/AT1_MrB_simple3.time.trees.txt
#
# LOL...all identical -- 5-6 minutes!
return(numlines)
}
# Extract all substrings matching the regexp tmpre_string
extract_regexp <- function(tmpstr, tmpre_string)
{
matches = gregexpr(tmpre_string, tmpstr)[[1]]
matches_end = matches-1+attr(matches,"match.length")
x = mapply(substr, tmpstr, matches, matches_end)
return(x)
}
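# Example: pull all runs of digits out of a string
# (returns "" if there is no match):
# extract_regexp("node37_to_node102", "[0-9]+")   # "37" "102"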
# Get the first character. If it is a ":", it's not a tip...
get_firstchar <- function(tmpstr)
{
tmpchars = strsplit(tmpstr, "")[[1]]
first_char = tmpchars[1]
return(first_char)
}
# Normalize values to a large positive number
normalize_vals <- function(char_dtf, maxvals=NULL, baseval=1000, normval=100)
{
# Normalize values to between 0 and 1
# (if maxvals is not given, use each column's maximum)
if (is.null(maxvals))
{
maxvals = apply(char_dtf, 2, max)
}
char_dtf_fraction_of_one = char_dtf
for (rownum in 1:nrow(char_dtf))
{
char_dtf_fraction_of_one[rownum, ] = char_dtf[rownum, ] / maxvals
}
# Multiply by normval (default 100)
char_dtf_norm = char_dtf_fraction_of_one * normval
# Add baseval (default 1000)
char_dtf_out = char_dtf_norm + baseval
# to reverse: subtract baseval, divide by normval, multiply by maxvals
return(char_dtf_out)
}
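# Example round-trip, a sketch assuming a small all-positive data.frame:
# df = data.frame(a=c(1,2,4), b=c(10,20,40))
# norm = normalize_vals(df)   # each column max -> 100, then +1000 offset
# # reverse: subtract baseval, divide by normval, multiply by the column maxima
# back = sweep((norm - 1000)/100, 2, apply(df, 2, max), "*")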
# Convert 0-1 values to logit
Pvals_to_logit <- function(x, minP=0.0001)
{
x1 = x
maxP = 1-minP
vals_LT_min_TF = x < minP
vals_GT_max_TF = x > maxP
cat("\nRunning Pvals_to_logit():\n")
cat("Warning: ", sum(vals_LT_min_TF), " of ", length(x), " values converted to ", minP, "\n", sep="")
cat("Warning: ", sum(vals_GT_max_TF), " of ", length(x), " values converted to ", 1-minP, "\n", sep="")
x1[vals_LT_min_TF] = minP
x1[vals_GT_max_TF] = 1-minP
logitx = log(x1 / (1-x1))
#########################################################
# Error check (weird floating-point behavior
# seems to sometimes produce logit values outside the allowed range...
#########################################################
min_logit = log(minP / (1-minP))
max_logit = log(maxP / (1-maxP))
logitx[logitx < min_logit] = min_logit
logitx[logitx > max_logit] = max_logit
return(logitx)
}
# Convert logit values to 0-1
logit_to_Pvals <- function(x)
{
invlogitx = exp(x) / (1 + exp(x))
return(invlogitx)
}
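# Example round-trip:
# pvals = c(0.01, 0.5, 0.99)
# lg = Pvals_to_logit(pvals)   # log(p/(1-p)), clamped to [minP, 1-minP]
# logit_to_Pvals(lg)           # recovers 0.01 0.50 0.99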
# Long summarize object
smml <- function(x, mxl=NA)
{
smm(x, mxl=mxl)
}
# Short summarize object
smm <- function(x, mxl=10)
{
cat("\n")
#cat("smm(): PROVIDING OVERALL SUMMARY OF OBJECT...\n")
cat("CLASS: ", class(x), "\n", sep="")
dims = dim(x)
if (length(dims) > 1)
{
# if maxcols is NA, print all cols
if (is.na(mxl))
{
mxl = dims[2]
}
# if maxcols is GT dims2, use dims2
if (mxl > dims[2])
{
mxl = dims[2]
}
# Cols to print
colstp = 1:mxl
}
# total number of elements (for vectors, dims is NULL)
if (is.null(dims))
{
total_length = length(x)
} else {
total_length = prod(dims)
}
cat("#DIMS: ", length(dims), "\n", sep="")
if (!is.null(dims))
{
if (length(dims) == 1)
{
cat("LENGTH: ", length(x), "\n", sep="")
}
if (length(dims) == 2)
{
cat("DIMS: nrow=", dims[1], " ncol=", dims[2], "\n", sep="")
}
if (length(dims) == 3)
{
cat("DIMS: nrow=", dims[1], " ncol=", dims[2], " nz = ", dims[3], "\n", sep="")
}
if (length(dims) > 3)
{
cat("DIMS: ", dims, "\n", sep="")
}
}
cat("TTLEN: ", total_length, "\n", sep="")
cat("NAMES: ")
if (length(dims) > 1)
{
cat(names(x)[1:mxl])
} else {
cat(names(x))
}
cat("\n")
cat("HEAD/TAIL:\n")
# if dim is null, just do length etc.
if (is.null(dims))
{
cat("LENGTH: ", length(x), "\n", sep="")
cat("1ST4: \n")
print(x[1:4])
cat("LAST4: \n")
maxval = length(x)
print(x[(maxval-3) : maxval])
}
if (length(dims) == 3)
{
cat("TOP4: \n")
print(x[1:4, colstp, 1])
print(x[1:4, colstp, 2])
cat("BOT4: \n")
print(x[(dims[1]-3):(dims[1]-0), colstp, 1])
print(x[(dims[1]-3):(dims[1]-0), colstp, 2])
}
if (length(dims) == 2)
{
cat("TOP4: \n")
print(x[1:4, colstp])
cat("BOT4: \n")
print(x[(dims[1]-3):(dims[1]-0), colstp])
}
if (length(dims) == 1)
{
cat("1ST4: \n")
print(x[1:4])
cat("LAST4: \n")
print(x[(dims[1]-3):(dims[1]-0)])
}
if (length(dims) > 3)
{
cat("1ST4: \n")
print(x[1:4])
cat("LAST4: \n")
print(x[(dims[1]-3):(dims[1]-0)])
}
}
# Summarize object
summ <- function(x)
{
cat("\n")
cat("SUMM(): PROVIDING OVERALL SUMMARY OF OBJECT...\n")
cat("\nCLASS OF OBJECT:\n")
print(class(x))
cat("\nDIMENSIONS OF OBJECT:\n")
print(dim(x))
cat("\nLENGTH OF OBJECT:\n")
print(length(x))
cat("\nATTRIBUTES OF OBJECT:\n")
print(attributes(x))
cat("\nSUMMARY() OF OBJECT:\n")
print(summary(x))
cat("\ncls.df(): print the classes of each column (if it's a data.frame):\n")
cls.df(x, printout=TRUE)
}
# print.default2: like print.default, but first rounds numeric input
# to 3 decimal places (via sprintf "%7.3f")
print.default2 <- function (x, digits = NULL, quote = TRUE, na.print = NULL, print.gap = NULL, right = FALSE, max = NULL, useSource = TRUE, ...)
{
if (is.numeric(x))
{
x <- as.numeric(sprintf("%7.3f", x))
}
noOpt <- missing(digits) && missing(quote) && missing(na.print) && missing(print.gap) && missing(right) && missing(max) && missing(useSource) && length(list(...)) == 0L
.Internal(print.default(x, digits, quote, na.print, print.gap, right, max, useSource, noOpt))
}
setup = "
startline=1
endline=100
startstr_to_find='Iteration'
"
get_startline <- function(fn, startline=1, endline=100, startstr_to_find)
{
# Read part of a large file, and find the starting string
skipnum = startline - 1
tmp_numlines = endline - startline +1
# Read lines
# readlines
tmplines = scan(file=fn, what="character", skip=skipnum, nlines=tmp_numlines, sep="\n")
for (i in 1:length(tmplines))
{
tmpline = tmplines[i]
#print(tmpline)
if (grepl(startstr_to_find, tmpline))
{
outline_num = skipnum + i
return(outline_num)
}
}
# If startstring not found, return NA
return(NA)
}
# Run something like this:
# grep -m 2 --line-number "Iteration[[:space:]]" params_ln_continuous_BayesTraits.txt.log.txt
grep_startline <- function(fn, hitnum=1, startstr_to_find="Iteration[[:space:]]")
{
# Use grep to return the line number of the FIRST line containing the matching string
cmdstr = paste("grep -m ", hitnum, " --line-number ", startstr_to_find, " ", fn, sep="")
# intern = TRUE reports the result to R
grep_return = system(cmdstr, intern=TRUE)
# If startstr was not found, grep returns nothing; return NA
if (length(grep_return) == 0)
{
return(NA)
}
# grep --line-number output is "linenum:line"; split on the colon
grep_list = strsplit(grep_return, ":")[[1]]
linenum = as.numeric(grep_list[1])
return(linenum)
}
setup="
linenum = 100
fn = 'params_ln_continuous_BayesTraits.txt.log.txt'
"
sed_a_line_by_number <- function(fn, linenum)
{
# retrieve a particular line by line number (fastest)
cmdstr = paste("sed '", linenum, "q;d' ", fn, sep="")
sed_result = system(cmdstr, intern=TRUE)
return(sed_result)
}
get_numlines <- function(fn)
{
# Get the number of lines in a file
# Check if the file exists
TF = file.exists(fn)
if (TF == FALSE)
{
txt = paste0("\nWARNING in get_numlines(): file '", fn, "' does not exist in directory\n'", getwd(), "'. Returning linecount=0.")
cat(txt)
linecount = 0
return(linecount)
}
# intern = TRUE reports the result to R
cmdstr = paste("wc -l ", fn, sep="")
wc_return = system(cmdstr, intern=TRUE)
# split on whitespace (spaces and tabs)
wc_list = strsplit_nowhitespace(wc_return)
linecount = as.numeric(wc_list[1])
return(linecount)
} # END get_numlines <- function(fn)
ppdf <- function(tmp_pdffn=pdffn, a="")
{
# open whatever pdffn you have been working on
#dev.off()
# Turn off all
if (a == "all")
{
graphics.off()
}
cmdstr = paste("open ", tmp_pdffn, sep="")
system(cmdstr)
}
# Merge PDFs using Ghostscript
# http://hints.macworld.com/article.php?story=2003083122212228
merge_pdfs <- function(pdffns, merge_pdffn, wd=getwd())
{
setwd(wd)
pdfs_string = paste(pdffns, sep="", collapse=" ")
gs_string = "gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile="
merge_pdffn = paste(wd, "/", merge_pdffn, sep="")
merge_pdffn = slashslash(merge_pdffn)
cmdstr = paste(gs_string, merge_pdffn, " ", pdfs_string, sep="")
print(cmdstr)
system(cmdstr)
}
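# Usage sketch (requires Ghostscript's "gs" on the PATH; filenames are hypothetical):
# merge_pdfs(pdffns=c("fig1.pdf", "fig2.pdf"), merge_pdffn="figs_combined.pdf")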
# prflag <- function(x, printflag=TRUE)
# {
# # A standard function to print (or not) certain variables,
# # based on a master printflag
# # This avoids having to comment in/out various code chunks
# # while debugging.
# if (printflag == TRUE)
# {
# # CAT instead of PRINT if it's a string or numeric
# if (is.character(x))
# {
# cat(x, "\n", sep="")
# }
# if (is.numeric(x))
# {
# cat(x, "\n", sep="")
# } else {
# print(x)
# }
# }
# else
# {
# pass="BLAH"
# }
# }
checkwd <- function(tmpwd, lastchar_should_be = "/")
{
# Error check: tmpwd must end in a slash
# (for Windows, "\\")
characters = strsplit(tmpwd, "")[[1]]
if (characters[length(characters)] != lastchar_should_be)
{
tmpwd = paste(tmpwd, lastchar_should_be, sep="")
}
return(tmpwd)
}
#######################################################
# extract_fn_from_path:
#######################################################
#' Get the filename from a path
#'
#' The filename is split on slashes, and the last item is taken; this should be just
#' the filename.
#'
#' @param fn_with_path The filename, with partial or full path
#' @return \code{fn} The extracted filename
#' @export
#' @seealso \code{\link[base]{strsplit}}
#' @author Nicholas J. Matzke \email{matzke@@berkeley.edu}
#' @examples
#' fn_with_path = "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/modiscloud/extdata/2002_raw_MYD/MYD35_L2.A2002185.1910.005.2007206043609.hdf"
#' extract_fn_from_path(fn_with_path)
#'
extract_fn_from_path <- function(fn_with_path)
{
words = strsplit(fn_with_path, split="/")[[1]]
fn = words[length(words)]
return(fn)
}
# (placeholder notes: get_path, get_suffix)
# Backup file, if it exists
fn_bkup <- function(tmpdir, fn)
{
# Check if the old log file exists; if so, move it to fn.bkup
files_list = list.files(path=tmpdir)
if (fn %in% files_list)
{
old_path = slashslash(paste(tmpdir, "/", fn, sep=""))
cmdstr = paste("mv ", old_path, " ", old_path, ".bkup", sep="")
system(cmdstr)
}
}
# printall <- function(dtf, chunksize_toprint = 40, printflag=TRUE)
# {
# # Print everything in a data frame, in chunks of e.g. 50 rows
# if (nrow(dtf) <= chunksize_toprint)
# {
# prflag(dtf, printflag=printflag)
# return(dtf)
# }
# rows_toprint = seq(1, nrow(dtf), chunksize_toprint)
#
# if (printflag == TRUE)
# {
# for (i in 1 : (length(rows_toprint)-1) )
# {
# tmp_rows_toprint_start = rows_toprint[i]
# tmp_rows_toprint_end = rows_toprint[i+1]
# prflag(dtf[tmp_rows_toprint_start:tmp_rows_toprint_end, ])
# }
#
# # Then print the end
# tmp_rows_toprint_start = rows_toprint[length(rows_toprint)]
# tmp_rows_toprint_end = nrow(dtf)
# prflag(dtf[tmp_rows_toprint_start:tmp_rows_toprint_end, ])
# }
# }
printall2 <- function(dtf, maxprint=99999)
{
old_maxprint = getOption("max.print")
options(max.print = maxprint)
print(dtf)
options(max.print = old_maxprint)
}
# moved to biogeobears stratified
# # Get indices of all matches to a list
# findall <- function(what, inlist)
# {
# TFmatches = inlist == what
# indices = 1:length(inlist)
# matching_indices = indices[TFmatches]
# return(matching_indices)
# }
# print to screen the header of a file
headf <- function(fn, nlines=5)
{
#lines = scan(file=fn, what="character", sep="\n")
for (i in 1:nlines)
{
line = scan(file=fn, what="character", sep="\n", skip=(i-1), nlines=1)
print(line)
}
}
# print to screen the tail of a file
tailf <- function(fn)
{
#lines = scan(file=fn, what="character", sep="\n")
#numlines = length(lines)
numlines = linecount(fn)
for (i in (numlines-5):numlines)
{
#print(lines[i])
line = scan(file=fn, what="character", sep="\n", skip=(i-1), nlines=1)
print(line)
}
}
tailfast <- function(fn)
{
cmdstr = paste("tail ", fn, sep="")
system(cmdstr)
}
# remove punctuation
remove_punct <- function(tempstr)
{
# Do find-replace to convert the names to all underscores, no spaces, no ' or "
# spaces to _
tempstr = gsub(" ", "_", tempstr)
# - to _
tempstr = gsub("-", "_", tempstr)
# . to _
tempstr = gsub("\\.", "_", tempstr)
# "'" to ""
tempstr = gsub("'", "", tempstr)
# '"' to ''
tempstr = gsub('"', '', tempstr)
return(tempstr)
}
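# Example: remove_punct("Homo sapiens-1.0") returns "Homo_sapiens_1_0"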
# remove punctuation from a list
remove_punct_list <- function(templist)
{
for (k in 1:length(templist))
{
tempstr = templist[[k]]
templist[[k]] = remove_punct(tempstr)
}
return(templist)
}
#######################################
# remove "'" from a file
#######################################
# remove apostrophes
remove_apostrophes <- function(tempstr)
{
# Do find-replace to convert the names to all underscores, no spaces, no ' or "
# spaces to _
tempstr = gsub("'", "", tempstr)
return(tempstr)
}
# (e.g., for MrBayes files, read_nexus_data2)
# remove apostrophes from a list
remove_apostrophes_from_list <- function(templist)
{
for (k in 1:length(templist))
{
tempstr = templist[[k]]
templist[[k]] = remove_apostrophes(tempstr)
}
return(templist)
}
remove_apostrophes_from_file <- function(fn, outfn=paste(fn, "_noApostrophes", sep=""))
{
# Get the list of strings from the file
templist = scan(fn, what="character", sep="\n")
# Remove the apostrophes
templist2 = remove_apostrophes_from_list(templist)
write_lines_good(templist2, outfn)
return(outfn)
}
# split out: Taxon (subtaxon)
split_out_subtaxon <- function(templist)
{
templist_col1 = templist
templist_col2 = templist
for (k in 1:length(templist))
{
tempstr = templist[[k]]
words = strsplit(tempstr, " \\(")[[1]]
if (length(words) == 1)
{
templist_col1[[k]] = words[1]
templist_col2[[k]] = ""
}
if (length(words) == 2)
{
templist_col1[[k]] = words[1]
templist_col2[[k]] = gsub("\\)", "", words[2])
}
}
newcols = cbind(templist_col1, templist_col2)
return(newcols)
}
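# Example (hypothetical names):
# split_out_subtaxon(c("Conus (Gastridium)", "Conus"))
# # column 1: "Conus" "Conus"; column 2: "Gastridium" ""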
# http://r.789695.n4.nabble.com/Remove-empty-list-from-list-td806706.html
delete.NULLs <- function(x.list)
{
# delete null/empty entries in a list
x.list[unlist(lapply(x.list, length) != 0)]
}
remove_len_zero <- function(tmparr)
{
newarr = tmparr[lapply(tmparr, nchar) != 0]
return(newarr)
}
# match using 2 criteria
match_items <- function(tmplist, crit1, crit2)
{
outlist = c()
crit1_list = c()
for (i in 1:length(tmplist))
{
if (grepl(crit1, tmplist[i]))
{
crit1_list = c(crit1_list, tmplist[i])
}
}
crit2_list = c()
for (i in 1:length(crit1_list))
{
if (grepl(crit2, crit1_list[i]))
{
crit2_list = c(crit2_list, crit1_list[i])
}
}
return(crit2_list)
}
# match using crit1, but skip (leave out) if they have crit2
match_item_nomatch <- function(tmplist, crit1, crit2)
{
outlist = c()
crit1_list = c()
for (i in 1:length(tmplist))
{
if (grepl(crit1, tmplist[i]))
{
crit1_list = c(crit1_list, tmplist[i])
}
}
crit2_list = c()
for (i in 1:length(crit1_list))
{
if (grepl(crit2, crit1_list[i]))
{
blah = "blah"
}
else
{
crit2_list = c(crit2_list, crit1_list[i])
}
}
return(crit2_list)
}
# Get corresponding rownums for matching cells, in order
get_corresponding_rownums <- function(names1, names2)
{
names_from_1_in_2_TF = names1 %in% names2
names_from_2_in_1_TF = names2 %in% names1
# names_from_1_in_2 = tr_beastcon1$prt_beast_nodestats$tipnames[names_from_1_in_2_TF]
# names_from_2_in_1 = tr_beastcon2$prt_beast_nodestats$tipnames[names_from_2_in_1_TF]
names_from_1_in_2 = names1[names_from_1_in_2_TF]
names_from_2_in_1 = names2[names_from_2_in_1_TF]
rownums1 = (1:length(names_from_1_in_2_TF))[names_from_1_in_2_TF]
rownums2 = (1:length(names_from_2_in_1_TF))[names_from_2_in_1_TF]
alpha_order_kept_names_1 = order(names_from_1_in_2)
alpha_order_kept_names_2 = order(names_from_2_in_1)
corresp_node_rownums1 = rownums1[alpha_order_kept_names_1]
corresp_node_rownums2 = rownums2[alpha_order_kept_names_2]
corresp_rownums = cbind(corresp_node_rownums1, corresp_node_rownums2)
return(corresp_rownums)
}
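# Worked example:
# names1 = c("taxonA", "taxonB", "taxonC")
# names2 = c("taxonC", "taxonA")
# get_corresponding_rownums(names1, names2)
# # returns rows (1,2) and (3,1): taxonA is row 1 of names1 & row 2 of names2,
# # taxonC is row 3 of names1 & row 1 of names2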
replace_str_file <- function(fn, oldstr, newstr)
{
# Read a text file into a list of strings
lines = scan(fn, what="character", sep="\n")
# Replace output date-calibrated parsimony trees file
newlines = replace_str_lines(lines, oldstr, newstr)
# Write the list of lines to a file
write.table(newlines, file=fn, quote=FALSE, append=FALSE, sep="", row.names = FALSE, col.names=FALSE)
cat("In: ", fn, ", '", oldstr, "' replaced with '", newstr, "'\n", sep="")
return(fn)
}
replace_str_lines <- function(lines, oldstr, newstr)
{
# Replace output date-calibrated parsimony trees file
lines = gsub(oldstr, newstr, lines)
return(lines)
}
# replace each in a text file that matches, with newstr
replace_each_matching_line <- function(fn, str_to_find, newstr)#, blankstr="___")
{
# Read a text file into a list of strings
lines = scan(fn, what="character", blank.lines.skip=FALSE, sep="\n")
for (i in 1:length(lines))
{
# if (lines[i] == "")
# {
# lines[i] = blankstr
# }
# else
# {
if (grepl(str_to_find, lines[i]))
{
#print(str_to_find)
#print(lines[i])
lines[i] = newstr
cat("In: ", fn, ", line #", i, " replaced with '", newstr, "'\n", sep="")
}
# }
}
# Write the list of lines to a file
newfn = paste(fn, "_new", sep="")
write.table(lines, file=newfn, quote=FALSE, append=FALSE, sep="", row.names = FALSE, col.names=FALSE)
return(newfn)
}
# Null lines that match
null_lines <- function(list_of_lines, str_to_match)
{
list_of_lines2 = c()
for (i in 1:length(list_of_lines))
{
if (grepl(str_to_match, list_of_lines[i]))
{
# This, in effect, deletes the row containing the string
blah = 0
}
else
{
list_of_lines2 = c(list_of_lines2, list_of_lines[i])
}
}
return(list_of_lines2)
}
# Concatenate a list of files into one big file
cat_files <- function(infiles_list, outfn, tmpskip=0)
{
new_lines = c()
list_num_files = c()
for (i in 1:length(infiles_list))
{
# input file
infn = infiles_list[i]
lines = scan(infn, what="character", sep="\n", skip=tmpskip)
list_num_files = c(list_num_files, length(lines))
new_lines = c(new_lines, lines)
}
new_lines = write.table(new_lines, file=outfn, quote=FALSE, append=FALSE, sep="", row.names = FALSE, col.names=FALSE)
return(list_num_files)
}
print_table_sigs <- function(x, numsig=4, printout=FALSE)
{
new.table = signif_digits_df(dfnums_to_numeric(x), numsig, printout)
if (printout == TRUE)
{
cat("\n")
print(new.table)
cat("\n")
}
return(new.table)
}
print_table_row <- function(x, numsig=4, printout=FALSE)
{
require(gdata) # for "trim" function
if (length(numsig) == 1)
{
tmplist = signif(as.numeric(x), numsig)
}
else
{
tmplist = rep(NA, length(numsig))
for (i in 1:length(numsig))
{
cmdstr = paste('tmplist[i] = sprintf("%1.', numsig[i], 'f", x[i])', sep="")
eval(parse(text = cmdstr))
}
}
outstr = ""
for (item in tmplist)
{
outstr = paste(outstr, trim(item), sep=" ")
}
print(paste(tmplist, sep=" "), quote=FALSE)
#cat("\n")
return(tmplist)
}
df_everything_to_char <- function(dtf, max_NAs=0.5)
{
numcols = ncol(dtf)
for (i in 1:numcols)
{
# Convert each column to character
dtf[[i]] = as.character(dtf[[i]])
}
return(dtf)
}
df_everything_to_factor <- function(dtf, max_NAs=0.5)
{
numcols = ncol(dtf)
for (i in 1:numcols)
{
# Convert each column to factor
dtf[[i]] = as.factor(dtf[[i]])
}
return(dtf)
}
df_factors_to_char <- function(dtf, max_NAs=0.5)
{
dtf_classes = cls.df(dtf, printout=TRUE)
dtf_names = names(dtf)
numcols = ncol(dtf)
cls_col_list = c()
for (i in 1:numcols)
{
# Get one column:
cmdstr = paste("cls_col = class(dtf$'", dtf_names[i], "')", sep="")
eval(parse(text = cmdstr))
#cat(i, ": ", dtf_names[i], " = ", cls_col, "\n", sep="")
cls_col_list[i] = cls_col
}
for (i in 1:numcols)
{
if (cls_col_list[i] == "factor")
{
# Get one column, convert to character:
cmdstr = paste("newcol = as.character(dtf$", dtf_names[i], ")", sep="")
eval(parse(text = cmdstr))
cmdstr = paste("dtf$", dtf_names[i], " = newcol", sep="")
eval(parse(text = cmdstr))
}
}
tmp_classes = cls.df(dtf)
dtf_classes$newclasses = tmp_classes[,ncol(tmp_classes)]
cat("\n")
cat("dfnums_to_numeric(dtf, max_NAs=", max_NAs, ") reports: dataframe 'dtf_classes' has ", nrow(dtf_classes), " rows, ", ncol(dtf_classes), " columns.\n", sep="")
cat("...names() and classes() of each column below...\n", sep="")
cat("\n")
print(dtf_classes)
return(dtf)
}
# Remove factors from the character-like columns of a data.frame
# (leaves the numbers as numbers)
df_nonum_factors_to_char <- function(dtf, max_NAs=0.5)
{
dtf_classes = cls.df(dtf, printout=TRUE)
dtf_names = names(dtf)
numcols = ncol(dtf)
cls_col_list = c()
for (i in 1:numcols)
{
# Get one column:
cmdstr = paste("cls_col = class(dtf$'", dtf_names[i], "')", sep="")
eval(parse(text = cmdstr))
#cat(i, ": ", dtf_names[i], " = ", cls_col, "\n", sep="")
cls_col_list[i] = cls_col
}
for (i in 1:numcols)
{
if (cls_col_list[i] == "factor")
{
# Get one column, convert to character:
cmdstr = paste("newcol = as.character(dtf$", dtf_names[i], ")", sep="")
eval(parse(text = cmdstr))
cmdstr = paste("dtf$", dtf_names[i], " = newcol", sep="")
eval(parse(text = cmdstr))
}
}
tmp_classes = cls.df(dtf)
dtf_classes$newclasses = tmp_classes[,ncol(tmp_classes)]
cat("\n")
cat("dfnums_to_numeric(dtf, max_NAs=", max_NAs, ") reports: dataframe 'dtf_classes' has ", nrow(dtf_classes), " rows, ", ncol(dtf_classes), " columns.\n", sep="")
cat("...names() and classes() of each column below...\n", sep="")
cat("\n")
print(dtf_classes)
return(dtf)
}
# dput on an S4 object, or an S4 object within an S3 object, or something
# Source:
# http://stackoverflow.com/questions/3466599/dputting-an-s4-object
dput2 <- function (x,
file = "",
control = c("keepNA", "keepInteger", "showAttributes")){
if (is.character(file))
if (nzchar(file)) {
file <- file(file, "wt")
on.exit(close(file))
}
else file <- stdout()
opts <- .deparseOpts(control)
if (isS4(x)) {
cat("new(\"", class(x), "\"\n", file = file, sep = "")
for (n in slotNames(x)) {
cat(" ,", n, "= ", file = file)
dput2(slot(x, n), file = file, control = control)
}
cat(")\n", file = file)
invisible()
} else if(length(grep('@',capture.output(str(x)))) > 0){
if(is.list(x)){
cat("list(\n", file = file, sep = "")
for (i in 1:length(x)) {
if(!is.null(names(x))){
n <- names(x)[i]
if(n != ''){
cat(" ,", n, "= ", file = file)
}
}
dput2(x[[i]], file = file, control = control)
}
cat(")\n", file = file)
invisible()
} else {
stop('S4 objects are only handled if they are contained within an S4 object or a list object')
}
}
else .Internal(dput(x, file, opts))
}
# dput on an S4 object, or an S4 object within an S3 object, etc.
# Source:
# http://stackoverflow.com/questions/3466599/dputting-an-s4-object
# (couldn't get a modified version to insert the commas appropriately,
# so this is just a reformatted copy of dput2)
dput2a <- function (x, file = "", control = c("keepNA", "keepInteger", "showAttributes"))
{
if (is.character(file))
{
if (nzchar(file))
{
file <- file(file, "wt")
on.exit(close(file))
}
else
{
file <- stdout()
}
}
opts <- .deparseOpts(control)
if (isS4(x)) # if #1
{
cat("new(\"", class(x), "\"\n", file = file, sep = "")
for (n in slotNames(x))
{
cat(" ,", n, "= ", file = file)
dput2a(slot(x, n), file = file, control = control)
}
cat(")\n", file = file)
invisible()
} else if(length(grep('@',capture.output(str(x)))) > 0) # if #2
{
if(is.list(x))
{
cat("list(\n", file = file, sep = "")
for (i in 1:length(x))
{
if(!is.null(names(x)))
{
n <- names(x)[i]
if(n != '')
{
cat(" ,", n, "= ", file = file)
}
}
dput2a(x[[i]], file = file, control = control)
}
cat(")\n", file = file)
invisible()
} else {
stop('S4 objects are only handled if they are contained within an S4 object or a list object')
}
} else { #if #3
.Internal(dput(x, file, opts))
}
}
# multiline sed; the example below fixed dput2 output on S4 objects
# NOTE: this builds and RETURNS the sed command string; run it with system()
multiline_sed <- function(fn, patternstr, newstr, outfn="sed_result.txt")
{
# R requires \\n here (for system() command)
# UNIX/Terminal will just want \n
#patternstr = ')\\nnew(\"Polygons\"'
#newstr = '),new("Polygons"'
# Do find/replace on mulitline (removes \n but who cares)
# This sed works, although it removes the \n:
#sed -n '1h;1!H;${;g;s|)\nnew(\"Polygons\"|),new("Polygons"|g;p;}' tmppoly2.txt > tmppoly2_edited.txt;
k = 0
cmds = NULL
# Starter; from here:
# http://austinmatzko.com/2008/04/26/sed-multi-line-search-and-replace/
cmds[[(k=k+1)]] = "sed -n '1h;1!H;${;g;"
# preface to starting string
cmds[[(k=k+1)]] = "s|"
# pattern string; user specifies escapes etc.
#cmds[[(k=k+1)]] = ')\\nnew(\"Polygons\"'
cmds[[(k=k+1)]] = patternstr
# transition to replace string
cmds[[(k=k+1)]] = "|"
# replacement string; user specifies escapes etc.
#cmds[[(k=k+1)]] = '),new("Polygons"'
cmds[[(k=k+1)]] = newstr
# End sed
cmds[[(k=k+1)]] = "|g;p;}'"
# input filename and output filename
#outfn = gsub(pattern="\\.", replacement="_edited.", x=fn)
#cmds[[(k=k+1)]] = paste(" ", fn, " > sed_result.txt;", sep="")
cmds[[(k=k+1)]] = paste(" ", fn, " > ", outfn, ";", sep="")
cmdstr = paste(cmds, collapse="", sep="")
return(cmdstr)
}
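# Usage sketch, using the pattern/replacement from the comments above:
# cmdstr = multiline_sed(fn="tmppoly2.txt", patternstr=')\\nnew(\"Polygons\"', newstr='),new("Polygons"', outfn="tmppoly2_edited.txt")
# system(cmdstr)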
moving_average <- function(xvals, yvals, byvals)
{
# moving averages
intervals = seq(from=min(xvals), to=max(xvals), by=byvals)
xmeans = c()
ymeans = c()
for (i in 1:(length(intervals)-1))
{
starti = intervals[i]
endi = intervals[i+1]
indices1 = xvals >= starti
indices2 = xvals < endi
indices_in_range = (indices1 + indices2) > 1
vals = yvals[indices_in_range]
if (length(vals) > 0)
{
xmean = mean(c(starti, endi))
ymean = mean(vals)
xmeans = c(xmeans, xmean)
ymeans = c(ymeans, ymean)
}
}
tmp = cbind(xmeans, ymeans)
moving_avgs = data.frame(tmp)
return(moving_avgs)
}
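# Usage sketch with made-up data:
# xvals = 1:10
# yvals = (1:10)^2
# moving_average(xvals, yvals, byvals=2)   # mean of y within each x-bin of width 2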
extract_lines_startnum_to_flag <- function(lines, row_to_start, string_to_stop_at)
{
doneflag = 0
lines_to_keep = c()
lines_to_keep_i = 0
for (i in 1:length(lines))
{
line = lines[i]
# skip lines before row_to_start
if (i < row_to_start)
{
blah = 0
}
else
{
if (doneflag == 1)
{
blah = 0
}
else
{
# if the line matches string_to_stop_at, you are done...
if (grepl(string_to_stop_at, line))
{
#print("yo!")
doneflag = 1
}
else
{
lines_to_keep_i = lines_to_keep_i + 1
print(paste("Storing line #", lines_to_keep_i, sep=""))
lines_to_keep[[lines_to_keep_i]] = line
}
}
}
}
return(lines_to_keep)
}
# This will only extract 1 group between 1st startflag & 1st doneflag
extract_lines_startstr_to_endstr <- function(lines, string_to_start_at, string_to_end_at, printflag=TRUE, include_endstring=FALSE, instance_to_find=1)
{
startflag = 0
doneflag = 0
lines_to_keep = c()
lines_to_keep_i = 0
instance = 0
for (i in 1:length(lines))
{
line = lines[i]
#print(paste(string_to_start_at, line))
if (grepl(string_to_start_at, line))
{
instance = instance+1
if (instance == instance_to_find)
{
startflag = 1
}
}
# Only check for the doneflag, *IF YOU HAVE ALREADY FOUND THE STARTFLAG*
if (startflag == 1)
{
if (grepl(string_to_end_at, line))
{
doneflag = 1
}
}
# end if done; if not, check startflag
if (doneflag == 1)
{
blah = 0
if(printflag)
{
print("Found string_to_end_at, exiting extract_lines_startstr_to_endstr()...")
}
# Include the last line, if desired
if (include_endstring == TRUE)
{
lines_to_keep_i = lines_to_keep_i + 1
if(printflag)
{
print(paste("Storing ", lines_to_keep_i, "th line, line #", i, sep=""))
}
lines_to_keep[[lines_to_keep_i]] = line
return(lines_to_keep)
}
}
else
{
# MAIN inclusion of lines
if (startflag == 1)
{
lines_to_keep_i = lines_to_keep_i + 1
#print(lines_to_keep_i)
if(printflag)
{
print(paste("Storing ", lines_to_keep_i, "th line, line #", i, sep=""))
}
lines_to_keep[[lines_to_keep_i]] = line
}
else
{
blah = 0
}
}
}
# Check if you've got the correct instance
return(lines_to_keep)
}
# Extract the strings that have a certain word at a certain position
extract_lines_with_word_at_wordnum <- function(lines, word_to_find, wordnum=1, delimiter=" ", printflag=FALSE)
{
linenums_to_keep = c()
for (i in 1:length(lines))
{
tmpline = lines[i]
if (tmpline == "")
{
next
}
words = strsplit(tmpline, split=delimiter)[[1]]
#print(words)
if (words[wordnum] == word_to_find)
{
#print(words[wordnum])
linenums_to_keep[length(linenums_to_keep)+1] = i
if(printflag)
{
print(paste("Storing line #", i, sep=""))
}
}
}
#print(linenums_to_keep)
newlines = lines[linenums_to_keep]
return(newlines)
}
# This is super-slow
array_to_text_sucky <- function(inarray, spacer)
{
tmpstr = inarray[1]
for (i in 2:length(inarray))
{
tmpstr = paste(tmpstr, inarray[i], sep=spacer)
}
return(tmpstr)
}
array_to_text <- function(inarray, spacer)
{
tmpstr = paste(as.character(inarray), sep="", collapse=spacer)
return(tmpstr)
}
minmax_pretty <- function(x)
{
minmax = c( min(pretty(x)), max(pretty(x)) )
return(minmax)
}
minmax_literal <- function(x)
{
minmax = c( min(x, na.rm=TRUE), max(x, na.rm=TRUE) )
return(minmax)
}
convert_nums_to_circular_lat <- function(nums, maxval=90)
{
mini = floor(min(nums))
maxi = ceiling(max(nums))
nums5 = rep(0, length(nums))
# go through 0-maxval, maxval-2*maxval, 2*maxval-3*maxval, 3*maxval-4*maxval
# translates:0-maxval, maxval-0, 0- -maxval, -maxval-0
edgeval = maxval * 4
# get the nums in terms of edgevals
nums1 = nums/edgeval
# Don't change numbers between -maxval and maxval, but change the rest of them...
##############################################
# degrees > 0 (positive)
##############################################
posvals = nums > 0
# remove all cycles except the last
nums2 = nums1[posvals] - floor(nums1[posvals])
# convert back to lat:
nums3 = nums2 * edgeval
nums4 = nums3
vals_to_change = (nums3 > maxval) + (nums3 <= 2*maxval) == 2
nums4[vals_to_change] = 2*maxval - nums3[vals_to_change]
vals_to_change = (nums3 > 2*maxval) + (nums3 <= 3*maxval) == 2
nums4[vals_to_change] = -1*(nums3[vals_to_change] - 2*maxval)
vals_to_change = (nums3 > 3*maxval) + (nums3 <= 4*maxval) == 2
nums4[vals_to_change] = 4*maxval - nums3[vals_to_change]
nums5[posvals] = nums4
##############################################
# degrees < 0 (negative)
##############################################
negvals = nums < 0
# remove all cycles except the last
nums2 = nums1[negvals] - ceiling(nums1[negvals])
# convert back to lat:
nums3 = nums2 * edgeval
nums4 = nums3
vals_to_change = (nums3 < -maxval) + (nums3 >= -2*maxval) == 2
nums4[vals_to_change] = -2*maxval - nums3[vals_to_change]
vals_to_change = (nums3 < -2*maxval) + (nums3 >= -3*maxval) == 2
nums4[vals_to_change] = -1*(nums3[vals_to_change] - -1*2*maxval)
vals_to_change = (nums3 < -3*maxval) + (nums3 >= -4*maxval) == 2
nums4[vals_to_change] = -4*maxval - nums3[vals_to_change]
nums5[negvals] = nums4
return(nums5)
}
convert_nums_to_long <- function(nums, maxval=180)
{
# get the nums in terms of 180s
nums0 = nums
nums1 = nums
# For nums > 0
TFnums_gt0 = nums > 0
nums1[TFnums_gt0] = nums0[TFnums_gt0]/maxval
# remove all cycles except the last
nums1[TFnums_gt0] = nums1[TFnums_gt0] - floor(nums1[TFnums_gt0])
# convert back to lat:
nums1[TFnums_gt0] = nums1[TFnums_gt0] * maxval
# For nums < 0
TFnums_lt0 = nums < 0
nums1[TFnums_lt0] = nums0[TFnums_lt0]/maxval
# remove all cycles except the last
nums1[TFnums_lt0] = nums1[TFnums_lt0] - ceiling(nums1[TFnums_lt0])
# convert back to lat:
nums1[TFnums_lt0] = nums1[TFnums_lt0] * maxval
return(nums1)
}
# Basic correlation plot
basic_xy_plot <- function(x, y, titlestart_txt="title\n", xlab=names(x), ylab=names(y), xlim=minmax_pretty(x), ylim=minmax_pretty(y))
{
cortest_xy = cor.test(x, y)
lm_xy = lm(y ~ x)
slope = lm_xy$coefficients[2]
intercept = lm_xy$coefficients[1]
slopetxt = round(lm_xy$coefficients[2], 4)
intercepttxt = round(lm_xy$coefficients[1], 4)
corval = round(cortest_xy$estimate, 2)
#pval = round(cortest_xy$p.value, 4)
pval = cortest_xy$p.value
titletxt = paste(titlestart_txt, "cor=", corval, "; slope=", slopetxt, "; int=", intercepttxt, "; p=", pval, sep="")
plot(x, y, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim)
# make segment
minx = min(x, na.rm=TRUE)
miny = min(y, na.rm=TRUE)
maxx = max(x, na.rm=TRUE)
maxy = max(y, na.rm=TRUE)
x0 = minx
x1 = maxx
y0 = slope*x0 + intercept
y1 = slope*x1 + intercept
segments(x0, y0, x1, y1)
title(titletxt)
}
plot_basic_xy <- function(x, y, titlestart_txt="title\n", xlab=names(x), ylab=names(y), xlim=minmax_pretty(x), ylim=minmax_pretty(y))
{
cortest_xy = cor.test(x, y)
lm_xy = lm(y ~ x)
slope = lm_xy$coefficients[2]
intercept = lm_xy$coefficients[1]
slopetxt = round(lm_xy$coefficients[2], 4)
intercepttxt = round(lm_xy$coefficients[1], 4)
corval = round(cortest_xy$estimate, 2)
pval = cortest_xy$p.value
titletxt = paste(titlestart_txt, "cor=", corval, "; slope=", slopetxt, "; int=", intercepttxt, "; p=", pval, sep="")
plot(x, y, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim)
# make segment
minx = min(x, na.rm=TRUE)
miny = min(y, na.rm=TRUE)
maxx = max(x, na.rm=TRUE)
maxy = max(y, na.rm=TRUE)
x0 = minx
x1 = maxx
y0 = slope*x0 + intercept
y1 = slope*x1 + intercept
segments(x0, y0, x1, y1)
title(titletxt)
}
add_lm_line_to_plot <- function(x, y, ...)
{
lm_xy = lm(y ~ x)
# DON'T ROUND FOR PLOTTING!!
#slope = round(lm_xy$coefficients[2], 2)
#intercept = round(lm_xy$coefficients[1], 2)
slope = lm_xy$coefficients[2]
intercept = lm_xy$coefficients[1]
# make segment
minx = min(x, na.rm=TRUE)
miny = min(y, na.rm=TRUE)
maxx = max(x, na.rm=TRUE)
maxy = max(y, na.rm=TRUE)
x0 = minx
x1 = maxx
y0 = slope*x0 + intercept
y1 = slope*x1 + intercept
segments(x0, y0, x1, y1, ...)
return(lm_xy)
}
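# Usage sketch with simulated data:
# x = rnorm(50)
# y = 2*x + rnorm(50)
# plot(x, y)
# lm_xy = add_lm_line_to_plot(x, y, col="red")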
# Fancy correlation plots
#http://www.personality-project.org/R/r.useful.html
#some fancy graphics -- adapted from help.cor
panel.cor.scale <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r = (cor(x, y,use="pairwise"))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex * abs(r))
}
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r = (cor(x, y,use="pairwise"))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex )
}
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...)
}
panel.points <- function(x, tmppch=20, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
par(pch=tmppch)
pts <- points(x, pch=".")
}
pairs.panels <- function (x,y,smooth=TRUE,scale=FALSE, tmppch=NULL)
{
oldpar = par("pch")
par(pch = tmppch)
if (smooth )
{
if (scale)
{
pairs(x, diag.panel=panel.hist,upper.panel=panel.cor.scale,lower.panel=panel.smooth)
#pairs(x, diag.panel=panel.hist,upper.panel=panel.cor.scale,lower.panel=function(x) panel.points(x) )
#pairs(x, diag.panel=panel.hist, upper.panel=panel.cor.scale, lower.panel=function(x) points(x, pch=tmppch))
#pairs(x, diag.panel=panel.hist, upper.panel=panel.cor.scale, lower.panel=panel.points)#, lower.panel=panel.points(x, pch=tmppch))
}
else
{
#pairs(x, diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=function(x) panel.points(x) )
pairs(x, diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
#pairs(x, diag.panel=panel.hist, upper.panel=panel.cor, lower.panel=function(x) points(x, pch=tmppch))
#pairs(x, diag.panel=panel.hist, upper.panel=panel.cor, lower.panel=panel.points)#, lower.panel=panel.points(x, pch=tmppch))
} #else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
}
else #smooth is not true
{
if (scale)
{
pairs(x, diag.panel=panel.hist,upper.panel=panel.cor.scale)
}
else
{
pairs(x, diag.panel=panel.hist,upper.panel=panel.cor)
}
} #end of else (smooth)
par(pch = oldpar)
} #end of function
pairs.panels.lm <- function (x,y,tmplm=TRUE,scale=FALSE, tmppch=NULL)
{
oldpar = par("pch")
par(pch = tmppch)
if (tmplm)
{
if (scale)
{
pairs(x, diag.panel=panel.hist, upper.panel=panel.cor.scale, lower.panel=panel.smooth, span=0.5, f=0.5)
}
else
{
pairs(x, diag.panel=panel.hist, upper.panel=panel.cor, lower.panel=panel.smooth)
} #else {pairs(x,diag.panel=panel.hist,upper.panel=panel.cor,lower.panel=panel.smooth)
}
else #smooth is not true
{
if (scale)
{
pairs(x, diag.panel=panel.hist,upper.panel=panel.cor.scale)
}
else
{
pairs(x, diag.panel=panel.hist,upper.panel=panel.cor) }
} #end of else (smooth)
par(pch = oldpar)
} #end of function
# function to replace the upper triangle of a matrix with unattenuated
# correlations, and the diagonal with reliabilities;
# input is a correlation matrix and a vector of reliabilities
correct.cor <- function(x, y)
{
n = dim(x)[1]
x[1,1] <- y[1,1]
for (i in 2:n)
{
x[i,i] <- y[1,i] # put reliabilities on the diagonal
k = i-1
for (j in 1:k)
{
x[j,i] <- x[j,i]/sqrt(y[1,i]*y[1,j]) # fix the upper triangular part of the matrix
}
}
return(x)
}
# difference of dependent (paired) correlations (following Steiger, J., 1980,
# Tests for comparing elements of a correlation matrix. Psychological
# Bulletin, 87, 245-251)
paired.r <- function(xy, xz, yz, n)
{
diff <- xy-xz
determin = 1 - xy*xy - xz*xz - yz*yz + 2*xy*xz*yz
av = (xy+xz)/2
cube = (1-yz)*(1-yz)*(1-yz)
t2 = diff * sqrt((n-1)*(1+yz)/(((2*(n-1)/(n-3))*determin + av*av*cube)))
return(t2)
}
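# Usage sketch: test whether r(x,y)=0.50 differs from r(x,z)=0.30,
# given r(y,z)=0.40 and n=100 subjects (returns Steiger's t2):
# t2 = paired.r(xy=0.50, xz=0.30, yz=0.40, n=100)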
fisherz <- function(rho) {0.5*log((1+rho)/(1-rho)) } #converts r to z
###################################################
# LaTeX-related functions
###################################################
#
# to run texi2dvi
library(tools)
# You also need a well-set-up .Rprofile file to make it work (see old
# or current .Rprofile, just one line)
# Might not need much more than:
############################################################################
# See /Dropbox/_njm/_njm/LaTeX_functioning/ for a detailed example of LaTeX / Sweave
# Rnw Snw etc. working...
#############################################################################
## Set working directory
##wd = "/Users/nick/Desktop/_IB286_Marshall_paleobio/"
#wd = "/Users/nick/Desktop/_2010-10-12_work/_Marshall_PBDB/_IB286_Marshall_paleobio/"
#wd = "/Dropbox/_njm/_njm/LaTex_functioning/"
#setwd(wd)
runlatex <- function(fn, wd=getwd())
{
# Setup directories for LaTex files and the working directories
# Old, 2011 version -- still seems to work (2013-02-13, NJM)
swwds = c("/usr/local/texlive/2011basic/texmf-dist/tex/latex/base/", wd, "/usr/texbin/")
# New, 2012 version
#swwds = c("/usr/local/texlive/2012basic/texmf-dist/tex/latex/base/", wd, "/usr/texbin/")
# New version, installed from:
# /Users/nickm/Downloads/install-tl-unx.tar.gz
# cd /Users/nickm/Downloads/install-tl-20130128
# sudo ./install-tl
# # This installed to:
# Add /usr/local/texlive/2012/texmf/doc/man to MANPATH, if not dynamically determined.
# Add /usr/local/texlive/2012/texmf/doc/info to INFOPATH.
# Most importantly, add /usr/local/texlive/2012/bin/universal-darwin
# to your PATH for current and future sessions.
#
# cd ~
# bbedit .bash_profile
# bbedit .Rprofile
# These commands build and open the Sweave tex and PDF files...
# permfn = "example-2.Snw"
# permfn = "hw2_v2.Snw"
permfn = fn
Sweave(permfn)
texfn = sub(".Snw", ".tex", permfn)
texi2dvi(texfn, pdf="TRUE", quiet=FALSE, texinputs=swwds)
pdffn = sub(".Snw", ".pdf", permfn)
cmdstr = paste("open ", pdffn, sep="")
system(cmdstr)
return(cmdstr)
}
##############################################################
# TreeRogue 0.1: TreeThief-like tree grabbing using x,y
# coordinates digitized from an image of a phylogenetic tree.
##############################################################
# GOAL: to process x, y coordinates into a Newick-format tree
##############################################################
# by Nick Matzke
# Copyright 2010
# matzke@berkeley.edu
# 10/27/2010
##############################################################
#
# Free to use/redistribute under GNU General Public License, version 2
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
#
###################################################################
#
# ===================================================
# Run with these commands
# ===================================================
# library(ape)
#
# put the text files (at bottom) into your working directory
# wd = "/Users/nick/Desktop/__projects/2010-11-01_cone_snails/"
# setwd(wd)
#
# xy2 = treerogue_read_files()
# xy3 = treerogue_associate_branch_bottoms_with_nodes(xy2)
# tr = build_tree_using_corners(xy3)
# plot(tr)
#
####################################################################
#
# NOTES
# *Heavily* modified from a very limited script posted here by
# bbolker@gmail.com:
# https://stat.ethz.ch/pipermail/r-sig-phylo/2008-November/000173.html
#
# Background: I worked up this script after getting frustrated at
# (1) the failure of the famous "TreeThief" to work on any modern machine;
# (2) my failure to get the newer "TreeSnatcher" to work on Mac OS X 10.4.11
# (some weirdness about Java and X11, as far as I can tell), and
# (3) no other good options.
#
# Summary: This script takes in x,y coordinates (with the lower left as the origin)
# of nodes, tips, and branch bottoms ("corners"), and builds a tree out of it.
#
# It assumes:
# (1) Your tree is horizontal left-to-right, with the tips on the right
# (2) Your tree is a "square" tree (i.e. no diagonal/curved branches)
#
# I captured my x,y coordinates using GraphClick 3.0, available for $8 at:
# http://www.arizona-software.ch/graphclick/
#
# (there is a free trial download, but only lets you export 10 coordinates at a
# time, so it is pointless)
#
# REQUIRED INPUTS:
# (for each, digitize the relevant points in GraphClick, and
# File-->Export to a text file):
#
# (Note: all text files should have a header line)
#
# 1. A tab-delimited text file with x and y for each internal node
#
# 2. A tab-delimited text file with x and y for each tip
#
# 2a. A text file with tip names, in order from top-to-bottom
#
# 3. A tab-delimited text file with x and y for each tip for each "corner"
# (i.e., the bottom of each branch).
#
# 4. For now, do NOT digitize the bottom of the root of the tree, if the
# image you are digitizing has one. You could add the root length later
# manually, if you like.
#
# 5. The tree must be fully dichotomous. (If the image you are digitizing is not,
# you can "fake it" by clicking extra digitization points to, in effect,
# manually resolve the polytomies with very short branches.
# Note that you will have to add a corner for each internal node you add (!).
#
# The R APE package can collapse short branches to polytomies later, if you like.
#
# Trees do not have to be ultrametric, and digitization does not have to be
# exact -- the script will attempt to match the most likely branch bottoms
# to the nodes (a graphic is produced by R so that you can check the results
# and tinker if necessary).
#
# Requires the APE library.
#
#############################################################
library(ape)
# Assumes default filenames
treerogue_read_files <- function(internal_nodes_file="internal.txt", tip_nodes_file="tips.txt", corner_nodes_file="corners.txt", tipnames_file="tipnames.txt", txt_file_type="txt", points_files_have_column_headings=TRUE, tipnames_file_has_column_heading=TRUE)
{
defaults='
# Mac / Nick / Raiff
internal_nodes_file="internal.txt"
tip_nodes_file="tips.txt"
corner_nodes_file="corners.txt"
tipnames_file="tipnames.txt"
txt_file_type="txt"
points_files_have_column_headings=TRUE
tipnames_file_has_column_heading=TRUE
# Windows/Raji
internal_nodes_file="Nodes.csv"
tip_nodes_file="Tips.csv"
corner_nodes_file="Corners.csv"
tipnames_file="tipnames.txt"
txt_file_type="csv"
points_files_have_column_headings=FALSE
tipnames_file_has_column_heading=TRUE
'
# The text files can be space-delimited text (.txt) or comma-delimited (.csv)
if (txt_file_type == "txt")
{
internal = read.table(file=internal_nodes_file, header=points_files_have_column_headings, skip=0, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
tips = read.table(file=tip_nodes_file, header=points_files_have_column_headings, skip=0, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
corners = read.table(file=corner_nodes_file, header=points_files_have_column_headings, skip=0, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
tipnames = read.table(file=tipnames_file, header=tipnames_file_has_column_heading, skip=0, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
}
# The text files can be space-delimited text (.txt) or comma-delimited (.csv)
if (txt_file_type == "csv")
{
internal = read.table(file=internal_nodes_file, header=points_files_have_column_headings, skip=0, sep=",", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
tips = read.table(file=tip_nodes_file, header=points_files_have_column_headings, skip=0, sep=",", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
corners = read.table(file=corner_nodes_file, header=points_files_have_column_headings, skip=0, sep=",", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
tipnames = read.table(file=tipnames_file, header=tipnames_file_has_column_heading, skip=0, sep=",", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
}
if (points_files_have_column_headings == FALSE)
{
names(internal) = c("x","y")
names(tips) = c("x","y")
names(corners) = c("x","y")
names(tipnames) = "tipnames"
}
if (tipnames_file_has_column_heading == FALSE)
{
names(tipnames) = "tipnames"
}
# combine all the points for plotting purposes
allpoints = rbind(corners, tips, internal)
# Setup the plot
plot(allpoints, pch=".", col="white")
# Plot the points
points(tips, pch="t", col="red")
title("Plotting your tips (t), internal (i), corners (c). Look especially for\noutlying and missing points.")
points(internal, pch="i", col="blue")
points(corners, pch="c", col="green")
points(corners, pch=".", col="black")
# sort the tips from top to bottom in y
tips = tips[order(tips$y, decreasing = TRUE), ]
# sort the internals from left to right in x
internal = internal[order(internal$x, decreasing=FALSE), ]
if (nrow(tips) != nrow(tipnames))
{
print("ERROR: the number of tips must equal the length of the tipnames!")
print(paste("Instead, nrow(tipnames) =", nrow(tipnames), "and nrow(tips) =", nrow(tips), sep=" "))
}
if ((nrow(tips)-1) != nrow(internal))
{
print("ERROR: the number of tips-1 must equal the number of the internal nodes!")
print(paste("Instead, nrow(tips) =", nrow(tips), "and nrow(internal) =", nrow(internal), sep=" "))
}
nodetypes = c(rep("tip", nrow(tipnames)), rep("internal", nrow(internal)))
nodenames = unlist(c(tipnames, rep("", nrow(internal))))
xy = rbind(tips, internal)
xy2 = cbind(xy, nodetypes, nodenames)
xy2 = as.data.frame(xy2)
names(xy2) = c("x", "y", "nodetypes", "nodenames")
xy2 = df_nonum_factors_to_char(xy2, max_NAs=0.5)
xy2$nodetypes[xy2$x == min(xy2$x)] = "root"
if (nrow(corners) != (nrow(xy2)-1))
{
print("ERROR: the number of corners must equal the number of nodes-1 !")
print(paste("Instead, length(nodes) =", nrow(xy2), "and nrow(corners) =", nrow(corners), sep=" "))
}
# sort file so that the tips are first
# tip nodes in order from top to bottom:
xytips = xy2[xy2$nodenames != "", ]
tips_in_order = xy2$nodenames[xy2$nodenames != ""]
return(xy2)
}
treerogue_associate_branch_bottoms_with_nodes <- function(xy2, corner_nodes_file="corners.txt", points_files_have_column_headings=TRUE, txt_file_type="txt")
{
defaults='
# Mac / Nick / Raiff
corner_nodes_file="corners.txt"
points_files_have_column_headings=TRUE
# Windows/Raji
corner_nodes_file="Corners.csv"
points_files_have_column_headings=FALSE
'
# Load the coordinates of the corners
# The text files can be space-delimited text (.txt; sep=" ") or comma-delimited (.csv)
if (txt_file_type == "txt")
{
corners = read.table(file=corner_nodes_file, header=points_files_have_column_headings, skip=0, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
}
if (txt_file_type == "csv")
{
corners = read.table(file=corner_nodes_file, header=points_files_have_column_headings, skip=0, sep=",", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
}
if (points_files_have_column_headings == FALSE)
{
names(corners) = c("x","y")
}
# Process the corners and attach them to nodes
bots = corners
names(bots) = c("x", "y")
# Get xy data
xx = xy2$x
yy = xy2$y
dtf = associate_branch_bottoms_with_nodes(xx, yy, bots)
xy3 = cbind(xy2, dtf$chosen_node_bottoms_xx, dtf$chosen_node_bottoms_yy)
names(xy3) = c(names(xy2), "cx", "cy")
write.table(xy3, "linked_corners.txt", quote=FALSE, sep=" ", row.names = FALSE,
col.names = TRUE)
return(xy3)
}
check_for_duplicate_points <- function(bots, pointtype="default")
{
# Get the list of distances
distbots_matrix = as.matrix(dist(bots))
# Mask the zero self-to-self distances on the diagonal
diag(distbots_matrix) = Inf
bots_distances = c(distbots_matrix)
n1 = 1:nrow(bots)
n2 = 1:nrow(bots)
closest10 = sort(bots_distances)[1:10]
tmpstr = paste("\nChecking for identical or near-identical ", pointtype, ". Most likely:\n", sep="")
cat(tmpstr)
tmpstr = paste("If you have any distances super-near zero, check for duplicate points.\n", sep="")
cat(tmpstr)
closest_bots = NULL
# For each of the closest 10 distances,
for (i in 1:length(closest10))
{
# Find bots distances that match EXACTLY
TF = bots_distances == closest10[i]
TFmat = matrix(TF, nrow=nrow(distbots_matrix), ncol=ncol(distbots_matrix), byrow=FALSE)
sumTF_rows = apply(TFmat, 1, sum)
sumTF_cols = apply(TFmat, 2, sum)
# This causes a problem, if there are identical bots_distances, and thus 2 instead of 1 in the sum
#for (j in 1:sum(sumTF_rows))
for (j in 1:sum(sumTF_rows>0))
{
# Debugging
#cat(i, j, "\n", sep=", ")
#print(n1)
#print(sumTF_rows)
#print(n1[sumTF_rows > 0])
nums1 = n1[sumTF_rows > 0][j]
#nums2 = n2[sumTF_cols > 0][j]
bots1 = bots[sumTF_rows > 0, ][j,]
bots1 = unlist(bots1)
tmprow = c(round(closest10[i], 6), nums1, bots1)
closest_bots = rbind(closest_bots, tmprow)
}
}
closest_bots
closest_bots = adf2(closest_bots)
tmpnames = c("dist", "rownum", "x", "y")
names(closest_bots) = tmpnames
return(closest_bots)
}
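# Minimal usage sketch for check_for_duplicate_points(): screen digitized
# corner points for accidental double-clicks (the made-up points below put
# a deliberate duplicate at rows 2 & 3, which shows up as dist = 0):
duplicate_points_example = '
corners = data.frame(x=c(1.0, 2.0, 2.0, 3.5), y=c(5.0, 4.0, 4.0, 2.5))
closest = check_for_duplicate_points(bots=corners, pointtype="corners")
print(closest)
'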
# Associate branch bottom coordinates with nodes, and plot the results;
# The user may then edit the output associations if they so desire.
associate_branch_bottoms_with_nodes <- function(xx, yy, bots)
{
# There should be one less branch bottom than there are nodes (tips + internals),
# because the root node should have no digitized "corner" beneath it
nodes = 1:length(xx)
if (length(nodes) != nrow(bots) +1)
{
print("ERROR: the number of corners must equal the number of nodes-1 !")
print(paste("Instead, length(nodes) =", length(nodes), "and nrow(bots) =", nrow(bots), sep=" "))
} else {
#######################################
# Check for duplicate points
#######################################
closest_bots = check_for_duplicate_points(bots=bots, pointtype="corners")
print(closest_bots)
closest_tops = check_for_duplicate_points(bots=cbind(xx,yy), pointtype="tops")
print(closest_tops)
# OK, find bottom of branch to go with each top of branch
# an array saying which branches have a bottom
node_with_no_bottom = rep(TRUE, length(nodes))
# these are the remaining branch bottoms that have not been associated yet
bots_with_no_top = rep(TRUE, nrow(bots))
bxx = bots$x
byy = bots$y
botsnodes = 1:nrow(bots)
# Empty values to hold nodes
chosen_node_bottoms_xx = rep(NA, length(xx))
chosen_node_bottoms_yy = rep(NA, length(yy))
i=0
while (sum(node_with_no_bottom) > 1)
{
i=i+1
#print(i)
# look for branch bottom coordinates in a narrow window to the left of the node
# basically (a) smallest slope and (b) closest distance in x
## find next node to include (the rightmost internal node)
maxvals = xx * node_with_no_bottom
nextnode <- which( maxvals == max(maxvals, na.rm=TRUE) )[1]
####################################
# Find the best matching branch bottom/corner for each node
####################################
# This is trial-and-error, you may have to plink to find a function
# that works.
# That, or do manual edits to the tree later...
####################################
ydist <- byy - yy[nextnode]
xdist <- bxx - xx[nextnode]
# Rank of the y distances
rank_ydist = rank(abs(ydist))
# calculate the slopes
xyslopes <- abs(ydist/xdist)
# the best ancestor will have a low slope to the branch bottom, and a short negative distance in x
xdist_neg = xdist
xdist_neg[xdist > 0] = 0
xdist_neg[xdist < 0] = -1 * xdist_neg[xdist < 0]
# normalize to units of minimum absolute distance
min_dist = (min(abs(xdist[xdist!=0]), na.rm=TRUE))
xdist_neg_norm = (xdist_neg / min_dist)
# short positive distances are penalized much more heavily (100x, below)
# than short negative distances
xdist_pos = xdist
xdist_pos[xdist < 0] = 0
xdist_pos_norm = (xdist_pos / min_dist) * 100
rank_xdist = rank_ydist
rank_xdist[xdist <= 0] = 1
rank_xdist[xdist > 0] = 10000000
###########################
# Plink here especially...
###########################
rank_slope = (xyslopes^2)
#final_rank = rank_ydist * abs(ydist) + xyslopes^0.5 * xdist_neg_norm + xdist_pos_norm
# This worked on dinos pt1
#final_rank = 10000*(rank_ydist * abs(ydist)) + xyslopes^0.5 * xdist_neg_norm * xdist_pos_norm
# This worked on dinos pt1
final_rank = 10000*(rank_ydist * abs(ydist)) + xyslopes^0.5 * xdist_neg_norm * xdist_pos_norm
###########################
branch_bot_fits = final_rank
best_fit = which(branch_bot_fits == min(branch_bot_fits, na.rm=TRUE))[1]
#bottom_to_add = botsnodes[bots_with_no_top][best_fit]
bottom_to_add = botsnodes[best_fit]
chosen_node_bottoms_xx[nextnode] = bxx[bottom_to_add]
chosen_node_bottoms_yy[nextnode] = byy[bottom_to_add]
bxx = bxx[-bottom_to_add]
byy = byy[-bottom_to_add]
# remove the node from the list needing branch bottoms
node_with_no_bottom[nextnode] = FALSE
bots_with_no_top[bottom_to_add] = FALSE
}
}
tmpdata = cbind(nodes, xx, yy, chosen_node_bottoms_xx, chosen_node_bottoms_yy)
#print(tmpdata)
dtf = as.data.frame(tmpdata)
plot(dtf$y, dtf$chosen_node_bottoms_yy)
title("Y-coord of branch tops vs. branch bottoms. *Should* be linear.")
plot(c(xx, bots$x), c(yy, bots$y), pch="")
points(xx, yy, pch="n")
points(bots$x, bots$y, pch="b")
title("Use this plot to check if branch bottoms match branch tops (tips/nodes)")
segments(xx, yy, chosen_node_bottoms_xx, chosen_node_bottoms_yy)
plot(c(xx, bots$x), c(yy, bots$y), pch="")
text(xx, yy, labels=paste("n", 1:length(xx), sep=""))
text(bots$x, bots$y, labels=paste("b", 1:nrow(bots), sep=""))
title("This plot has node numbers so you can manually hack it. However problems are\n almost always due to missing nodes or duplicate nodes while digitizing.")
segments(xx, yy, chosen_node_bottoms_xx, chosen_node_bottoms_yy)
return(dtf)
}
build_tree_using_corners <- function(xy3)
{
# define the tip.labels
tip.labels = xy3$nodenames
tip.labels = tip.labels[tip.labels != ""]
# (tip.labels is a local variable here, not a function argument, so a
# missing() check would error)
ntips <- length(tip.labels)
xx = xy3$x
yy = xy3$y
cx = xy3$cx
cy = xy3$cy
nodes <- 1:length(xx)
is.tip <- nodes <= ntips
# keep track of the nodes which are unlinked
unlinked_nodes = rep(TRUE, length(nodes))
# Checks (kinda) if internal nodes are ordered from left-to-right
if (which.min(xx) != ntips+1)
{
##
print("ERROR: reorder nodes the way ape/phylo expects! (tips first, then internals in order from left-to-right.")
#yy[internal] <- rev(yy[!is.tip])[order(xx[!is.tip])]
#xx[internal] <- rev(yy[!is.tip])[order(xx[!is.tip])]
}
edges <- matrix(nrow=0,ncol=2)
edge.length <- numeric(0)
nnode <- length(xx)-ntips
while (sum(unlinked_nodes) > 1)
{
## find next node to include (the rightmost internal node)
nextnode <- which(!is.tip & xx==max(xx[!is.tip]))[1]
## find daughters
# get the distance (in y) to all of the other corners
ydist <- yy-yy[nextnode]
xdist <- xx-xx[nextnode]
# Check if it's the root
if ( is.na(cy[nextnode]) )
{
cy[nextnode] = yy[nextnode]
cx[nextnode] = 0 # leftmost coordinate must be 0!
}
cydist <- yy-cy[nextnode]
cxdist <- xx-cx[nextnode]
# find the TWO tips closest to this internal node, which are RIGHT of this node
# this only finds the CLOSEST tip in Y, we want the TWO closest tips!!
#daughters <- which(is.tip & dist==min(dist[is.tip]))
# rank the ydistances in the y direction
rank_of_ydists = order(cydist)
# rank the xdistances in the x direction
rank_of_xdists = order(cxdist)
# get the node numbers in order; delete from this list as they are eliminated
nodes_to_keep = nodes
# daughter nodes must be to the right (in x) of the nextnode
# (and they must be unlinked)
nodes_to_keep = nodes_to_keep[unlinked_nodes][xdist[unlinked_nodes] > 0]
# daughter nodes should be the two closest corners to nextnode (in y, mostly AND x)
absolute_dist_from_node = 100*abs(cydist[nodes_to_keep]) + 1*abs(cxdist[nodes_to_keep])
# sort the distances
absolute_dist_from_node_sorted = sort(absolute_dist_from_node)
# take the 2nd smallest absolute distance
y_abs_dist_tokeep = absolute_dist_from_node_sorted[2]
nodes_to_keep_final = nodes_to_keep[absolute_dist_from_node <= y_abs_dist_tokeep]
print(paste("Linking: #", nodes_to_keep_final[1], " ", tip.labels[nodes_to_keep_final[1]], ", #", nodes_to_keep_final[2], " ", tip.labels[nodes_to_keep_final[2]], sep=""))
#daughters <- which(is.tip & dist==min(dist[is.tip]))
daughters = nodes_to_keep_final
## be careful with numeric fuzz?
edges <- rbind(edges,
nodes[c(nextnode,daughters[1])],
nodes[c(nextnode,daughters[2])])
edge.length <- c(edge.length,xx[daughters]-xx[nextnode])
# add nextnode to the list of tips (which are not available nodes for the nextnode)
is.tip[nextnode] <- TRUE
# remove the daughters & coordinates from the list of available nodes
unlinked_nodes[daughters] = FALSE
print(sum(unlinked_nodes))
#xx <- xx[-daughters]
#yy <- yy[-daughters]
# remove the daughters from the list of possible nodes to link
#unlinked_nodes
#is.tip <- is.tip[-daughters]
#nodes <- nodes[-daughters]
}
# NOTE: ape expects the element to be named "tip.label" (singular)
tr <- list(tip.label=tip.labels,
edge=edges,
edge.length=edge.length,
Nnode=nnode)
class(tr) <- "phylo"
tr <- reorder(tr)
return(tr)
}
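# Once the tree is built, the very short branches you clicked in to resolve
# polytomies (point 5 in the header above) can be collapsed back into true
# polytomies with ape's di2multi(). A minimal sketch (the tolerance is a
# per-dataset judgment call, here assumed to be 0.01 in the tree's units):
collapse_short_branches_example = '
tr2 = di2multi(tr, tol=0.01)
plot(tr2)
'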
# This attempts to build the tree without corners.
# This works OK for young nodes, but gets increasingly bad
# with older nodes, unless you have a perfectly symmetrical tree.
build_tree_without_using_corners <- function(xx, yy, tip.labels, poly=numeric(0), debug=FALSE)
{
# define the tips
if (!missing(tip.labels))
{
ntips <- length(tip.labels)
}
nodes <- 1:length(xx)
is.tip <- nodes <= ntips
# keep track of the nodes which are unlinked
unlinked_nodes = rep(TRUE, length(nodes))
if (which.min(xx) != ntips+1)
{
##
print("ERROR: reorder nodes the way ape/phylo expects! (tips first, then internals in order from left-to-right.")
#yy[internal] <- rev(yy[!is.tip])[order(xx[!is.tip])]
#xx[internal] <- rev(yy[!is.tip])[order(xx[!is.tip])]
}
edges <- matrix(nrow=0,ncol=2)
edge.length <- numeric(0)
nnode <- length(xx)-ntips
while (sum(unlinked_nodes) > 1)
{
## find next node to include (the rightmost internal node)
nextnode <- which(!is.tip & xx==max(xx[!is.tip]))[1]
## find daughters
# get the distance (in y) to all of the other nodes
ydist <- yy-yy[nextnode]
xdist <- xx-xx[nextnode]
# find the TWO tips closest to this internal node, which are RIGHT of this node
# this only finds the CLOSEST tip in Y, we want the TWO closest tips!!
#daughters <- which(is.tip & dist==min(dist[is.tip]))
# rank the ydistances in the y direction
rank_of_ydists = order(ydist)
# rank the xdistances in the x direction
rank_of_xdists = order(xdist)
# get the node numbers in order; delete from this list as they are eliminated
nodes_to_keep = nodes
# daughter nodes must be to the right (in x) of the nextnode
# (and they must be unlinked)
nodes_to_keep = nodes_to_keep[unlinked_nodes][xdist[unlinked_nodes] > 0]
# daughter nodes should be the two closest to nextnode (in y)
absolute_dist_from_node = abs(ydist[nodes_to_keep])
# sort the distances
absolute_dist_from_node_sorted = sort(absolute_dist_from_node)
# take the 2nd smallest absolute distance
y_abs_dist_tokeep = absolute_dist_from_node_sorted[2]
nodes_to_keep_final = nodes_to_keep[absolute_dist_from_node <= y_abs_dist_tokeep]
print(paste("Linking: #", nodes_to_keep_final[1], " ", tip.labels[nodes_to_keep_final[1]], ", #", nodes_to_keep_final[2], " ", tip.labels[nodes_to_keep_final[2]], sep=""))
#daughters <- which(is.tip & dist==min(dist[is.tip]))
daughters = nodes_to_keep_final
## be careful with numeric fuzz?
edges <- rbind(edges,
nodes[c(nextnode,daughters[1])],
nodes[c(nextnode,daughters[2])])
edge.length <- c(edge.length,xx[daughters]-xx[nextnode])
# add nextnode to the list of tips (which are not available nodes for the nextnode)
is.tip[nextnode] <- TRUE
# remove the daughters & coordinates from the list of available nodes
unlinked_nodes[daughters] = FALSE
#xx <- xx[-daughters]
#yy <- yy[-daughters]
# remove the daughters from the list of possible nodes to link
unlinked_nodes
#is.tip <- is.tip[-daughters]
#nodes <- nodes[-daughters]
}
# NOTE: ape expects the element to be named "tip.label" (singular)
zz <- list(tip.label=tip.labels,
edge=edges,
edge.length=edge.length,
Nnode=nnode)
class(zz) <- "phylo"
zz <- reorder(zz)
return(zz)
}
# Demo: steal the node coordinates from a plotted tree and rebuild it
# (the xypts argument is currently unused)
treethief <- function(xypts)
{
## from ?plot.tree:
cat("(((Strix_aluco:4.2,Asio_otus:4.2):3.1,",
"Athene_noctua:7.3):6.3,Tyto_alba:13.5);",
file = "ex.tre", sep = "\n")
tree.owls <- read.tree("ex.tre")
# the tree must be plotted before its coordinates can be fetched
plot(tree.owls)
unlink("ex.tre") # delete the file "ex.tre"
xy <- get("last_plot.phylo", envir=.PlotPhyloEnv)
xx <- xy$xx
yy <- xy$yy
points(xx, yy, col="white", pch=16, cex=2)
text(xx, yy, col=2, 1:length(xx))
newtree <- build_tree_without_using_corners(xx, yy, tree.owls$tip.label)
data(bird.orders)
plot(bird.orders, show.node.label=TRUE)
xy <- get("last_plot.phylo", envir=.PlotPhyloEnv)
# fetch the new coordinates BEFORE re-plotting the points
xx <- xy$xx
yy <- xy$yy
points(xx, yy, col="white", pch=16, cex=2)
text(xx, yy, col=2, 1:length(xx))
newtree2 <- build_tree_without_using_corners(xx, yy, bird.orders$tip.label)
}
####################################
# charnames_to_nexus_text
####################################
# Take a list of names, add numbers & quotes, to produce NEXUS character matrix input, e.g.:
#
# 1 'it_is1', 2 'it_is1_blank', 3 'it_is1a', 4 'it_is1b', 5 'it_is1c', 6 not1, 7 'not1_blank', 8 not1a, 9 not1b, 10 the1, 11 'the1_blank', 12 intstr1, 13 'intstr1_blank', 14 intstr1b, 15 intstr1c, 16 intstr2, 17 'intstr2_blank', 18 intstr2b, 19 intstr2c, 20 of1, 21 the2, 22 'species1_blank', 23 species1a, 24 species1b, 25 that1, 26 'that1_blank', 27 survive1, 28 'survive1_blank', 29 survive1a, 30 survive1b, 31 of1, 32 'it_is2', 33 not2, 34 'not2_blank', 35 the3, 36 species2, 37 species2a, 38 species2b, 39 that2, 40 'that2_blank', 41 that2a, 42 survives2a, 43 'survives2_blank', 44 but1, 45 'but1_blank', 46 but1a, 47 rather, 48 'rather_blank', 49 rathera, 50 'it_is3_blank', 51 'it_is3a', 52 'it_is3b', 53 'it_is3c', 54 the4, 55 'the4_blank', 56 species3, 57 'species3_blank', 58 species3a, 59 that3, 60 'that3_blank', 61 survives3, 62 is1, 63 'is1_blank', 64 the5, 65 'the5_blank', 66 one, 67 that4, 68 'that4_blank', 69 is2, 70 'able_best', 71 'able_best_blank', 72 'able_best_a', 73 'able_best_b', 74 to1, 75 adapt1, 76 'adapt1_blank', 77 adapt1a, 78 adapt1b, 79 to2, 80 'to2_blank', 81 and1, 82 'and1_blank', 83 to3, 84 adjust1, 85 'adjust1_blank', 86 adjust1a, 87 adjust1b, 88 best1, 89 'best1_blank', 90 to4, 91 the6, 92 'the6_blank', 93 changing, 94 'changing_blank', 95 'long_end', 96 'long_end_blank', 97 'long_enda', 98 'long_endb', 99 'long_endc', 100 'long_endd', 101 comma1, 102 comma2, 103 comma3, 104 comma4, 105 'intel_before_strong', 106 'General_Field', 107 paraphrase, 108 'Darwin_attribution', 109 'Darwin_subsource_simp', 110 'if_evo_theory', 111 'if_Darwins_theory', 112 'if_Origin', 113 'cited_source' ;
charnames_to_nexus_text <- function(inarray, quotemark="")
{
tmpstr = c()
for (i in 1 : (length(inarray)-1) )
{
tmptmp = paste(i, " ", quotemark, inarray[i], quotemark, ", ", sep="")
tmpstr = paste(tmpstr, tmptmp, sep="")
}
# add the last one (numbered i+1), without the trailing comma
tmpstr = paste(tmpstr, i+1, " ", quotemark, inarray[i+1], quotemark, " ", sep="")
return(tmpstr)
}
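# Quick sanity check of charnames_to_nexus_text() (a minimal sketch with
# made-up character names):
# cat(charnames_to_nexus_text(c("eye_color", "leg_count"), quotemark="'"))
# # --> 1 'eye_color', 2 'leg_count'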
chars_to_nexus_fmt <- function(charmatrix, namewidth=12, require_question_marks=FALSE, ambiguity_check=TRUE)
{
require(gdata) # for "trim" function
outstr = ""
width = ncol(charmatrix)
# Debugging
#print(namewidth)
for (i in 1:nrow(charmatrix))
{
# This line caused a bug
#tmpstr1 = as.character(charmatrix$rownames[i])
tmpstr1 = as.character(row.names(charmatrix)[i])
len = nchar(tmpstr1, type="chars")
numspaces_to_add = namewidth - len
# Debugging
#print(i)
#print(tmpstr1)
#print(len)
#print(numspaces_to_add)
spaces_list = rep(" ", numspaces_to_add)
tmpstr2 = array_to_text(c(tmpstr1, spaces_list), spacer="")
chars = charmatrix[i,2:width]
tmpstr3 = array_to_text(inarray=chars, spacer="")
if (ambiguity_check)
{
# check for "&" chars: convert 1&2 to (1 2)
if (grepl("&", tmpstr3))
{
for (j in 1:(width-1))
{
if (grepl("&", chars[j]))
{
if (require_question_marks == FALSE)
{
# Code the character with the possible states
char = strsplit(as.character(chars[j]), "&")
newchars_str = array_to_text(char[[1]], spacer=" ")
newchar = paste("(", newchars_str, ")", sep="")
chars[j] = newchar
cat("Ambiguous character detected & converted...", charmatrix[i,1], ", char #", j, ": ", newchar, "\n")
}
else
{
# Code the character as "?"
chars[j] = "?"
}
}
}
tmpstr3 = array_to_text(inarray=chars, spacer="")
}
} # End ambiguity check
tmpline = paste(" ", tmpstr2, tmpstr3, "\n", sep="")
outstr = paste(outstr, tmpline, sep="")
}
outstr = trim(outstr)
#outstr = paste(" ", outstr, sep="")
return(outstr)
}
charmatrix_to_nexus_file <- function(charmatrix, nexus_outfn, quotemark="", namewidth=11, require_question_marks=FALSE, ambiguity_check=TRUE)
{
# initialize array of strings
# l = lines
l = c()
l[1] = "#NEXUS"
l[2] = "[ written by Nick Matzke R script 'quotescr_v11.R', written 9/1/2010 ]"
l[3] = ""
l[4] = "BEGIN TAXA;"
l[5] = " TITLE Taxa;"
l[6] = paste(" DIMENSIONS NTAX=", nrow(charmatrix), ";", sep="")
l[7] = "TAXLABELS"
tmpstr = array_to_text(charmatrix$rownames, spacer=" ")
l[8] = paste(" ", tmpstr, sep="")
l[9] = " ;"
l[10] = ""
l[11] = "END;"
l[12] = ""
l[13] = ""
l[14] = "BEGIN CHARACTERS;"
l[15] = " TITLE Character_Matrix;"
l[16] = paste(" DIMENSIONS NCHAR=", (ncol(charmatrix)-1), ";", sep="")
tmpstr = " FORMAT DATATYPE = STANDARD GAP = - MISSING = ?;"
# maybe we can skip this?
#SYMBOLS = 0 1 2 3 4 5 6 7 8 9 A B C D";"
l[17] = tmpstr
l[18] = " CHARSTATELABELS "
tmpstr = charnames_to_nexus_text(inarray = names(charmatrix)[2:ncol(charmatrix)] , quotemark="")
l[19] = paste(" ", tmpstr, ";", sep="")
l[20] = " MATRIX"
# assemble all the characters into one big string
l[21] = chars_to_nexus_fmt(charmatrix, namewidth, require_question_marks, ambiguity_check)
l[22] = ";"
l[23] = ""
l[24] = "END;"
write.table(l, nexus_outfn, append=FALSE, quote=FALSE, row.names=FALSE, col.names=FALSE)
headf(nexus_outfn)
tailf(nexus_outfn)
cat("Character matrix output to NEXUS: '", nexus_outfn, "'\n")
}
nexus_data_to_chardf <- function(nexusdata)
{
# Convert character data read in by APE's read.data.nexus
# into a simple dataframe table (rows: taxa, columns: characters)
#
# This table can then be written out to NEXUS or TNT
#
chardf1 = as.data.frame(nexusdata)
chardf2 = as.data.frame(t(chardf1))
# Get the number of characters (columns)
numchars = dim(chardf2)[2]
# make the row names a column in the table
tmp_rownames = rownames(chardf2)
# blank them out
rownames(chardf2) = NULL
# add a column with the rownames
chardf3 = cbind(tmp_rownames, chardf2)
# make the column headers
col_headers1 = paste("char", 1:numchars, sep="")
col_headers2 = c("rownames", col_headers1)
# put the column headers into the chardf
names(chardf3) = col_headers2
#
cat("\n\nPreview of character dataframe (first 8 rows & 15 columns:\n\n")
print(chardf3[1:8, 1:15])
cat("\nTotal chardf dimensions: ", dim(chardf3)[1], " rows & ", dim(chardf3)[2], " columns.\n\n", sep="")
return(chardf3)
}
charnames_to_tnt_text <- function(inarray, quotemark="")
{
tmpstr = c()
for (i in 1:length(inarray) )
{
tmptmp = paste("{", i-1, " ", quotemark, inarray[i], quotemark, ";\n", sep="")
tmpstr = paste(tmpstr, tmptmp, sep="")
}
return(tmpstr)
}
chars_to_tnt_fmt <- function(charmatrix, namewidth=12)
{
require(gdata) # for "trim" function
outstr = ""
width = ncol(charmatrix)
for (i in 1:nrow(charmatrix))
{
tmpstr1 = as.character(charmatrix$rownames[i])
#len = nchar(tmpstr1, type="chars")
#numspaces_to_add = namewidth - len
#spaces_list = rep(" ", numspaces_to_add)
tmpstr2 = array_to_text(c(tmpstr1, " "), spacer="")
chars = charmatrix[i,2:width]
tmpstr3 = array_to_text(inarray=chars, spacer="")
# check for "&" chars: convert 1&2 to (1 2)
if (grepl("&", tmpstr3))
{
for (j in 1:(width-1))
{
if (grepl("&", chars[j]))
{
char = strsplit(as.character(chars[j]), "&")
newchars_str = array_to_text(char[[1]], spacer=" ")
newchar = paste("[", newchars_str, "]", sep="")
chars[j] = newchar
cat("Ambiguous character detected & converted...", charmatrix[i,1], ", char #", j, ": ", newchar, "\n")
}
}
tmpstr3 = array_to_text(inarray=chars, spacer="")
}
tmpline = paste("", tmpstr2, tmpstr3, "\n", sep="")
outstr = paste(outstr, tmpline, sep="")
}
outstr = trim(outstr)
#outstr = paste("", outstr, sep="")
return(outstr)
}
charmatrix_to_TNT_file <- function(charmatrix, tnt_outfn, quotemark="")
{
# initialize array of strings
# l = lines
l = c()
l[1] = "xread"
l[2] = paste(ncol(charmatrix)-1, nrow(charmatrix))
tmpstr = chars_to_tnt_fmt(charmatrix)
l[3] = tmpstr
l[4] = ";"
l[5] = ""
l[6] = "cnames"
l[7] = charnames_to_tnt_text( names(charmatrix)[2:ncol(charmatrix)] )
l[8] = ";"
l[9] = ""
l[10] = ""
l[11] = "proc /;"
l[12] = "comments 0"
l[13] = ";"
l[14] = ""
l[15] = ""
write.table(l, tnt_outfn, append=FALSE, quote=FALSE, row.names=FALSE, col.names=FALSE)
headf(tnt_outfn)
tailf(tnt_outfn)
cat("Character matrix output to TNT: '", tnt_outfn, "'\n")
}
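# Minimal end-to-end sketch: read a standard-datatype NEXUS character matrix
# with ape's read.nexus.data(), flatten it to a dataframe, then write it back
# out in NEXUS and TNT formats. The filenames are hypothetical:
nexus_to_tnt_example = '
nexusdata = read.nexus.data("chars.nex")
chardf = nexus_data_to_chardf(nexusdata)
charmatrix_to_nexus_file(chardf, nexus_outfn="chars_rewritten.nex")
charmatrix_to_TNT_file(chardf, tnt_outfn="chars.tnt")
'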
scale_2values_axis <- function(observed_nums, observed_axis_inbreaks, desired_axis)
{
xlims_v1_min = min(observed_nums)
xlims_v1_max = max(observed_nums)
xlims_min_inbreaks = min(observed_axis_inbreaks)
xlims_max_inbreaks = max(observed_axis_inbreaks)
xlims_min_desired = min(desired_axis)
xlims_max_desired = max(desired_axis)
x = c(xlims_v1_min, xlims_v1_max)
y = c(xlims_min_inbreaks, xlims_max_inbreaks)
reg_result = lm(y~x)
slope = reg_result$coefficients[2]
intercept = reg_result$coefficients[1]
xlims_touse_min = slope * xlims_min_desired + intercept
xlims_touse_max = slope * xlims_max_desired + intercept
xlims_touse = c(xlims_touse_min, xlims_touse_max)
return(xlims_touse)
}
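# Worked sketch for scale_2values_axis(): data spanning 0..10 that sit at
# 2..12 in break units imply the linear map y = x + 2, so a desired axis
# of 0..5 comes back as c(2, 7):
scale_axis_example = '
xlims_touse = scale_2values_axis(observed_nums=c(0, 10),
observed_axis_inbreaks=c(2, 12), desired_axis=c(0, 5))
# xlims_touse is c(2, 7)
'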
make_multihist <- function(l, desired_xlims, colors, numbreaks, tmptitle, tmp_xlabel=NULL, tmp_ylabel=NULL)
{
for (i in 1:length(l))
{
if (i==1)
{
hist(l[[i]], breaks=numbreaks, col=colors[i], border="white", xlim=desired_xlims, main=tmptitle, add=FALSE, xlab=tmp_xlabel, ylab=tmp_ylabel)
} else {
hist(l[[i]], breaks=numbreaks, col=colors[i], border="white", xlim=desired_xlims, add=TRUE, ylab=tmp_ylabel)
}
#col=colors, border=c("white"), xlim=xlims_touse, plot.it=FALSE)
#plot(tmphist, col=colors[i], border="white", xlim=desired_axis)
#tmphist = hist(l[[i]], breaks=numbreaks, plot=FALSE)
#plot(tmphist, col=colors[i], border="white", add=TRUE)
}
#border=c("white"),
#offsets = min(tmphist$breaks)
#barwidth = tmphist$breaks[1]-tmphist$breaks[0]
#xrange = c(0, max(tmphist$breaks))
#barplot(tmphist$out, offset=offsets), width=barwidth, xlim=c(min(xrange), max(xrange)), ylim=c(0, 1.2*max(tmphist$out)))
#, beside=TRUE, col=colors, xlim=desired_axis, ylim=c(0, 1.2*max(tmphist$out)), bty="o", xaxt="n")
#axis(1, at=pretty(), label=pretty(desired_axis), tick=TRUE, tcl=ticklength, pos=0)
}
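# Minimal usage sketch for make_multihist(): overlay two made-up samples
# (semi-transparent colors make the overlap visible):
multihist_example = '
l = list(rnorm(1000, mean=0), rnorm(1000, mean=2))
make_multihist(l, desired_xlims=c(-4, 6), colors=c("#1B9E7788", "#D95F0288"),
numbreaks=40, tmptitle="Two overlapping histograms",
tmp_xlabel="value", tmp_ylabel="count")
'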
# Limit each numeric column in a dataframe to e.g. 4 significant digits
signif_digits_df <- function(dtf, numsig=4, printout=FALSE)
{
dtf_classes = cls.df(dtf, printout)
for (i in 1:length(dtf_classes$cls_col_list))
{
cls_col = dtf_classes$cls_col_list[i]
if (cls_col == "numeric")
{
# Convert numeric column to one with signif digits
colname = dtf_classes$dtf_names[i]
cmdstr = paste("dtf$", colname, " = signif(dtf$", colname, ", digits=", numsig, ")", sep="")
eval(parse(text = cmdstr))
}
}
return(dtf)
}
setup = '
char=";"
'
# remove ending character from each line of a file
remove_ending_character_fn <- function(fn, outfn, char=";")
{
# Read file to tmplines
tmplines = scan(fn, what="character", sep="\n")
# Remove the last character
for (i in 1:length(tmplines))
{
tmplines[i] = remove_ending_character(tmplines[i], char)
}
write_table_good_no_header(tmplines, outfn)
headf(outfn)
return(outfn)
}
# remove ending character, if it equals "char"
remove_ending_character <- function(tmpline, char=";")
{
tmpchars = strsplit(tmpline, split="")[[1]]
indices_that_match = match_list1_in_list2_return_all_indices(char, tmpchars)
if (sum(indices_that_match) > 0)
{
last_match = indices_that_match[length(indices_that_match)]
if (tmpchars[last_match] == tmpchars[length(tmpchars)])
{
newline = substr(tmpline, start=1, stop=nchar(tmpline)-1)
} else {
newline = tmpline
}
} else {
newline = tmpline
}
return(newline)
}
stripquotes_spaces <- function(tmpstr)
{
require(gdata)
# Remove quotes
tmpstr = gsub(pattern='\\"', replacement="", x=tmpstr)
# Remove quotes
tmpstr = gsub(pattern="'", replacement="", x=tmpstr)
# Remove \
tmpstr = gsub(pattern='\\\\', replacement="", x=tmpstr)
# trim whitespace
tmpstr = trim(tmpstr)
return(tmpstr)
}
#stripquotes_spaces_df <- function(tmpdf)
# {
# tmpdf = apply(tmpdf, 2, stripquotes_spaces)
# return(tmpdf)
# }
read_table_good <- function(fn, tmpskip=0)
{
# Read in the data, store in variable d
# This has all of Nick's preferred options
dtf = read.table(fn, header=TRUE, skip=tmpskip, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
return(dtf)
}
read_table_good_w_rownames <- function(fn, tmpskip=0)
{
# Read in the data, store in variable d
# This has all of Nick's preferred options
dtf = read.table(fn, header=TRUE, skip=tmpskip, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE, row.names=1)
return(dtf)
}
read_table_good_no_header <- function(fn, tmpskip=0)
{
# Read in the data, store in variable d
# This has all of Nick's preferred options
dtf = read.table(fn, header=FALSE, skip=tmpskip, sep=" ", quote="", stringsAsFactors = FALSE, strip.white=TRUE, fill=TRUE)
return(dtf)
}
write_table_good <- function(dtf, outfn, ...)
{
write.table(x=dtf, file=outfn, append=FALSE, quote=FALSE, sep=" ", row.names=FALSE, col.names=TRUE, ...)
}
write_table_good_no_header <- function(dtf, outfn, sepval="\n")
{
write.table(dtf, file=outfn, append=FALSE, quote=FALSE, sep=sepval, row.names=FALSE, col.names=FALSE)
}
write_table_good_with_rownames <- function(dtf, outfn)
{
write.table(dtf, file=outfn, append=FALSE, quote=FALSE, sep=" ", row.names=TRUE, col.names=TRUE)
}
write_table_good_no_colnames_or_rownames <- function(dtf, outfn)
{
write.table(dtf, file=outfn, append=FALSE, quote=FALSE, sep=" ", row.names=FALSE, col.names=FALSE)
}
write_line_good <- function(dtf, outfn)
{
write.table(dtf, file=outfn, append=TRUE, quote=FALSE, sep=" ", row.names=FALSE, col.names=TRUE)
}
write_lines_good <- function(dtf, outfn, sepval="\n", tmpappend=FALSE)
{
write.table(dtf, file=outfn, append=tmpappend, quote=FALSE, sep=sepval, row.names=FALSE, col.names=FALSE)
}
write_table_good_w_rownames <- function(dtf, outfn, ...)
{
write.table(dtf, file=outfn, append=FALSE, quote=FALSE, sep=" ", row.names=TRUE, col.names=TRUE, ...)
}
write_table_good_w_rownames_yesappend <- function(dtf, outfn, ...)
{
write.table(dtf, file=outfn, append=TRUE, quote=FALSE, sep=" ", row.names=TRUE, col.names=TRUE, ...)
}
write_table_good_w_rownames_no_colnames <- function(dtf, outfn, ...)
{
write.table(dtf, file=outfn, append=FALSE, quote=FALSE, sep=" ", row.names=TRUE, col.names=FALSE)
}
# Confusion matrix
make_confusion_matrix <- function(truth, predicted)
{
#########################################
# Make a confusion matrix
#########################################
#
# Following:
# NASA Remote Sensing Tutorial
# http://rst.gsfc.nasa.gov/Sect13/Sect13_3.html
# http://rst.gsfc.nasa.gov/Sect13/originals/Fig13_7.jpg
#
# Rows are "truth" (from ground/high-res image interpretation)
# Cols are "predicted" (from Landsat)
#
total_num_cells = length(predicted)
unique_classes = sort(unique(c(truth, predicted)))
confusion_matrix = matrix(data=NA, nrow=length(unique_classes)+3, ncol=length(unique_classes)+4)
numclasses = length(unique_classes)
for (i in 1:numclasses)
{
for (j in 1:numclasses)
{
truth_val = unique_classes[i]
pred_val = unique_classes[j]
truth_matching_TF = truth == truth_val
pred_matching_TF = predicted == pred_val
cell_val = sum(truth_matching_TF + pred_matching_TF == 2)
confusion_matrix[i,j] = cell_val
}
}
confusion_matrix
# Do the totals -- totals of columns
for (i in 1:numclasses)
{
confusion_matrix[numclasses+1,i] = sum(confusion_matrix[,i], na.rm=TRUE)
}
confusion_matrix
# Do the totals -- totals of rows
for (i in 1:(numclasses+1))
{
confusion_matrix[i, numclasses+1] = sum(confusion_matrix[i,], na.rm=TRUE)
}
confusion_matrix
# Do the commission and omission errors
# omissions = errors of omission = pixels missed that should have been found (false negatives)
# 1 - omission error = Producer's Accuracy
#
# commissions = errors of commission = pixels put in the category that should NOT have been (false positives)
# 1 - commission error = User's Accuracy
#
# omission errors -- across rows
# 1 - omission error = Producer's Accuracy
omissions_ttl = 0
for (i in 1:numclasses)
{
total = confusion_matrix[i, numclasses+1]
number_of_hits = confusion_matrix[i,i]
number_of_misses = total - number_of_hits
omissions_ttl = omissions_ttl + number_of_misses
confusion_matrix[i, numclasses+2] = number_of_misses / total * 100
confusion_matrix[i, numclasses+3] = (1 - number_of_misses / total) * 100
}
confusion_matrix[numclasses+1, numclasses+2] = omissions_ttl/total_num_cells * 100
confusion_matrix[numclasses+1, numclasses+3] = 100 - omissions_ttl/total_num_cells * 100
# Commission errors -- down columns
# 1-commission error = User's Accuracy
comissions_ttl = 0
for (i in 1:numclasses)
{
total = confusion_matrix[numclasses+1, i]
number_of_hits = confusion_matrix[i,i]
number_of_misses = total - number_of_hits
comissions_ttl = comissions_ttl + number_of_misses
confusion_matrix[numclasses+2, i] = number_of_misses / total * 100
confusion_matrix[numclasses+3, i] = (1 - number_of_misses / total) * 100
}
confusion_matrix[numclasses+2, numclasses+1] = comissions_ttl/total_num_cells * 100
confusion_matrix[numclasses+3, numclasses+1] = 100 - comissions_ttl/total_num_cells * 100
# Mapping accuracy -- across rows
for (i in 1:numclasses)
{
total = confusion_matrix[i, numclasses+1]
number_of_hits = confusion_matrix[i,i]
confusion_matrix[i, numclasses+4] = number_of_hits / total * 100
}
confusion_matrix
# Overall accuracy (total number correct / total)
correct_ttl = 0
for (i in 1:numclasses)
{
correct_ttl = correct_ttl + confusion_matrix[i,i]
}
total = confusion_matrix[numclasses+1, numclasses+1]
mapping_accuracy = correct_ttl / total * 100
confusion_matrix[nrow(confusion_matrix), ncol(confusion_matrix)] = mapping_accuracy
confusion_matrix = adf(confusion_matrix)
names1 = paste("pred_", as.character(unique_classes), sep="")
names2 = paste("true_", as.character(unique_classes), sep="")
names(confusion_matrix) = c(names1, "total", "omission", "producers_accuracy", "mapping_accuracy")
row.names(confusion_matrix) = c(names2, "total", "commission", "users_accuracy")
confusion_matrix
return(confusion_matrix)
}
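# Minimal usage sketch for make_confusion_matrix(), with made-up classes
# (assumes the rest of this file is sourced, since adf() is used internally):
confusion_matrix_example = '
truth     = c("forest", "forest", "water", "urban", "water", "forest")
predicted = c("forest", "water",  "water", "urban", "water", "forest")
cm = make_confusion_matrix(truth, predicted)
print(cm)
'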
# Get best guess for each row
# (e.g., prediction with the highest probability)
get_highest_val <- function(preds)
{
tmpnames = names(adf(preds))
maxvals = apply(X=preds, MARGIN=1, FUN=max)
colnums = 1:length(tmpnames)
colnames_matrix = maxvals
for (i in 1:nrow(preds))
{
TF = preds[i, ] == maxvals[i]
# (take the first match, in case of ties)
colnames_matrix[i] = tmpnames[which(TF)[1]]
}
return(colnames_matrix)
}
calc_kappa_from_confusion_matrix <- function(confusion_matrix, numclasses)
{
#
# Kappa is basically:
# ( observed agreement - chance agreement ) /
# ( 1 - chance agreement )
# Following:
# http://kogs-www.informatik.uni-hamburg.de/PROJECTS/censis/satsim/node10.html
#
# Better: Congalton 1991
#
# Congalton, R. G. 1991. A review of assessing the accuracy
# of classifications of remotely sensed data. Remote Sensing
# of Environment 37:35-46.
# http://dx.doi.org/10.1016/0034-4257(91)90048-B
#
# Interpretation:
# http://en.wikipedia.org/wiki/Cohen%27s_kappa
#
# Nonetheless, magnitude guidelines have appeared in the literature. Perhaps the
# first was Landis and Koch,[11] who characterized values < 0 as indicating no
# agreement and 0–.20 as slight, .21–.40 as fair, .41–.60 as moderate, .61–.80
# as substantial, and .81–1 as almost perfect agreement. This set of guidelines
# is however by no means universally accepted; Landis and Koch supplied no evidence
# to support it, basing it instead on personal opinion. It has been noted that these
# guidelines may be more harmful than helpful.[2] Fleiss's[12]:218 equally arbitrary
# guidelines characterize kappas over .75 as excellent, .40 to .75 as fair to good,
# and below .40 as poor.
#
# A very clear presentation:
# http://www.geocomputation.org/1999/044/gc_044.htm
#
# For variance of Kappa, see especially Congalton 1999, p. 106
# http://www.amazon.com/Assessing-Accuracy-Remotely-Sensed-Data/dp/1420055127#reader_1420055127
#
# Get the total # of pixels
N = confusion_matrix[numclasses+1,numclasses+1]
# Get p0
p0 = 0
for (i in 1:numclasses)
{
p0 = p0 + confusion_matrix[i,i]
}
# Get pz
pz = 0
for (i in 1:numclasses)
{
#sum1
sum1 = 0
sum2 = 0
for (j in 1:numclasses)
{
sum1 = sum1 + confusion_matrix[i,j]
sum2 = sum2 + confusion_matrix[j,i]
}
pz = pz + (sum1 * sum2)
}
#pz = pz * (1/ (N^2))
kappaval = ((N * p0) - pz) / ((N^2) - pz)
return(kappaval)
}
calc_kappavar_from_confusion_matrix <- function(confusion_matrix, numclasses)
{
# Following:
# http://kogs-www.informatik.uni-hamburg.de/PROJECTS/censis/satsim/node10.html
# Better: Congalton 1991
#
# Congalton, R. G. 1991. A review of assessing the accuracy
# of classifications of remotely sensed data. Remote Sensing
# of Environment 37:35-46.
# http://dx.doi.org/10.1016/0034-4257(91)90048-B
#
# Interpretation guidelines (Landis & Koch, Fleiss): see the comments in
# calc_kappa_from_confusion_matrix() above.
#
# For variance of Kappa, see especially Congalton 1999, p. 106
# http://www.amazon.com/Assessing-Accuracy-Remotely-Sensed-Data/dp/1420055127#reader_1420055127
#
# Get the total # of pixels
n = confusion_matrix[numclasses+1,numclasses+1]
# Get theta1
theta1 = 0
for (i in 1:numclasses)
{
theta1 = theta1 + confusion_matrix[i,i]
}
theta1 = 1/n * theta1
# Get theta2
theta2 = 0
for (i in 1:numclasses)
{
#sum1
sum1 = 0
sum2 = 0
for (j in 1:numclasses)
{
sum1 = sum1 + confusion_matrix[i,j]
sum2 = sum2 + confusion_matrix[j,i]
}
theta2 = theta2 + (sum1 * sum2)
}
theta2 = 1/(n^2) * theta2
# Get theta3
theta3 = 0
for (i in 1:numclasses)
{
nii = confusion_matrix[i,i]
#sum1
sum1 = 0
sum2 = 0
for (j in 1:numclasses)
{
sum1 = sum1 + confusion_matrix[i,j]
sum2 = sum2 + confusion_matrix[j,i]
}
theta3 = theta3 + nii * (sum1 + sum2)
}
theta3 = 1/(n^2) * theta3
# Get theta4
# theta4 = (1/n^3) * sum_i sum_j [ n_ij * (row sum of j + column sum of i)^2 ]
theta4 = 0
isum = 0
for (i in 1:numclasses)
{
jsum = 0
for (j in 1:numclasses)
{
# reset the row/column sums for each (i,j) pair
sum1 = 0
sum2 = 0
for (ii in 1:numclasses)
{
sum1 = sum1 + confusion_matrix[j,ii]
sum2 = sum2 + confusion_matrix[ii,i]
}
nij = confusion_matrix[i,j]
jsum = jsum + (nij * (sum1 + sum2)^2)
}
isum = isum + jsum
}
theta4 = 1/(n^3) * isum
# Super-formula in multiple parts
part1 = theta1*(1-theta1) / ((1-theta2)^2)
part2num = 2 * (1-theta1) * ((2*theta1*theta2) - theta3)
part2den = (1-theta2)^3
part3num = ((1-theta1)^2) * (theta4 - 4*(theta2^2))
part3den = (1-theta2)^4
varK = (1/n) * (part1 + (part2num / part2den) + (part3num / part3den))
return(varK)
}
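# Minimal sketch, continuing the confusion_matrix_example above: Cohen's
# kappa, its large-sample variance, and an approximate z-score for
# H0: kappa = 0 (the calc functions read the raw counts and totals in
# rows/columns 1:(numclasses+1) of the make_confusion_matrix() output):
kappa_example = '
numclasses = length(unique(c(truth, predicted)))
kappaval = calc_kappa_from_confusion_matrix(cm, numclasses)
varK = calc_kappavar_from_confusion_matrix(cm, numclasses)
z = kappaval / sqrt(varK)
'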
is.not.NA <- function(x)
{
return(is.na(x) == FALSE)
}
#
# is.not.na <- function(x)
# {
# return(is.na(x) == FALSE)
# }
# remove NA rows from list
na.remove <- function(x)
{
outx = x[is.not.NA(x)]
return(outx)
}
# TRUE if all non-NA values in inlist are <= 0
allneg <- function(inlist)
{
x = (inlist <= 0)
sumneg = sum(x, na.rm=TRUE)
lenneg = length(na.remove(x))
if (lenneg == sumneg)
{
return(TRUE)
}
else
{
return(FALSE)
}
}
# TRUE if all non-NA values in inlist are >= 0
allpos <- function(inlist)
{
x = (inlist >= 0)
sumpos = sum(x, na.rm=TRUE)
lenpos = length(na.remove(x))
if (lenpos == sumpos)
{
return(TRUE)
}
else
{
return(FALSE)
}
}
remove_NA_rows <- function(dtf, colname)
{
cat("\n")
cat("remove_NA_rows: initial #rows = ", nrow(dtf), "\n")
cmdstr = paste("rows_to_keep = is.not.NA(dtf$", colname, ")", sep="")
#print(cmdstr)
eval(parse(text = cmdstr))
#print(dim(df))
#print(length(rows_to_keep))
#print(rows_to_keep)
outdf = dtf[rows_to_keep, ]
cat("remove_NA_rows: ending #rows = ", nrow(outdf), "\n")
return(outdf)
}
# check if all elements match
do_all_match <- function(arr1, arr2)
{
matches = (arr1 == arr2)
if (sum(matches) == length(matches))
{
return(TRUE)
} else {
return(FALSE)
}
}
# split strings on tabs, then strip any spaces within each field
strsplit_on_tabs_remove_whitespace <- function(tmpline)
{
# split on tabs
temp = strsplit(tmpline, "[\t]")
# get the list
list_of_strs = temp[[1]]
# remove any leading/trailing ""
list_of_strs = list_of_strs[list_of_strs != ""]
for (i in 1:length(list_of_strs))
{
tmpstr = list_of_strs[i]
list_of_strs[i] = gsub(" ", "", tmpstr)
}
return(list_of_strs)
}
# catch "integer(0)" e.g. from non-matching grep
is.numzero <- function(x)
{
if (length(x) == 0)
{
return(TRUE)
}
else
{
return(FALSE)
}
}
is.inf <- function(x)
{
# NOTE: this catches +Inf only; use is.infinite() to also catch -Inf
return(x == Inf)
}
is.not.inf <- function(x)
{
return(is.inf(x) == FALSE)
}
# return_items_not_NA <- function(x)
# {
# y = x[is.not.NA(x)]
# return(y)
# }
plot_point <- function(x, y, pt_title, color)
{
points(x, y, type="p", pch=19, cex=2, col=color)
text(x, y, pt_title, col=color, pos=4, bg="white")
}
# print out the last 6 rows of a data frame
foot <- function(dtf)
{
nrows = nrow(dtf)
print(dtf[((nrows-5) : nrows), ])
}
# print out the header, footer, and basic info on a dataframe (df)
sumdf <- function(dtf)
{
print(dim(dtf))
print(head(dtf))
foot(dtf)
# print the class of each column
for (col in 1:ncol(dtf))
{
cat("Col#", col, " class = ", class(dtf[1,col]), "\n", sep="")
}
}
make_cdf <- function(list_rel_probs)
{
if (sum(list_rel_probs) == 0)
{
print("make_cdf() ERROR: sum(list_rel_probs) is ZERO")
print(list_rel_probs)
return(NA)
}
if (class(list_rel_probs) != "data.frame")
{
dim(list_rel_probs) = c(length(list_rel_probs), 1)
list_rel_probs = data.frame(list_rel_probs)
}
cdf_rel_probs = rep(0, nrow(list_rel_probs))
dim(cdf_rel_probs) = c(length(cdf_rel_probs), 1)
cdf_rel_probs = data.frame(cdf_rel_probs)
cdf_rel_probs = cbind(cdf_rel_probs, cdf_rel_probs)
names(cdf_rel_probs) = c("bottom", "top")
for (i in 1:nrow(list_rel_probs))
{
if (i==1)
{
cdf_rel_probs$bottom[i] = 0
cdf_rel_probs$top[i] = list_rel_probs[i, 1]
}
else
{
cdf_rel_probs$bottom[i] = cdf_rel_probs$top[i-1]
cdf_rel_probs$top[i] = cdf_rel_probs$top[i-1] + list_rel_probs[i, 1]
}
}
cdf_rel_probs_new = cdf_rel_probs / sum(list_rel_probs)
return(cdf_rel_probs_new)
}
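# Minimal sketch: use make_cdf() for inverse-CDF sampling from a set of
# relative probabilities (the weights need not sum to 1):
make_cdf_example = '
rel_probs = c(1, 3, 6)
cdf = make_cdf(rel_probs)
r = runif(1)
sampled_index = which((r >= cdf$bottom) & (r < cdf$top))
'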
# empirical p-values (for empirical PDF, cdf, percentiles, quantiles)
empirical_pval_fast <- function(x, observed_val)
{
num_lt_observed = sum(x <= observed_val)
pval = num_lt_observed / length(x)
return(pval)
}
# empirical p-values (for empirical PDF, cdf, percentiles, quantiles)
empirical_pval_slow <- function(x, observed_val)
{
freqtab = f.freq(x)
pval = freqtab$cumul_fraction[freqtab$x == observed_val]
return(pval)
}
f.freq <- function(x)
{
freqtab <- data.frame(table(x))
freqtab$fraction <- freqtab$Freq / length(x)
freqtab$cumul_fraction[1] <- freqtab$fraction[1]
if (nrow(freqtab) >= 2)
{
for(i in 2:nrow(freqtab))
{
freqtab$cumul_fraction[i] = freqtab$cumul_fraction[i-1] + freqtab$fraction[i]
}
}
return(freqtab)
}
# Convert unique items to characters
# NOTE: relies on a vector "mesquite_character_states" existing in the
# calling environment (print_chars_catlist(), below, defines an example)
unique_items_to_chars <- function(tempy, y, namestr)
{
col_w_name = match(namestr, names(tempy), nomatch=NaN)
tmpname = names(y)[col_w_name]
cmdstr = paste("charstates = unique(tempy$", tmpname, ")", sep="")
eval(parse(text = cmdstr))
charstates = charstates[charstates != "?"]
charindexes = seq(1, length(charstates))
cat("\n")
cat(namestr, " - ", "# of states: ", length(charstates), "\n", sep="")
cat("code state\n")
# Display the characters
for (i in 1:length(charstates))
{
#y$General_Field[tempy$General_Field == charstates[i]] = mesquite_character_states[charindexes[i]]
cat(mesquite_character_states[charindexes[i]], ": ", charstates[i], "\n", sep="")
cmdstr = paste("y$", tmpname, "[tempy$", tmpname, " == charstates[i]] = mesquite_character_states[charindexes[i]]", sep="")
eval(parse(text = cmdstr))
}
return(y)
}
# Retrieve chars
# Note: only works if there is a 1-to-1 mapping of codes to text
#
# inchars_outchars = retrieve_char_text(tempy, y, namestr)
# inchars = inchars_outchars[[1]]
# outchars = inchars_outchars[[2]]
retrieve_char_text <- function(tempy, y, namestr)
{
tmpname = namestr
# Get numerical characters
cmdstr = paste("charstates = unique(tempy$", tmpname, ")", sep="")
eval(parse(text = cmdstr))
# Get text states
cmdstr = paste("textstates = unique(y$", tmpname, ")", sep="")
eval(parse(text = cmdstr))
inchars = c()
outchars = c()
for (i in 1:length(charstates))
{
charstate = charstates[i]
charstate_index = match(charstate, charstates)
textstate = textstates[charstate_index]
inchars = c(inchars, charstate)
outchars = c(outchars, textstate)
}
inchars_outchars = list(inchars, outchars)
return(inchars_outchars)
}
# Count the number of matching characters
count_chars <- function(X, char=",")
{
# Count the number of occurrences of "char" in X,
# where X is a string or list of strings
# (count.fields returns the number of fields, i.e. matches + 1)
numchars_that_match = count.fields(textConnection(X), sep=char) - 1
return(numchars_that_match)
}
print_chars <- function(y, namestr)
{
col_w_name = match(namestr, names(y), nomatch=NaN)
tmpname = names(y)[col_w_name]
cmdstr = paste("charstates = unique(y$", tmpname, ")", sep="")
eval(parse(text = cmdstr))
cat("\n")
cat(namestr, " - ", "# of states: ", length(charstates), "\n", sep="")
cat("List of the character states:\n")
# Display the characters
for (i in 1:length(charstates))
{
cat(charstates[i], "\n", sep="")
}
}
print_char_key <- function(inchars, outchars)
{
cat("\n")
cat("Printing codes for characters:\n")
for (i in 1:length(inchars))
{
cat(outchars[i], ": ", inchars[i], "\n", sep="")
}
}
print_chars_catlist <- function(y, namestr)
{
mesquite_character_states = c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "K", "M", "N", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "l", "m", "n", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z")
col_w_name = match(namestr, names(y), nomatch=NaN)
tmpname = names(y)[col_w_name]
cmdstr = paste("charstates = unique(y$", tmpname, ")", sep="")
eval(parse(text = cmdstr))
cat("\n")
cat(namestr, " - ", "# of states: ", length(charstates), "\n", sep="")
cat("\n")
cat("List of the character states:\n")
# Display the characters -- text input
cat("inchars = c(")
for (i in 1:(length(charstates)-1))
{
cat('"', charstates[i], '", \n', sep="")
}
cat('"', charstates[length(charstates)], '")\n', sep="")
cat("\n")
# Display the characters -- suggested coded output
cat("outchars = c(")
charcount = 0
for (i in 1:(length(charstates)-1))
{
if (charstates[i] != "?")
{
charcount = charcount + 1
cat('"', mesquite_character_states[charcount], '", \n', sep="")
}
else
{
cat('"?"', ', \n', sep="")
}
}
if (charstates[length(charstates)] != "?")
{
charcount = charcount + 1
cat('"', mesquite_character_states[charcount], '")\n', sep="")
}
else
{
cat('"?"', ')\n', sep="")
}
cat("\n")
}
convert_items_to_items <- function(y, namestr, name1list, name2list)
{
for (i in 1:length(name1list))
{
name1 = name1list[i]
name2 = name2list[i]
y = convert_item_to_item(y, namestr, name1, name2)
}
return(y)
}
convert_item_to_item <- function(y, namestr, name1, name2)
{
col_w_name = match(namestr, names(y), nomatch=NaN)
tmpname = names(y)[col_w_name]
cmdstr = paste("y$", tmpname, "[y$", tmpname, ' == "', name1, '"] = "', name2, '"', sep="")
print(cmdstr)
eval(parse(text = cmdstr))
return(y)
}
convert_grepl_item_to_item <- function(y, namestr, name1, name2)
{
col_w_name = match(namestr, names(y), nomatch=NaN)
tmpname = names(y)[col_w_name]
cmdstr = paste("rows_to_change = grepl(name1, y$", tmpname, ")", sep="")
eval(parse(text = cmdstr))
cmdstr = paste("y$", tmpname, "[rows_to_change] = '", name2, "'", sep="")
print(cmdstr)
eval(parse(text = cmdstr))
return(y)
}
# Extract rows with indices matching rows_to_keep
gather_rows_with_indices_old <- function(tmptable, rows_to_keep)
{
# make empty matrix
newtable = matrix(data=NA, nrow = length(rows_to_keep), ncol=ncol(tmptable))
names(newtable) = names(tmptable)
for (i in 1:nrow(newtable))
{
# insert the new row
newtable[i, ] = tmptable[rows_to_keep[i], ]
}
return(newtable)
}
# Extract rows with indices matching rows_to_keep
# Fixing a bug that was caused by row numbers being characters instead of
# numbers
gather_rows_with_indices <- function(tmptable, rows_to_keep)
{
rows_to_keep = as.numeric(rows_to_keep)
# make empty matrix
newtable = matrix(data=NA, nrow = length(rows_to_keep), ncol=ncol(tmptable))
names(newtable) = names(tmptable)
for (i in 1:nrow(newtable))
{
# insert the new row
newtable[i, ] = tmptable[rows_to_keep[i], ]
}
return(newtable)
}
# remove rows from temp_table, based on col matching list_to_remove
remove_rows_with_list_to_remove <- function(temp_table, col, list_to_remove)
{
# get TRUE/FALSE for which rows in species col are undefined, according to list_to_remove
truefalse_rows_match = match_list1_in_list2(col, list_to_remove)
cat("remove_rows_with_list_to_remove(): initial table #rows = ", nrow(temp_table), "\n")
cat("Removing ", (sum(truefalse_rows_match)), " rows with c(", list2str(c(list_to_remove)), ") in them.\n", sep="")
temp_table = temp_table[truefalse_rows_match == FALSE, ]
cat("remove_rows_with_list_to_remove(): revised table #rows = ", nrow(temp_table), "\n")
return(temp_table)
}
# This should be a faster list2 string
list2str_fast <- function(list1, spacer="")
{
# convert to character
# split into list of lists of characters
# merge lists with unlist
# paste with collapse argument
tmpstr = paste(unlist(strsplit(as.character(list1), split="")), collapse=spacer)
return(tmpstr)
}
list2str_fast_nosplit <- function(list1, spacer="")
{
# convert to character
# split into list of lists of characters
# merge lists with unlist
# paste with collapse argument
tmpstr = paste(unlist(as.character(list1)), collapse=spacer)
return(tmpstr)
}
list2str_unlist <- function(list1, spacer=" ")
{
outlist = c("startlist")
for (i in 1:length(list1))
{
listitem = list1[i]
if (length(listitem) > 1)
{
outlist = append(outlist, list2str(unlist(listitem), spacer=" "))
}
else
{
addstr = as.character(list1[i])
#tmpstr = paste(tmpstr, addstr, sep=spacer)
outlist = append(outlist, addstr)
}
}
# drop the "startlist" placeholder before joining
outlist = outlist[-1]
tmpstr = list2str(outlist, spacer=" ")
return(tmpstr)
}
return_unlist <- function(list1, spacer=" ")
{
outlist = c("startlist")
for (i in 1:length(list1))
{
listitem = list1[i]
if (length(listitem) > 1)
{
# NOTE: append() has no "spacer" argument
outlist = append(outlist, unlist(listitem))
}
else
{
addstr = as.character(list1[i])
#tmpstr = paste(tmpstr, addstr, sep=spacer)
outlist = append(outlist, addstr)
}
}
# drop the "startlist" placeholder
outlist = outlist[-1]
return(outlist)
}
# unlist_dtf_cols <- function(dtf, printflag=FALSE)
# {
# # Sometimes cbind makes each column a list, this can screw up use/searching of
# # the column later on.
# # Unlist each column...
# for (i in 1:ncol(dtf))
# {
# tmpstr = paste("unlisting col: ", names(dtf)[i], "...", sep="")
# prflag(tmpstr, printflag=printflag)
#
# # catch a possible error from unlisting
# # the number of rows needs to stay the same!!
# tmpcol = unlist(dtf[, i])
# if (length(tmpcol) != length(dtf[, i]))
# {
# tmpstr = paste("...failed! unlist(col) length=", length(tmpcol), "; nrow(dtf) = ", nrow(dtf), sep="")
# prflag(tmpstr, printflag=printflag)
# }
# else
# {
# dtf[, i] = tmpcol
# tmpstr = paste(" ", " ", sep="")
# prflag(tmpstr, printflag=printflag)
# }
# }
#
# #dtf2 = adf(dtf)
#
# return(dtf)
# }
char_descriptions_to_NEXUS <- function(namestr_list, inchars_list, outchars_list)
{
# Create l, an empty list of lines...
l = list()
#l[(h=h+1)] = " CHARSTATELABELS"
tmpcharstr_list = c()
for (i in 1:length(namestr_list))
{
tmpname = namestr_list[i]
# filter out ? and ''
inchars = inchars_list[[i]]
outchars = outchars_list[[i]]
chars_to_keep1 = outchars != ""
chars_to_keep2 = outchars != "?"
# remove the character states which are "" or "?" in the coded section
chars_to_keep = ((chars_to_keep2 + chars_to_keep1) == 2)
inchars2 = inchars[chars_to_keep]
oldstr = "'"
newstr = ""
tmp_charstates_list_noapost = gsub(oldstr, newstr, inchars2)
oldstr = " "
newstr = "_"
tmp_charstates_list_nospace = gsub(oldstr, newstr, tmp_charstates_list_noapost)
#tmp_charstates_str = list2str(inchars_list[[i]], spacer='" "')
tmp_charstates_str = list2str(tmp_charstates_list_nospace, spacer="' '")
tmp_charstates_str2 = paste("'", tmp_charstates_str, "'", sep="")
tmpcharstr = paste(i, tmpname, "/ ", tmp_charstates_str2)
tmpcharstr_list = c(tmpcharstr_list, tmpcharstr)
}
bigstr_chardescs = list2str(tmpcharstr_list, spacer=", ")
charstatelabels = paste(" ", bigstr_chardescs, " ;", sep="")
return(charstatelabels)
}
# Add character state labels to a preexisting NEXUS file...
add_charstatelabels_to_NEXUS <- function(charstatelabels, infn, outfn)
{
lines = scan(file=infn, what="character", sep="\n")
# Go through lines
line_with_charstatelabels = NULL
for (i in 1:length(lines))
{
# Get line
line = lines[i]
# Check if line matches CHARSTATELABELS
if (grepl("CHARSTATELABELS", line))
{
line_with_charstatelabels = i
}
else
{
blah=TRUE
}
}
# Assuming you found line_with_charstatelabels...
lines[line_with_charstatelabels+1] = charstatelabels
write.table(lines, file=outfn, quote=FALSE, append=FALSE, sep="", row.names = FALSE, col.names=FALSE)
return(lines)
}
count_rows_NOT_containing_substr_in_col <- function(dtf, col, tmp_substr)
{
list_to_kill = grepl(tmp_substr, col, fixed=TRUE)
list_to_keep = (list_to_kill == FALSE)
print(sum(list_to_keep))
return(sum(list_to_keep))
}
count_rows_containing_substr_in_col <- function(dtf, col, tmp_substr)
{
list_to_keep = grepl(tmp_substr, col, fixed=TRUE)
print(sum(list_to_keep))
return(sum(list_to_keep))
}
remove_rows_containing_substr_in_col <- function(dtf, col, tmp_substr)
{
list_to_kill = grepl(tmp_substr, col, fixed=TRUE)
list_to_keep = (list_to_kill == FALSE)
return(dtf[list_to_keep, ])
}
retain_rows_containing_substr_in_col <- function(dtf, col, tmp_substr)
{
list_to_keep = grepl(tmp_substr, col, fixed=TRUE)
return(dtf[list_to_keep, ])
}
extract_rows_with_col_matching_findthis_str <- function(temp_table, col, findthis)
{
# Extract North-America-specific modern extinctions
matches = find_str_inlist(findthis, tmplist=col)
cat("# of rows matching '", findthis, "': ", sum(matches), "\n", sep="")
temp_table = temp_table[matches, ]
return(temp_table)
}
# 2014-01-30_NJM:
# moving to BioGeoBEARS readwrite
# for process_DIVA_output
# Find a string inside another string...
find_instring <- function(findthis, string)
{
result = grep(findthis, string)
if (length(result) > 0)
{
#print("match")
match = TRUE
}
else
{
match = FALSE
}
return(match)
}
# 2014-01-30_NJM:
# moving to BioGeoBEARS readwrite
# for process_DIVA_output
find_str_inlist <- function(findthis, tmplist)
{
# make empty list
outlist = rep(NA, length(tmplist))
for (i in 1:length(tmplist))
{
string = tmplist[i]
outlist[i] = find_instring(findthis, string)
}
return(outlist)
}
match_list1_in_list2_return_all_indices <- function(list1, list2)
{
# returns indices of list2 that match something in list1
matchlist = list2 %in% list1
matching_indices = seq(from=1, to=length(list2), by=1)[matchlist]
return(matching_indices)
}
#
#
# # return matching TRUE/FALSE values
# # list1 (.e.g. a big list) TRUE if it is found in list2 (e.g. a smaller list)
# match_list1_in_list2 <- function(list1, list2)
# {
# matchlist = list1 %in% list2
# return(matchlist)
# }
# return matches of "item" in list2:
# either the matching indices (default), or a TRUE/FALSE vector over list2
match_item_in_list2 <- function(item, list2, return_indices=TRUE)
{
matchlist_TF = list2 %in% item
if (return_indices == TRUE)
{
matching_indices = seq(from=1, to=length(list2), by=1)[matchlist_TF]
return(matching_indices)
}
if (return_indices == FALSE)
{
return(matchlist_TF)
}
}
compare_2cols_against_another_2cols_find_matching_pairs_in_list2 <- function(list1, list2)
{
# paste the pair columns together in both orders, so that pairs match
# regardless of column order
list1_paste = paste(list1[,1], ",", list1[,2], sep="")
list2_pasteA = paste(list2[,1], ",", list2[,2], sep="")
list2_pasteB = paste(list2[,2], ",", list2[,1], sep="")
list2_matches1 = match_list1_in_list2(list2_pasteA, list1_paste)
list2_matches2 = match_list1_in_list2(list2_pasteB, list1_paste)
list2_matches = list2_matches1 + list2_matches2 > 0
return(list2_matches)
}
# NOTE!!! THESE MATCH FUNCTIONS JUST RETURN THE *FIRST* MATCH, *NOT* ALL MATCHES
# (argh)
# return indices in 2nd list matching the first list
# It WILL return one match for each item in the list, though...
# get_indices_where_list1_occurs_in_list2 <- function(list1, list2)
# {
# match_indices = match(list1, list2)
# return(match_indices)
# }
get_indices_where_string_in_list1_occurs_in_list2 <- function(list1, list2)
{
match_indices = unlist(lapply(list1, get_index_in_list2_matching_string, list2))
# Replace NAs (no match) with ""; note this converts the indices to character
match_indices[is.na(match_indices) == TRUE] = ""
return(match_indices)
}
# returns the index of the FIRST hit
get_index_in_list2_matching_string <- function(tmpstr, list2)
{
matches_TF = lapply(tmpstr, grepl, list2)[[1]]
first_hit_index = match(TRUE, matches_TF)
return(first_hit_index)
}
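# A small sketch illustrating the first-match-only behavior warned about above:
example='
list2 = c("apple pie", "apple tart", "plum pie")
get_index_in_list2_matching_string("apple", list2)  # 1 (first hit only)
get_indices_where_string_in_list1_occurs_in_list2(c("apple", "plum", "fig"), list2)
# "1" "3" "" (no-matches become empty strings)
'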
# # return indices in 2nd list matching the first list
# get_indices_where_list1_occurs_in_list2_noNA <- function(list1, list2)
# {
# match_indices = match(list1, list2)
# match_indices = return_items_not_NA(match_indices)
# return(match_indices)
# }
# NOTE: misleadingly named -- returns the indices of list1 that matched
# (i.e., the non-NA positions), not the match positions in list2
get_real_indices_where_list1_occurs_in_list2 <- function(list1, list2)
{
print("get_real_indices_where_list1_occurs_in_list2()")
print("WARNING: THIS FUNCTION RETURNS THE INDICES OF NOT NA, NOT THE INDICES OF THE MATCHES")
indices = 1:length(list1)
matches_or_NAs = get_indices_where_list1_occurs_in_list2(list1, list2)
not_NA = is.not.NA(matches_or_NAs)
indices_to_return = indices[not_NA]
return(indices_to_return)
}
# place items with string matching list2 in new column of size list2
#place_items_matching_str_in_list1_in_newcol <- function()
# colmax, max.col
colmax <- function(dtf)
{
maxvals = apply(dtf, 2, max, na.rm=TRUE)
return(maxvals)
}
colmin <- function(dtf)
{
minvals = apply(dtf, 2, min, na.rm=TRUE)
return(minvals)
}
normalize_by_colmax <- function(dtf)
{
maxvals = colmax(dtf)
# this outputs the results into columns, instead of corresponding rows
x = apply(dtf, 1, "/", maxvals)
# so transpose
normalized_df = t(x)
normalized_df = as.data.frame(normalized_df)
names(normalized_df) = names(dtf)
return(normalized_df)
}
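# Quick sketch of normalize_by_colmax() on a toy data frame:
example='
tmpdf = data.frame(a=c(1, 2, 4), b=c(10, 20, 5))
normalize_by_colmax(tmpdf)
# each column is divided by its own maximum, so the column maxima become 1
'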
##############################################
# Statistical tests
##############################################
# Welch's t-test (comparing means of two samples with different
# sizes and possibly different sample variances)
setup = '
samp1mean = mean_rfd_genetree_to_genetree
samp1std = std_rfd_genetree_to_genetree
samp2mean = mean_rfd_genetree_to_sptree
samp2std = std_rfd_genetree_to_sptree
'
welchs_ttest_using_means <- function(samp1mean, samp2mean, samp1std, samp2std, n1, n2)
{
#print(mean_x1)
#print(mean_x2)
mean_x1 = samp1mean
mean_x2 = samp2mean
std_x1 = samp1std
std_x2 = samp2std
# Welch's t-test:
# http://en.wikipedia.org/wiki/Welch%27s_t_test
t_top = mean_x1 - mean_x2
s_x1_x2 = ((std_x1^2/n1) + (std_x2^2/n2))
t_bot = sqrt(s_x1_x2)
t_welch = t_top / t_bot
# Calculate degrees of freedom
df_top = ((std_x1^2/n1) + (std_x2^2/n2))^2
v1 = n1 - 1
v2 = n2 - 1
df_bot1 = std_x1^4 / (n1^2 * v1)
df_bot2 = std_x2^4 / (n2^2 * v2)
degfree = df_top / (df_bot1 + df_bot2)
# One-tailed test: P(null t < observed t), i.e. the probability that a t-value
# drawn under the null falls below the observed t_welch
# (in the treecloud application: are between-cloud distances (samp2) the same
# as the null, within-cloud, distances (samp1)?)
prob_obs_t_GT_null = pt(t_welch, degfree, lower.tail=TRUE)
#prob_between_dists_x1_is_greater_than_x2 = pt(t_welch, degfree)
#cat("t_top: ", t_top, "; prob_null_between_dists_x1_not_greater_than_x2: ", prob_null_between_dists_x1_not_greater_than_x2, "; prob_between_dists_x1_is_greater_than_x2: ", prob_between_dists_x1_is_greater_than_x2, "\n", sep="")
cat("samp1=", samp1mean, "+/-", samp1std, " samp2=", samp2mean, "+/-", samp2std, " t_welch=", t_welch, " df = ", degfree, ", p(null < obs t)=", prob_obs_t_GT_null, " =P(samp2=>samp1); ", 1-prob_obs_t_GT_null, "= P(samp2<samp1) \n", sep="")
tmprow = c(samp1mean, samp2mean, t_top, std_x1, std_x2, t_welch, degfree, prob_obs_t_GT_null, 1-prob_obs_t_GT_null)
return(tmprow)
}
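# A hedged sanity check for welchs_ttest_using_means(): with raw data available,
# the same t and df can be recovered from base R's t.test() (Welch is its
# default). The samples here are invented for illustration.
example='
set.seed(1)
x1 = rnorm(20, mean=5, sd=1)
x2 = rnorm(30, mean=6, sd=2)
welchs_ttest_using_means(mean(x1), mean(x2), sd(x1), sd(x2), n1=20, n2=30)
t.test(x1, x2)  # t and df should match t_welch and degfree printed above
'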
# Pointwise minimum of two normal densities; integrate() over this gives the
# overlap area of the two distributions (see the sketch below)
min.f1f2 <- function(x, mu1, mu2, sd1, sd2)
{
f1 <- dnorm(x, mean=mu1, sd=sd1)
f2 <- dnorm(x, mean=mu2, sd=sd2)
pmin(f1, f2)
}
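# Typical use of min.f1f2(): integrate it over the real line to get the overlap
# area of two normal distributions.
example='
overlap = integrate(min.f1f2, lower=-Inf, upper=Inf, mu1=0, mu2=1, sd1=1, sd2=1)
overlap$value  # ~0.617 for N(0,1) vs. N(1,1), i.e. 2*pnorm(-0.5)
'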
##############################################
# Matrix functions
##############################################
# symmetric flip across the diagonal of a matrix
# (for a square matrix this is just t(mat); the loops are kept for clarity)
flipdiag <- function(mat)
{
newmat = mat
for (i in 1:ncol(mat))
{
for (j in 1:nrow(mat))
{
newmat[j,i] = mat[i,j]
}
}
return(newmat)
}
nondiagTF <- function(m)
{
# Returns TF for non-diagonals
diag(m) = "blah"
TF = m != "blah"
return(TF)
}
nondiag <- function(m)
{
# Returns non-diagonals
nondiag_m = m[nondiagTF(m)]
return(nondiag_m)
}
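# Brief sketch of the matrix helpers above:
example='
m = matrix(1:9, nrow=3)
flipdiag(m)  # same as t(m) for a square matrix
nondiag(m)   # the six off-diagonal elements, in column order
'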
# Compare lower and upper diagonals
compare_lower_upper_diag <- function(x)
{
xtri = x[lower.tri(x)]
ytmp = flipdiag(x)
ytri = ytmp[lower.tri(ytmp)]
print(cor(cbind(xtri, ytri)))
dif = mean(xtri) - mean(ytri)
prstr = paste(mean(xtri), mean(ytri), dif, sep=" ")
print(prstr)
plot(xtri, ytri, type="p", pch=".")
return(dif)
}
# Compare row/col
compare_row_col <- function(x, rownum)
{
xnums = c(x[rownum, ])
ynums = c(x[, rownum])
print(cor(cbind(xnums, ynums)))
dif = mean(xnums) - mean(ynums)
prstr = paste(mean(xnums), mean(ynums), dif, sep=" ")
print(prstr)
plot(xnums, ynums, type="p", pch=".")
return(dif)
}
# Find rows containing thing_to_find
# (note that this returns one TRUE/FALSE per row, not the row numbers)
findrows_containing <- function(x, thing_to_find)
{
rows_to_keep = rep(FALSE, nrow(x))
for (i in 1:nrow(x))
{
if(sum(x[i, ] == thing_to_find, na.rm = TRUE) > 0)
{
rows_to_keep[i] = TRUE
}
else
{
rows_to_keep[i] = FALSE
}
}
return(rows_to_keep)
}
# Find rows containing no NAs
findrows_containing_noNA <- function(x)
{
rows_to_keep = is.na(rowSums(x))
return(rows_to_keep == FALSE)
}
# Find rows containing no Infs
findrows_containing_noINF <- function(x)
{
rows_to_keep = is.infinite(rowSums(x))
return(rows_to_keep == FALSE)
}
# Find rows containing no NAs, no Infs
return_rows_containing_noNA_noInf <- function(x)
{
rows_to_keep = 1:nrow(x)
rows_to_keep = findrows_containing_noNA(x)
y = x[rows_to_keep, ]
rows_to_keep = findrows_containing_noINF(y)
z = y[rows_to_keep, ]
return(z)
}
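# Sketch of the row-cleaning helpers on a toy matrix:
example='
m = matrix(c(1, NA, 3, 4, Inf, 6, 7, 8, 9), nrow=3, byrow=TRUE)
return_rows_containing_noNA_noInf(m)
# keeps only the last row (returned as a plain vector, since single-row
# subsetting drops the matrix dimensions)
'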
# Linear color scheme
defaults = '
nums = rnorm(1000, mean=500, sd=200)
numcolors = length(nums)
minval = 300
maxval = 700
mincolor = "yellow"
maxcolor = "red"
scaling=1
nums=avg_percent_diff
minval=0.35
maxval=0.52
'
colorscheme <- function(nums = rnorm(1000, mean=500, sd=200), numcolors = length(nums), minval = 300, maxval = 700, mincolor = "red", maxcolor = "yellow", scaling=1)
{
# for smoothColors (from package plotrix):
# "smoothColors calculates a sequence of colors. If two color names in the
# arguments are separated by a number, that number of interpolated colors
# will be inserted between the two color endpoints."
require(plotrix)
# make a color table
tmpdf1 = cbind(1:length(nums), nums)
# sort it
tmpdf2 = tmpdf1[order(nums), ]
numvals_below_min = sum(nums <= minval)
numvals_above_max = sum(nums >= maxval)
minval_index = numvals_below_min
maxval_index = (length(nums) - numvals_above_max)
numvals_that_change = maxval_index - minval_index - 2
vals_below_min = 1:numvals_below_min
vals_above_max = (length(nums) - numvals_above_max) : length(nums)
# put in the changing colors
changing_colors = smoothColors(mincolor, numvals_that_change, maxcolor)
# rank scaling
if (scaling == "rank")
{
changing_colors_final = changing_colors
}
else
{
# scaling is an exponent
ttmpdf3 = tmpdf2[(minval_index+1) : (maxval_index), ]
ttmpdf4 = ttmpdf3
numer = ttmpdf3[, 2]-min(ttmpdf3[, 2])
denom = max(ttmpdf3[, 2]) - min(ttmpdf3[, 2])
color_indices = 1+round(numvals_that_change * (numer/denom)^scaling )
changing_colors_final = changing_colors[color_indices]
#plot(color_indices, rep(1, length(color_indices)), col=changing_colors_final)
#title("Colors that are changing")
#xy = dotPlot(nums)
#xy
}
color_list = c( rep(mincolor, numvals_below_min), changing_colors_final, rep(maxcolor, numvals_above_max) )
print(length(color_list))
# Get the xy coordinates of the dotPlot
xylist = get_dotPlot_coords(nums)
# Plot color scheme
# set new params but save old params
old_params = par(mfrow=c(3, 1))
# Points ranked by value
plot(1:length(nums), rep(1, length(nums)), col=color_list)
title("Colors in order")
# Stacked-dot plot (histogram-like)
plot(xylist[,1], xylist[,2], col="black", pch=1)
title("Stacked-dot plot of input numbers")
plot(xylist[,1], xylist[,2], col=color_list, pch=19)
title("Stacked-dot plot of input numbers, colored by approximate assigned color")
tmpdf3 = cbind(tmpdf2, color_list)
tmpdf4 = tmpdf3[order(as.numeric(tmpdf3[,1])), ]
tmpdf4 = adf(tmpdf4)
names(tmpdf4) = c("orig_order", "nums", "color_list")
par(old_params)
return(tmpdf4)
}
get_dotPlot_coords <- function (x, group, xlim, ylim, col, xlab, ylab, pch, cex, breaks, stacked = TRUE, ...)
{
xlist = NULL
ylist = NULL
DB = FALSE
pch.size = "O"
grouped = TRUE
parList = list(...)
if (missing(xlab))
xlab = deparse(substitute(x))
x = x[!is.na(x)]
if (missing(xlim))
xlim = range(x)
if (missing(ylim))
ylim = c(0, 1)
if (missing(ylab))
ylab = ""
if (missing(cex))
cex = 1
if (missing(group))
group = rep(1, length(x))
if (length(unique(group)) == 1)
grouped = FALSE
if (missing(pch) || length(unique(group)) > length(pch))
pch = 1:length(unique(group))
if (missing(col) || length(unique(group)) > length(col))
col = 1:length(unique(group))
if (missing(breaks))
{
plot(1, 1, xlim = xlim, ylim = ylim, type = "n", axes = FALSE, cex = cex, xlab = xlab, ylab = ylab)
slotSizeX = strwidth(pch.size, units = "user", cex = cex)
if (DB)
print(paste("slotSizeX:", slotSizeX))
span = diff(range(x))
temp1 = ppoints(2 * ceiling(span/slotSizeX))
temp2 = numeric(length(temp1) + 2)
temp2[2:(length(temp1) + 1)] = temp1
temp2[1] = temp2[1] - 1.01 * diff(c(temp1[1], temp1[2]))
temp2[length(temp2)] = rev(temp1)[1] + 1.01 * diff(c(temp1[1],
temp1[2]))
temp2 = temp2 * span + min(x)
breaks = temp2
}
slotSizeY = strheight(pch.size, units = "user", cex = cex)
if (DB)
print(paste("slotSizeY:", slotSizeY))
span = diff(ylim)
temp1 = ppoints(2 * ceiling(span/slotSizeY))
temp2 = numeric(length(temp1) + 2)
temp2[2:(length(temp1) + 1)] = temp1
temp2[1] = temp2[1] - 1.01 * diff(c(temp1[1], temp1[2]))
temp2[length(temp2)] = rev(temp1)[1] + 1.01 * diff(c(temp1[1],
temp1[2]))
yVec = temp2 * span + min(ylim)
if (yVec[1] < 0)
yVec = yVec + abs(yVec[1])
else yVec = yVec - yVec[1]
if (DB)
print(paste("temp2:", temp2))
if (DB)
print(paste("breaks:", breaks))
histObj = hist(x, breaks = breaks, right = FALSE, plot = FALSE)
hMids = histObj$mids
hCounts = histObj$counts
mat = matrix(NA, nrow = length(x), ncol = length(hMids))
colMat = mat
groupmat = mat
numVec = 1:nrow(mat)
cutOff = 1
groupList = vector(mode = "list", length = length(unique(group)))
for (k in unique(group))
{
histObj = hist(x[group == k], breaks = breaks, plot = FALSE)
hMids = histObj$mids
hCounts = histObj$counts
for (i in seq(along = hMids))
{
value = pch[k]
colValue = col[k]
from = 0
from = numVec[is.na(mat[, i])][1]
to = from
if (hCounts[i] == 0)
value = NA
if (hCounts[i] >= 1)
to = to + hCounts[i] - 1
if (to > cutOff)
cutOff = to
if (DB)
{
print(paste("from:", from))
print(paste("to:", to))
print(paste("i:", i))
print(paste("value:", value))
}
mat[from:to, i] = value
colMat[from:to, i] = colValue
}
groupList[[k]] = groupmat
}
if (grouped && !stacked)
{
groupIndex = unique(group)
par(mfrow = c(length(groupIndex), 1))
#for (i in groupIndex) dotPlot(x[group == i], xlim = xlim, breaks = breaks, cex = cex, xlab = xlab, ylab = ylab, col = col, pch = pch, ...)
}
else
{
mat = mat[1:cutOff, ]
if (!is.matrix(mat))
mat = matrix(mat, nrow = 1)
if (DB)
print(mat)
#plot(1, 1, xlim = xlim, ylim = ylim, type = "n", cex = cex, xlab = xlab, ylab = ylab, ...)
for (i in 1:nrow(mat))
{
x = hMids[!is.na(mat[i, ])]
y = rep(i * 0.3, times = length(x))
y = rep(yVec[i], times = length(x))
col = colMat[i, !is.na(mat[i, ])]
pch = mat[i, !is.na(mat[i, ])]
#points(x, y, col = col, pch = pch, cex = cex)
xlist = c(xlist, x)
ylist = c(ylist, y)
}
}
xylist = cbind(xlist, ylist)
xylist = xylist[order(xlist), ]
return(xylist)
#if (DB)
# print(hMids)
#invisible(mat)
}
##############################################
# Plotting functions
##############################################
# Modified from mrds package, http://www.inside-r.org/packages/cran/mrds/docs/histline , e.g.
# Need "mrds:::" as the author left @export out the .R file...
# mrds:::histline(height=histvals$density, breaks=histvals$breaks, lineonly=FALSE, outline=TRUE, fill=FALSE)
# mrds:::histline(height=histvals$density, breaks=histvals$breaks, lineonly=TRUE, outline=TRUE, fill=FALSE)
# TURN THIS BACK ON AFTER UPGRADING TO OPTIMX 2013
junk='
mrds:::histline
function (height, breaks, lineonly = FALSE, outline = FALSE,
fill = FALSE, ylim = range(height), xlab = "x", ylab = "y",
det.plot = FALSE, ...)
{
n = length(height)
if (length(breaks) != (n + 1))
stop("breaks must be 1 longer than height")
if (outline) {
y = c(0, rep(height, times = rep(2, n)), 0)
x = rep(breaks, times = rep(2, (n + 1)))
}
else {
y = rep(0, 4 * n)
x = rep(0, 4 * n + 2)
for (i in 1:n) {
y[((i - 1) * 4 + 1):(i * 4)] = c(0, rep(height[i],
2), 0)
x[((i - 1) * 4 + 1):(i * 4)] = c(rep(breaks[i], 2),
rep(breaks[i + 1], 2))
}
x = x[1:(4 * n)]
}
if (lineonly) {
if (!fill)
lines(x, y, ...)
else polygon(x, y, ...)
}
else {
if (!fill)
plot(x, y, type = "l", ylim = ylim, xlab = xlab,
ylab = ylab, ...)
else {
if (det.plot) {
plot(x, y, type = "n", ylim = ylim, xlab = xlab,
ylab = ylab, yaxp = c(0, 1, 5))
}
else {
plot(x, y, type = "n", ylim = ylim, xlab = xlab,
ylab = ylab)
}
polygon(x, y, ...)
}
}
}
'
# Square the x and y dimensions
squareplot <- function(x, y)
{
# Scatter plot with axes drawn on the same scale
# I'd like to produce some scatter plots where N units on the X axis are > equal to N units
# on the Y axis (as measured with a ruler, on screen or paper).
# x <- sample(10:200,40)
# y <- sample(20:100,40)
# windows(width = max(x),height = max(y))
# plot(x,y)
# try:
plot(x, y, asp = 1)
# or, better:
# library(MASS)
# eqscplot(x,y)
# or
# library(lattice)
# xyplot(y ~ x, aspect = "iso")
}
# Make some subplots
# (quartz() is macOS-only; on other platforms substitute dev.new() or x11())
subplots <- function(nrows=1, ncols=2)
{
q = quartz(width=7*ncols, height=7*nrows)
par(mfcol = c(nrows, ncols))
return(q)
}
# Pairs plot with histograms
panel.hist <- function(x, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks)
y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1], y, col="cyan", ...)
}
pairs_with_hist <- function(datamat, ...)
{
pairs(datamat, diag.panel=panel.hist, ...)
return()
}
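# Usage sketch with a built-in dataset:
example='
pairs_with_hist(iris[, 1:4])
'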
setup='
xvals = UK_input[,2]
yvals = UK_input[,1]
nbins=10
'
meansplot = function(xvals, yvals, nbins=10, tmppch=1)
{
bins_to_plot = seq(min(xvals), max(xvals), (max(xvals)-min(xvals))/nbins )
mean_xvals = NULL
mean_yvals = NULL
for (i in 1:(length(bins_to_plot)-1) )
{
binstart = bins_to_plot[i]
binend = bins_to_plot[i+1]
TF1 = xvals >= binstart
TF2 = xvals < binend
TF = TF1 + TF2 == 2
mean_xvals = c(mean_xvals, mean(xvals[TF]))
mean_yvals = c(mean_yvals, mean(yvals[TF]))
}
lm1 = linear_regression_plot(mean_xvals, mean_yvals, tmppch=tmppch)
return(lm1)
}
setup = '
x = ipd$rating
y = ipd$num_plays
xlabel="len_sec"
ylabel="num_plays"
'
# If slope1=TRUE, subtract 1:1 slope from the line and test for differences
linear_regression_plot <- function(x, y, xlabel="x", ylabel="y", tmppch=".", printall=TRUE, tmplinecol="black", tmplty=1, tmplwd=1, plottext=TRUE, legend_x="topleft", legend_y=NULL, xlim=minmax_pretty(x), ylim=minmax_pretty(y), slope1=FALSE, intercept_in_legend=TRUE, ...)
{
# Make a linear regression plot
model1 = lm(y~x)
print("Summary of model 1 (standard regression, y predicted by x)")
print(summary(model1))
slope = model1$coefficients[2]
intercept = model1$coefficients[1]
plot(x, y, xlab=xlabel, ylab=ylabel, pch=tmppch, xlim=xlim, ylim=ylim, ...)
tmpx1 = min(x, na.rm=TRUE)
tmpx2 = max(x, na.rm=TRUE)
tmpy1 = slope*tmpx1 + intercept
tmpy2 = slope*tmpx2 + intercept
segments(x0=tmpx1, y0=tmpy1, x1=tmpx2, y1=tmpy2, col=tmplinecol, lty=tmplty, lwd=tmplwd)
if (plottext)
{
# Subtract 1:1 slope if desired
# (for Patrick Shih cyanobacteria analysis)
if (slope1 == TRUE)
{
# Subtract 1:1 from the y values
ynew = y-x
model2 = lm(ynew~x)
pval = summary(model2)$coefficients[2,4]
print("Summary of model 2 (standard regression, (y-x) predicted by x; this means we've substracted out the 1:1 expected relationship.)")
print(summary(model2))
} else {
pval = summary(model1)$coefficients[2,4]
} # END if (slope1 == TRUE)
R2 = summary(model1)$r.squared
slope = summary(model1)$coefficients[2,1]
slopeSE = summary(model1)$coefficients[2,2]
slope95 = 1.96*slopeSE
intercept = summary(model1)$coefficients[1,1]
interceptSE = summary(model1)$coefficients[1,2]
intercept95 = 1.96*interceptSE
R2txt = paste("R2 = ", format(R2, digits=3), sep="")
slopetxt = paste("m=", format(slope, digits=3), " +/- ", format(slope95, digits=3), sep="")
intercepttxt = paste("intercept=", format(intercept, digits=3), " +/- ", format(intercept95, digits=3), sep="")
if (slope1 == TRUE)
{
pvaltxt = paste("p = ", format(pval, digits=3), " (null: slope is 1:1)", sep="")
} else {
pvaltxt = paste("p = ", format(pval, digits=3), sep="")
} # END if (slope1 == TRUE)
if (intercept_in_legend==TRUE)
{
txt_to_plot = paste(R2txt, slopetxt, intercepttxt, pvaltxt, sep="\n")
} else {
txt_to_plot = paste(R2txt, slopetxt, pvaltxt, sep="\n")
}
# http://stackoverflow.com/questions/3761410/how-can-i-plot-my-r-squared-value-on-my-scatterplot-using-r
# bty suppresses box
# print(legend_x)
# print(legend_y)
legend(x=legend_x, y=legend_y, bty="n", legend=txt_to_plot, cex=0.9)
}
if (printall == TRUE)
{
print(model1)
print(summary(model1))
}
return(model1)
}
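# A minimal sketch of linear_regression_plot() on simulated data
# (minmax_pretty() is assumed to be defined elsewhere in this file):
example='
set.seed(1)
x = runif(50, 0, 10)
y = 2*x + rnorm(50, sd=2)
lm1 = linear_regression_plot(x, y, xlabel="x", ylabel="y", tmppch=1)
'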
extract_lm_stats <- function(tmplm)
{
R2 = summary(tmplm)$r.squared
# Get the slp information
slp = summary(tmplm)$coefficients[2,1]
slpSE = summary(tmplm)$coefficients[2,2]
slp_2sigma = 1.96*slpSE
slp0025 = slp - 1.96*slpSE
slp0975 = slp + 1.96*slpSE
slp_pval = summary(tmplm)$coefficients[2,4]
slp_tval = summary(tmplm)$coefficients[2,3]
slp_sig = pvals_to_sigstars(slp_pval)
# Get the intercept (int) information
int = summary(tmplm)$coefficients[1,1]
intSE = summary(tmplm)$coefficients[1,2]
int_2sigma = 1.96*intSE
int0025 = int - 1.96*intSE
int0975 = int + 1.96*intSE
int_pval = summary(tmplm)$coefficients[1,4]
int_tval = summary(tmplm)$coefficients[1,3]
int_sig = pvals_to_sigstars(int_pval)
tmprow = c(R2, slp, slpSE, slp_2sigma, slp0025, slp0975, slp_pval, slp_tval, slp_sig, int, intSE, int_2sigma, int0025, int0975, int_pval, int_tval, int_sig)
tmpcolnames = c("R2", "slp", "slpSE", "slp_2sigma", "slp0025", "slp0975", "slp_pval", "slp_tval", "slp_sig", "int", "intSE", "int_2sigma", "int0025", "int0975", "int_pval", "int_tval", "int_sig")
return(tmprow)
}
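# Sketch of extract_lm_stats() on a quick regression. NOTE: the returned vector
# is character, since the significance stars are concatenated in; also,
# pvals_to_sigstars() (defined below) must be loaded before calling this.
example='
set.seed(1)
x = 1:20
y = 3 + 0.5*x + rnorm(20)
extract_lm_stats(lm(y ~ x))
'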
extract_glm_coefs <- function(glmobj)
{
#R2 = summary(glmobj)$r.squared
# Get the slp information
coefs = summary(glmobj)$coefficients
names_coefs = rownames(coefs)
names_params = colnames(coefs)
coefs_list = rep(NULL, times=nrow(coefs)*ncol(coefs))
names_list = rep(NULL, times=nrow(coefs)*ncol(coefs))
conf_intervals = confint(glmobj)
lower025 = rep(NULL, times=nrow(coefs))
upper975 = rep(NULL, times=nrow(coefs))
names_lower025 = paste("lower025_", names_coefs, sep="")
names_upper975 = paste("upper975_", names_coefs, sep="")
onum = 0
for (n in 1:length(names_coefs))
{
for (m in 1:length(names_params))
{
onum = onum+1
coefs_list[onum] = coefs[n,m]
# Fix names
tmpname = paste(names_coefs[n], "_", names_params[m], sep="")
tmpname0 = gsub("(Intercept)", "Intercept", tmpname, fixed=TRUE)
tmpname1 = gsub("Std. Error", "stdErr", tmpname0)
tmpname2 = gsub("z value", "zval", tmpname1)
tmpname3 = gsub(" ", "_", tmpname2)
tmpname4 = gsub("\\.", "_", tmpname3)
tmpname5 = gsub("Pr(>|z|)", "pvalGTz", tmpname4, fixed=TRUE)
names_list[onum] = tmpname5
}
# Don't calculate using the normal distribution, instead use confint()
# lower025[n] = coefs[n,1] - (1.96 * coefs[n,2])
# upper975[n] = coefs[n,1] + (1.96 * coefs[n,2])
lower025[n] = conf_intervals[n,1]
upper975[n] = conf_intervals[n,2]
}
coefs_list
names_list
tmprow = c(coefs_list, lower025, upper975)
tmpcolnames = c(names_list, names_lower025, names_upper975)
names(tmprow) = tmpcolnames
glmcoefs = tmprow
return(glmcoefs)
}
extract_glm_stats <- function(glmobj)
{
a = summary(glmobj)$deviance
b = summary(glmobj)$aic
c = summary(glmobj)$df.residual
d = summary(glmobj)$null.deviance
e = summary(glmobj)$df.null
tmprow = c(a, b, c, d, e)
tmpnames = c("deviance", "aic", "df.residual", "null.deviance", "df.null")
names(tmprow) = tmpnames
tmpstats = tmprow
return(tmpstats)
}
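# Hedged sketch of the glm extractors on simulated logistic-regression data
# (confint() on a glm profiles the likelihood, so MASS must be installed):
example='
set.seed(1)
tmpdf = data.frame(x=rnorm(50))
tmpdf$y = rbinom(50, size=1, prob=plogis(tmpdf$x))
g = glm(y ~ x, data=tmpdf, family=binomial)
extract_glm_coefs(g)
extract_glm_stats(g)
'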
# Extract the approximate 95% confidence intervals
# as brunch() doesn't have a confint method
confint_brunch <- function(brunchMod)
{
coefs = summary(brunchMod)$coefficients
nrows = nrow(coefs)
brunch_CI = NULL
#check_for_NA
if (is.na(coefs[1]))
{
return(NA)
}
for (j in 1:nrows)
{
lower025 = coefs[j,"Estimate"] - 1.96 * coefs[j,"Std. Error"]
upper975 = coefs[j,"Estimate"] + 1.96 * coefs[j,"Std. Error"]
CIs = c(lower025, upper975)
CIs2 = matrix(CIs, nrow=1, ncol=2)
brunch_CI = rbind(brunch_CI, CIs2)
}
rownames(brunch_CI) = rownames(coefs)
colnames(brunch_CI) = c("2.5 %", "97.5 %")
return(brunch_CI)
}
# Extract the approximate 95% confidence intervals
# as compar.gee() doesn't have a confint method
confint_gee <- function(geeobj)
{
coefs = extract_gee_coefmat(geeobj)
nrows = nrow(coefs)
gee_CI = NULL
#check_for_NA
if (is.na(coefs[1]))
{
return(NA)
}
for (j in 1:nrows)
{
lower025 = coefs[j,"Estimate"] - 1.96 * coefs[j,"S.E."]
upper975 = coefs[j,"Estimate"] + 1.96 * coefs[j,"S.E."]
CIs = c(lower025, upper975)
CIs2 = matrix(CIs, nrow=1, ncol=2)
gee_CI = rbind(gee_CI, CIs2)
}
rownames(gee_CI) = rownames(coefs)
colnames(gee_CI) = c("2.5 %", "97.5 %")
return(gee_CI)
}
extract_brunch_coefs <- function(brunchMod)
{
require(caper) # For brunch
#######################################################
# caper's brunch algorithm
# caper.pdf
# "The 'brunch' algorithm calculates contrasts for models that include binary categorical variables.
# Contrasts are identified and calculated for all variables in the model for a set of nodes where each
# side can be unequivocally attributed to one or other of the categories. Unlike 'crunch', nested contrasts
# are not calculated and each row of data at the tips is used only once. This follows Burt (1989):
# contrasts whose paths do not meet or cross at any point will be phylogenetically independent."
#######################################################
#R2 = summary(brunchMod)$r.squared
# Get the slp information
coefs = summary(brunchMod)$coefficients
#check_for_NA
if (is.na(coefs[1]))
{
return(NA)
}
names_coefs = rownames(coefs)
names_params = colnames(coefs)
coefs_list = rep(NULL, times=nrow(coefs)*ncol(coefs))
names_list = rep(NULL, times=nrow(coefs)*ncol(coefs))
conf_intervals = confint_brunch(brunchMod)
lower025 = rep(NULL, times=nrow(coefs))
upper975 = rep(NULL, times=nrow(coefs))
names_lower025 = paste("lower025_", names_coefs, sep="")
names_upper975 = paste("upper975_", names_coefs, sep="")
onum = 0
for (n in 1:length(names_coefs))
{
for (m in 1:length(names_params))
{
onum = onum+1
coefs_list[onum] = coefs[n,m]
# Fix names
tmpname = paste(names_coefs[n], "_", names_params[m], sep="")
tmpname0 = gsub("(Intercept)", "Intercept", tmpname, fixed=TRUE)
tmpname1 = gsub("Std. Error", "stdErr", tmpname0)
tmpname2 = gsub("z value", "zval", tmpname1)
tmpname3 = gsub(" ", "_", tmpname2)
tmpname4 = gsub("\\.", "_", tmpname3)
tmpname5 = gsub("Pr(>|z|)", "pvalGTz", tmpname4, fixed=TRUE)
tmpname6 = gsub("Pr(>|t|)", "pvalGTt", tmpname5, fixed=TRUE)
names_list[onum] = tmpname6
}
# Don't calculate using the normal distribution, instead use confint()
# lower025[n] = coefs[n,1] - (1.96 * coefs[n,2])
# upper975[n] = coefs[n,1] + (1.96 * coefs[n,2])
lower025[n] = conf_intervals[n,1]
upper975[n] = conf_intervals[n,2]
}
coefs_list
names_list
tmprow = c(coefs_list, lower025, upper975)
tmpcolnames = c(names_list, names_lower025, names_upper975)
names(tmprow) = tmpcolnames
glmcoefs = tmprow
return(glmcoefs)
}
extract_brunch_stats <- function(brunchMod)
{
#check_for_NA
coefs = summary(brunchMod)$coefficients
if (is.na(coefs[1]))
{
return(NA)
}
a = summary(brunchMod)$sigma
b = summary(brunchMod)$df
c = summary(brunchMod)$r.squared
d = summary(brunchMod)$adj.r.squared
e = summary(brunchMod)$fstatistic
f = summary(brunchMod)$cov.unscaled
tmprow = c(a, b, c, d, e, f)
tmpnames = c("sigma", "df1", "df2", "df3", "r.squared", "adj.r.squared", "Fstat_value", "Fstat_numdf", "Fstat_dendf", "cov.unscaled")
names(tmprow) = tmpnames
tmpstats = tmprow
return(tmpstats)
}
extract_gee_stats <- function(geeobj)
{
#check_for_NA
coefs = extract_gee_coefmat(geeobj)
if (is.na(coefs[1]))
{
return(NA)
}
#
# nobs - the number of observations.
# QIC - the quasilikelihood information criterion as defined by Pan (2001).
# coefficients - the estimated coefficients (or regression parameters).
# scale - the scale (or dispersion parameter).
# W - the variance-covariance matrix of the estimated coefficients.
# dfP - the phylogenetic degrees of freedom (see Paradis and Claude for details on this).
#
a = geeobj$nobs
b = geeobj$dfP
c = geeobj$scale
d = geeobj$QIC
tmprow = c(a, b, c, d)
tmpnames = c("nobs", "dfP", "scale", "QIC")
names(tmprow) = tmpnames
tmpstats = tmprow
return(tmpstats)
}
extract_gee_coefmat <- function(geeobj)
{
#R2 = summary(geeobj)$r.squared
# modified from print.compar.gee
NAs <- is.na(geeobj$coef)
coefmat <- geeobj$coef[!NAs]
cnames <- names(coefmat)
coefmat <- matrix(rep(coefmat, 4), ncol = 4)
dimnames(coefmat) <- list(cnames, c("Estimate", "S.E.", "t",
"Pr(T > |t|)"))
df <- geeobj$dfP - dim(coefmat)[1]
coefmat[, 2] <- sqrt(diag(geeobj$W))
coefmat[, 3] <- coefmat[, 1]/coefmat[, 2]
# pt = prob. t-distribution
if (df < 0)
{
warning("not enough degrees of freedom to compute P-values.")
coefmat[, 4] <- NA
} else {
coefmat[, 4] <- 2 * (1 - pt(abs(coefmat[, 3]), df))
}
coefmat
return(coefmat)
}
extract_gee_coefs <- function(geeobj)
{
coefmat = extract_gee_coefmat(geeobj)
coefmat
coefs = coefmat
names_coefs = rownames(coefs)
names_params = colnames(coefs)
coefs_list = rep(NULL, times=nrow(coefs)*ncol(coefs))
names_list = rep(NULL, times=nrow(coefs)*ncol(coefs))
conf_intervals = confint_gee(geeobj)
lower025 = rep(NULL, times=nrow(coefs))
upper975 = rep(NULL, times=nrow(coefs))
names_lower025 = paste("lower025_", names_coefs, sep="")
names_upper975 = paste("upper975_", names_coefs, sep="")
onum = 0
for (n in 1:length(names_coefs))
{
for (m in 1:length(names_params))
{
onum = onum+1
coefs_list[onum] = coefs[n,m]
# Fix names
tmpname = paste(names_coefs[n], "_", names_params[m], sep="")
tmpname0 = gsub("(Intercept)", "Intercept", tmpname, fixed=TRUE)
tmpname1 = gsub("S.E.", "stdErr", tmpname0)
tmpname2 = gsub("Intercept t", "Int_tval", tmpname1)
tmpname3 = gsub("Pr(T > |t|)", "pvalGTt", tmpname2, fixed=TRUE)
tmpname4 = gsub(" ", "_", tmpname3)
tmpname5 = gsub("\\.", "_", tmpname4)
names_list[onum] = tmpname5
}
# Don't calculate using the normal distribution, instead use confint()
# lower025[n] = coefs[n,1] - (1.96 * coefs[n,2])
# upper975[n] = coefs[n,1] + (1.96 * coefs[n,2])
lower025[n] = conf_intervals[n,1]
upper975[n] = conf_intervals[n,2]
}
coefs_list
names_list
tmprow = c(coefs_list, lower025, upper975)
tmpcolnames = c(names_list, names_lower025, names_upper975)
names(tmprow) = tmpcolnames
geecoefs = tmprow
return(geecoefs)
}
lmstats_to_string_for_plot <- function(tmprow)
{
R2txt = paste("R2 = ", format(as.numeric(tmprow[1]), digits=3), sep="")
slopetxt = paste("m=", format(as.numeric(tmprow[2]), digits=3), " +/- ", format(as.numeric(tmprow[4]), digits=3), sep="")
pvaltxt = paste("p = ", format(as.numeric(tmprow[7]), digits=3), sep="")
txt_to_plot = paste(R2txt, slopetxt, pvaltxt, sep="\n")
return(txt_to_plot)
}
pvals_to_sigstars <- function(pval, pval1star=0.05, pval2star=0.01, pval3star=0.001)
{
sigstar = " "
if (pval < 0.05)
{
sigstar = "*"
}
if (pval < 0.01)
{
sigstar = "**"
}
if (pval < 0.001)
{
sigstar = "***"
}
return(sigstar)
}
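# Quick sketch:
example='
pvals_to_sigstars(0.04)    # "*"
pvals_to_sigstars(0.0005)  # "***"
pvals_to_sigstars(0.5)     # " "
'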
# Linear colorscale
# Take some data, assign a color to each point, linearly on the colorscale
setup='
#
datapoints = exp(tmp_points_to_plot$response) / (1 + exp(tmp_points_to_plot$response))
colorscale = colorlist
datapoints=tmp_datapoints
colorscale=tmp_colorscale
rescale=FALSE
minscale=""
maxscale=""
minscale=0
maxscale=1
minscale=cellStats(marsgrid4, "min")
maxscale=cellStats(marsgrid4, "max")
'
linear_colorscale <- function(datapoints, colorscale, rescale=FALSE, minscale="", maxscale="", minout="", maxout="")
{
length_colorscale = length(colorscale)
minval = min(datapoints)
maxval = max(datapoints)
# for linear scaling from 1 to numpoints
if (rescale == TRUE)
{
if (minscale == "")
{
colorscale_indices = ceiling( ( ((datapoints - minval) / (maxval-minval)) * (length_colorscale-1) ))
} else {
#minscale = min(datapoints)
#maxscale = max(datapoints)
# no rescaling, just raw values on the 0-1 scale
colorscale_indices = ceiling( ( ((datapoints - minval) / (maxval-minval)) * (maxscale - minscale) + minscale ) * (length_colorscale-1) )
}
} else {
if (minscale == "")
{
# no rescaling, just raw values on the 0-1 scale
colorscale_indices = ceiling( datapoints * (length_colorscale-1) )
# Correction, if the points are negative, make that the new zero
if (minval < 0)
{
datapoints_new = datapoints - minval
datapoints_new = datapoints_new * maxval
colorscale_indices = ceiling( datapoints_new * (length_colorscale-1) )
}
} else {
#minscale = min(datapoints)
#maxscale = max(datapoints)
# no rescaling, just raw values on the minscale-maxscale scale
colorscale_indices = ceiling( datapoints * (length_colorscale-1) )
colorscale_indices[datapoints >= maxscale] = length_colorscale
colorscale_indices[datapoints <= minscale] = 1
}
}
# Clamp into 1..length_colorscale (ceiling() can yield 0 at the minimum value)
colorscale_indices[colorscale_indices < 1] = 1
colorscale_indices[colorscale_indices > length_colorscale] = length_colorscale
colors_to_plot_per_datapoint = colorscale[colorscale_indices]
return(colors_to_plot_per_datapoint)
}
setup='
datapoints=tmp_datapoints
colorscale=tmp_colorscale
minin=tmpmin
maxin=tmpmax
minin=""
maxin=""
minout=""
maxout=""
'
linear_colorscale2 <- function(datapoints, colorscale, rescale=FALSE, minin="", maxin="", minout="", maxout="")
{
length_colorscale = length(colorscale)
if (minin == "")
{
minin = min(datapoints)
}
if (maxin == "")
{
maxin = max(datapoints)
}
datapoints[datapoints < minin] = minin
datapoints[datapoints > maxin] = maxin
# minout / maxout -- the proportions of the colorscale that you want to occupy...
if (minout == "")
{
minout = 0
}
if (maxout == "")
{
maxout = 1
}
colorscale_indices = ceiling( ( ((datapoints - minin) / (maxin-minin)) * (maxout-minout) + minout) * (length_colorscale-1) )
# Clamp into 1..length_colorscale (ceiling() can yield 0 at the minimum value)
colorscale_indices[colorscale_indices < 1] = 1
colorscale_indices[colorscale_indices > length_colorscale] = length_colorscale
colors_to_plot_per_datapoint = colorscale[colorscale_indices]
return(colors_to_plot_per_datapoint)
}
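# Quick sketch of linear_colorscale2() with a standard grDevices palette:
example='
vals = c(0.1, 0.5, 0.9)
cols = linear_colorscale2(vals, colorscale=heat.colors(100))
plot(vals, rep(1, 3), col=cols, pch=19, cex=3)
'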
make_grid_from_coords <- function(xcoords, ycoords)
{
require(sp)
# Build every x/y combination (note the each=, so that all pairs are
# generated, not just element-by-element recycling)
xcoords1 = rep(xcoords, times=length(ycoords))
ycoords1 = rep(ycoords, each=length(xcoords))
xy = cbind(xcoords1, ycoords1)
#plot(xy)
# Convert to SpatialPoints
S = SpatialPoints(xy)
class(S)
# NOTE: envproj is assumed to be a CRS defined in the calling environment
proj4string(S) = envproj
# Convert to SpatialPixels
G = S
gridded(G) = TRUE
#plot(G)
class(G)
return(G)
}
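# Usage sketch (assumes the sp package; envproj is set here to a plausible
# longlat CRS purely for illustration):
example='
library(sp)
envproj = CRS("+proj=longlat +datum=WGS84")
G = make_grid_from_coords(xcoords=seq(0, 5, 1), ycoords=seq(0, 3, 1))
plot(G)
'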
# =========================================================
# Google-related functions
# =========================================================
# Copy the corrected google numbers to each matching quote
# (one could also split the results amongst the matching quotes; see next function)
copy_fixed_quantchars_to_each_matching <- function(quantchars, quotes_list_no_extra_spaces_to_google, fixed_goog, cols_in_fixed_goog, unique_quotes_to_google, divide_by_frequency = TRUE)
{
quantchar_copy_nums_to_repeated_quotes = NULL
q = NULL
# the full list of quotes (from whatever source; full quantchars dataset, or subset)
rownames = quantchars$rownames
quotetxt = quotes_list_no_extra_spaces_to_google
numcols = ncol(quantchars) - 1
tmprow = rep(NA, numcols)
# store the accumulating rows here
tmpmat = NULL
names_cols_in_fixed_goog = names(fixed_goog)[cols_in_fixed_goog]
# default (no spreading out of Google Hits across multiple hits of quote)
divide_by_factor = 1
for (i in 1:nrow(quantchars))
{
tmp_quotetxt_in_bigmatrix_list = c(quotetxt[i])
tmp_quotetxt_in_bigmatrix = quotetxt[i]
matching_unique_quotetxt_row = get_indices_where_list1_occurs_in_list2(tmp_quotetxt_in_bigmatrix_list, unique_quotes_to_google)
#print(matching_unique_quotetxt_row)
# get number of other guys showing that match in the big list
num_in_biglist_matching = sum(quotes_list_no_extra_spaces_to_google == tmp_quotetxt_in_bigmatrix)
print(num_in_biglist_matching)
# if you do want to change the numbers by dividing equally amongst multiple hits of the quote,
# do so here...
if (divide_by_frequency == TRUE)
{
divide_by_factor = num_in_biglist_matching
} else {
pass = "blah"
}
tmprow = as.numeric(fixed_goog[matching_unique_quotetxt_row, cols_in_fixed_goog]) / divide_by_factor
numchars = nchar(tmp_quotetxt_in_bigmatrix)
numwords = length(strsplit(tmp_quotetxt_in_bigmatrix, " ")[[1]])
tmprow2 = c(as.data.frame(numchars), as.data.frame(numwords), tmprow)
tmpmat = rbind(tmpmat, tmprow2)
}
tmpmat2 = cbind(as.data.frame(rownames), as.data.frame(quotetxt), as.data.frame(tmpmat, row.names=1:nrow(tmpmat)))
outdf = as.data.frame(tmpmat2, row.names=1:nrow(tmpmat2))
names(outdf) = c("rownames", "quotetxt", "numchars", "numwords", names_cols_in_fixed_goog)
quantchar_copy_nums_to_repeated_quotes = outdf
return(quantchar_copy_nums_to_repeated_quotes)
}
# Flatten a list with [[1]], [[2]], ... elements (or a dataframe) into a plain vector
get_items_from_nested_list <- function(nested_list)
{
tmpout = c()
for (i in 1:length(nested_list))
{
item = nested_list[[i]]
tmpout = c(tmpout, item)
}
return(tmpout)
}
# Same, but for a non-nested list (indexes with [i] rather than [[i]])
get_items_from_nonnested_list <- function(nested_list)
{
tmpout = c()
for (i in 1:length(nested_list))
{
item = nested_list[i]
tmpout = c(tmpout, item)
}
return(tmpout)
}
#######################################################
# Compiler switcheroo
#######################################################
junk='
switch_compiler_to_new()
switch_compiler_to_default()
'
# Change the compiler to e.g. g++46
switch_compiler_to_new <- function(old_compiler_symlink="/usr/bin/g++", new_compiler_fn="/my_gcc/bin/g++46", new_R_Makevars="/Users/nickm/.R/Makevars46")
{
# Replace the symlink to the compiler
cmdstr = paste("rm ", old_compiler_symlink, "; ln -s ", new_compiler_fn, " ", old_compiler_symlink, sep="")
system(cmdstr)
# Replace the $HOME/.R/Makevars file
cmdstr = paste("rm ~/.R/Makevars; cp ", new_R_Makevars, " ~/.R/Makevars", sep="")
system(cmdstr)
# Check the compiler
cmdstr2 = "g++ -v"
system(cmdstr2)
return(cmdstr)
}
# Change the compiler back to e.g. default g++42
switch_compiler_to_default <- function(old_compiler_symlink="/usr/bin/g++", new_compiler_fn="/usr/bin/llvm-g++-4.2", new_R_Makevars="/Users/nickm/.R/Makevars_default")
{
#######################################################
# NOTE: ln -s works like this:
#
# ln -s THING_TO_LINK_TO THING_THAT_POINTS_TO_SOMETHING_ELSE
#
#######################################################
# old_compiler_symlink="/usr/bin/g++"; new_compiler_fn="/usr/bin/llvm-g++-4.2"; new_R_Makevars="/Users/nickm/.R/Makevars_default"
# Replace the symlink to the compiler
cmdstr = paste("rm ", old_compiler_symlink, "; ln -s ", new_compiler_fn, " ", old_compiler_symlink, sep="")
system(cmdstr)
# Replace the $HOME/.R/Makevars file
cmdstr = paste("rm ~/.R/Makevars; cp ", new_R_Makevars, " ~/.R/Makevars", sep="")
system(cmdstr)
# Check the compiler
cmdstr2 = "g++ -v"
system(cmdstr2)
return(cmdstr)
}
# Change the gfortran to e.g. 4.6
switch_gfortran_to_new <- function(old_compiler_symlink="/usr/local/bin/gfortran", new_compiler_fn="/usr/local/gfortran/bin/gfortran")
{
# Replace the symlink to the compiler
cmdstr = paste("rm ", old_compiler_symlink, "; ln -s ", new_compiler_fn, " ", old_compiler_symlink, sep="")
system(cmdstr)
# Check the compiler
cmdstr2 = "gfortran -v"
system(cmdstr2)
return(cmdstr)
}
# Change the gfortran back to e.g. default gfortran42
switch_gfortran_to_default <- function(old_compiler_symlink="/usr/local/bin/gfortran", new_compiler_fn="/usr/bin/gfortran-4.2")
{
# Replace the symlink to the compiler
cmdstr = paste("rm ", old_compiler_symlink, "; ln -s ", new_compiler_fn, " ", old_compiler_symlink, sep="")
system(cmdstr)
# Check the compiler
cmdstr2 = "gfortran -v"
system(cmdstr2)
return(cmdstr)
}
# Change default Makevars for install
new_Makevars <- function(new_R_Makevars="/Users/nickm/.R/Makevars_phyRmcmc")
{
# Replace the $HOME/.R/Makevars file
cmdstr = paste("rm ~/.R/Makevars; cp ", new_R_Makevars, " ~/.R/Makevars", sep="")
system(cmdstr)
return(cmdstr)
}
# Change default Makevars for install
old_Makevars <- function(new_R_Makevars="/Users/nickm/.R/Makevars_default")
{
# Replace the $HOME/.R/Makevars file
cmdstr = paste("rm ~/.R/Makevars; cp ", new_R_Makevars, " ~/.R/Makevars", sep="")
system(cmdstr)
return(cmdstr)
}
# Check gcc version
check_gcc <- function(gccstr="/usr/bin/gcc")
{
cmdstr = paste(gccstr, " -v", sep="")
system(cmdstr)
}
# Check g++ version
check_gpp <- function(gccstr="/usr/bin/g++")
{
cmdstr = paste(gccstr, " -v", sep="")
system(cmdstr)
}
# Check gfortran version
# use "which gfortran" to find out where it is
check_gfortran <- function(gccstr="/usr/local/bin/gfortran")
{
cmdstr = paste("which gfortran", sep="")
system(cmdstr)
cmdstr = paste(gccstr, " -v", sep="")
system(cmdstr)
}