# Ben Marwick benmarwick

## grainAnalysis.R
# Read the grain-size table. Headers are kept verbatim (check.names off)
# because the column names used below contain spaces; '--' entries are
# read as NA.
grainData <- read.csv(
  'grainSize.csv',
  check.names = FALSE,
  na.strings = '--'
)

# Derived columns, added in this order:
#   Phi Diameter       - negative base-2 log of the grain diameter
#   Percent Retained   - each row's share of the total sample weight,
#                        stored as a fraction (not multiplied by 100)
#   Cumulative Percent - running sum of Percent Retained
#   Percent Finer      - complement of Cumulative Percent
# sum() is called without na.rm, so an NA weight makes totalWeight (and
# every value derived from it) NA.
grainData[['Phi Diameter']] <- -log2(grainData[['Grain Diameter']])
totalWeight <- sum(grainData[['Sample Weight']])
grainData[["Percent Retained"]] <- grainData[['Sample Weight']] / totalWeight
grainData[["Cumulative Percent"]] <- cumsum(grainData[["Percent Retained"]])
grainData[['Percent Finer']] <- 1 - grainData[['Cumulative Percent']]

## TAGS_Stats.R
require(stringr)
require(RCurl)
require(ggplot2)
# Run a query against a Google Spreadsheet and return the CSV reply as a
# data frame.
#   key   - spreadsheet key, placed in the URL unchanged
#   query - query text; passed through curlEscape() before being sent
#   gid   - sheet id within the spreadsheet (default 0)
gsqAPI <- function(key, query, gid = 0) {
  request_url <- paste0(
    'http://spreadsheets.google.com/tq?',
    'tqx=out:csv',
    '&tq=', curlEscape(query),
    '&key=', key,
    '&gid=', gid
  )
  read.csv(request_url)
}

# Drop the first '@' found in each element of x; any later '@' is kept.
trim <- function(x) {
  sub(pattern = '@', replacement = '', x = x)
}

twCounts=function(df){
	print("Counting @'d users")
    to.count=data.frame(table(df$to))

## analyzeCN220.r
# @author: Michael J Bommarito II
# @date: Feb 20, 2011
# @email: michael.bommarito@gmail.com
# @packages: gridExtra, ggplot2

library(gridExtra)
library(ggplot2)

# Switch the working directory to the project folder. The path is absolute,
# so this call fails on any machine where that directory does not exist.
setwd('/data/workspace/blog/cn220/')

## clustergram-had.r
# Default cluster counts to try for a data set with `rows` observations:
# the integers from 2 up to rows %/% 4, with the upper end never below 3.
ks.default <- function(rows) {
  upper <- max(3, rows %/% 4)
  seq(2, upper)
}

# Fit one kmeans() solution per entry of `ks` and combine the assignments
# with ldply().
#   x   - data handed to kmeans()
#   ks  - cluster counts to try; defaults to ks.default(nrow(x))
#   ... - forwarded to kmeans()
# Each fit contributes one row per observation with columns:
#   obs (row index in x), i (position in ks), k (cluster count),
#   cluster (assignment reported by kmeans()).
many_kmeans <- function(x, ks = ks.default(nrow(x)), ...) {
  fit_one <- function(i) {
    k <- ks[i]
    fit <- kmeans(x, centers = k, ...)
    data.frame(obs = seq_len(nrow(x)), i = i, k = k, cluster = fit$cluster)
  }
  ldply(seq_along(ks), fit_one)
}

all_hclust <- function(x, ks = ks.default(nrow(x)), point.dist = "euclidean", cluster.dist = "ward") {

## clustergram-had.r
# Default cluster counts to try for a data set with `rows` observations:
# the integers from 2 up to rows %/% 4, with the upper end never below 3.
ks.default <- function(rows) {
  upper <- max(3, rows %/% 4)
  seq(2, upper)
}

# Fit one kmeans() solution per entry of `ks` and combine the assignments
# with ldply().
#   x   - data handed to kmeans()
#   ks  - cluster counts to try; defaults to ks.default(nrow(x))
#   ... - forwarded to kmeans()
# Each fit contributes one row per observation with columns:
#   obs (row index in x), i (position in ks), k (cluster count),
#   cluster (assignment reported by kmeans()).
many_kmeans <- function(x, ks = ks.default(nrow(x)), ...) {
  fit_one <- function(i) {
    k <- ks[i]
    fit <- kmeans(x, centers = k, ...)
    data.frame(obs = seq_len(nrow(x)), i = i, k = k, cluster = fit$cluster)
  }
  ldply(seq_along(ks), fit_one)
}

all_hclust <- function(x, ks = ks.default(nrow(x)), point.dist = "euclidean", cluster.dist = "ward") {

## sqldf_examples.R
library(sqldf)

# Average of `temp` for each `day` in the beaver2 table, computed with a SQL
# GROUP BY; the result column is named avg_temp.
sqldf("SELECT
        day
        , avg(temp) as avg_temp
      FROM beaver2
      GROUP BY
        day;")

#   day avg_temp

## readtps.R
read.tps = function(data) {
  # Reads the .tps file format produced by TPSDIG
  # (http://life.bio.sunysb.edu/morph/) into a single data frame
  # USAGE: R> read.tps("filename.tps")
  a = readLines(data) # so we can do some searching and indexing
  LM = grep("LM", a) # find the line numbers for LM
  ID.ind = grep("ID", a) # find the line numbers for ID
  # and the ID values, SCALE values, and image names
  ID = gsub("(ID=)(.*)", "\\2", grep("ID", a, value=T))
  SCALE = gsub("(SCALE=)(.*)", "\\2", grep("SCALE", a, value=T))

## d3SimpleNetwork.R
#' An R function for creating simple D3 javascript directed network graphs.
#'
#' d3SimpleNetwork creates simple D3 javascript network graphs.
#'
#' @param data a data frame object with three columns. The first two are the names of the linked units. The third records an edge value. (Currently the third column doesn't affect the graph.)
#' @param Source character string naming the network source variable in the data frame. If \code{Source = NULL} then the first column of the data frame is treated as the source.
#' @param Target character string naming the network target variable in the data frame. If \code{Target = NULL} then the second column of the data frame is treated as the target.
#' @param height numeric height for the network graph's frame area.
#' @param width numeric width for the network graph's frame area.
#' @param file a character string of the file name to save the resulting graph. If a file name is given a standalone webpage is created, i.e. with a header and footer. If \code{file = NULL} then

## RSA_geoarch.r
# Get data from a Google Sheet that has been published to the web as CSV.
# library() stops with an error if RCurl is not installed, instead of
# returning FALSE and failing later at getURL().
library(RCurl)
# Point libcurl at the CA bundle shipped with RCurl. cacert.pem is a single
# bundle file, so it is passed as `cainfo`; `capath` expects a directory.
# NOTE(review): ssl.verifypeer = FALSE switches off certificate verification
# for RCurl requests that use these default options. It is kept as in the
# original; confirm whether it is still needed.
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE))
# In the Google spreadsheet: File -> Publish to the web -> get link to
# publish to web -> get the CSV link.
goog <- "https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldFRsVi1VZ2EyNXJ1ZEV5SG5GSExwRHc&single=true&gid=5&output=csv"
# Parse the downloaded CSV text directly. `text =` lets read.csv manage its
# own connection, so no open textConnection is left behind.
data <- read.csv(text = getURL(goog), stringsAsFactors = FALSE)

# extract just data for plotting: pH, SOM, CaCO3, MS-LF, MS-FD
plotting_data <- na.omit(data[,c('Sample.number',

## lsa_mds.R
# load required libraries
library(tm)
library(ggplot2)
library(lsa)

# 1. Prepare mock data
text <- c("transporting food by cars will cause global warming. so we should go local.",
          "we should try to convince our parents to stop using cars because it will cause global warming.",
          "some food, such as mongo, requires a warm weather to grow. so they have to be transported to canada.",
          "a typical electronic circuit can be built with a battery, a bulb, and a switch.",
	# Read the grain-size table. Headers are kept verbatim (check.names off)
	# because the column names used below contain spaces; '--' entries are
	# read as NA.
	grainData <- read.csv(
	  'grainSize.csv',
	  check.names = FALSE,
	  na.strings = '--'
	)

	# Derived columns, added in this order:
	#   Phi Diameter       - negative base-2 log of the grain diameter
	#   Percent Retained   - each row's share of the total sample weight,
	#                        stored as a fraction (not multiplied by 100)
	#   Cumulative Percent - running sum of Percent Retained
	#   Percent Finer      - complement of Cumulative Percent
	# sum() is called without na.rm, so an NA weight makes totalWeight (and
	# every value derived from it) NA.
	grainData[['Phi Diameter']] <- -log2(grainData[['Grain Diameter']])
	totalWeight <- sum(grainData[['Sample Weight']])
	grainData[["Percent Retained"]] <- grainData[['Sample Weight']] / totalWeight
	grainData[["Cumulative Percent"]] <- cumsum(grainData[["Percent Retained"]])
	grainData[['Percent Finer']] <- 1 - grainData[['Cumulative Percent']]
	require(stringr)
	require(RCurl)
	require(ggplot2)
	# Run a query against a Google Spreadsheet and return the CSV reply as a
	# data frame.
	#   key   - spreadsheet key, placed in the URL unchanged
	#   query - query text; passed through curlEscape() before being sent
	#   gid   - sheet id within the spreadsheet (default 0)
	gsqAPI <- function(key, query, gid = 0) {
	  request_url <- paste0(
	    'http://spreadsheets.google.com/tq?',
	    'tqx=out:csv',
	    '&tq=', curlEscape(query),
	    '&key=', key,
	    '&gid=', gid
	  )
	  read.csv(request_url)
	}

	# Drop the first '@' found in each element of x; any later '@' is kept.
	trim <- function(x) {
	  sub(pattern = '@', replacement = '', x = x)
	}

	twCounts=function(df){
	print("Counting @'d users")
	to.count=data.frame(table(df$to))
	# @author: Michael J Bommarito II
	# @date: Feb 20, 2011
	# @email: michael.bommarito@gmail.com
	# @packages: gridExtra, ggplot2

	library(gridExtra)
	library(ggplot2)

	# Switch the working directory to the project folder. The path is absolute,
	# so this call fails on any machine where that directory does not exist.
	setwd('/data/workspace/blog/cn220/')
	# Default cluster counts to try for a data set with `rows` observations:
	# the integers from 2 up to rows %/% 4, with the upper end never below 3.
	ks.default <- function(rows) {
	  upper <- max(3, rows %/% 4)
	  seq(2, upper)
	}

	# Fit one kmeans() solution per entry of `ks` and combine the assignments
	# with ldply().
	#   x   - data handed to kmeans()
	#   ks  - cluster counts to try; defaults to ks.default(nrow(x))
	#   ... - forwarded to kmeans()
	# Each fit contributes one row per observation with columns:
	#   obs (row index in x), i (position in ks), k (cluster count),
	#   cluster (assignment reported by kmeans()).
	many_kmeans <- function(x, ks = ks.default(nrow(x)), ...) {
	  fit_one <- function(i) {
	    k <- ks[i]
	    fit <- kmeans(x, centers = k, ...)
	    data.frame(obs = seq_len(nrow(x)), i = i, k = k, cluster = fit$cluster)
	  }
	  ldply(seq_along(ks), fit_one)
	}

	all_hclust <- function(x, ks = ks.default(nrow(x)), point.dist = "euclidean", cluster.dist = "ward") {
	library(sqldf)

	# Average of `temp` for each `day` in the beaver2 table, computed with a SQL
	# GROUP BY; the result column is named avg_temp.
	sqldf("SELECT
	day
	, avg(temp) as avg_temp
	FROM beaver2
	GROUP BY
	day;")

	# day avg_temp
	read.tps = function(data) {
	# Reads the .tps file format produced by TPSDIG
	# (http://life.bio.sunysb.edu/morph/) into a single data frame
	# USAGE: R> read.tps("filename.tps")
	a = readLines(data) # so we can do some searching and indexing
	LM = grep("LM", a) # find the line numbers for LM
	ID.ind = grep("ID", a) # find the line numbers for ID
	# and the ID values, SCALE values, and image names
	ID = gsub("(ID=)(.*)", "\\2", grep("ID", a, value=T))
	SCALE = gsub("(SCALE=)(.*)", "\\2", grep("SCALE", a, value=T))
	#' An R function for creating simple D3 javascript directed network graphs.
	#'
	#' d3SimpleNetwork creates simple D3 javascript network graphs.
	#'
	#' @param data a data frame object with three columns. The first two are the names of the linked units. The third records an edge value. (Currently the third column doesn't affect the graph.)
	#' @param Source character string naming the network source variable in the data frame. If \code{Source = NULL} then the first column of the data frame is treated as the source.
	#' @param Target character string naming the network target variable in the data frame. If \code{Target = NULL} then the second column of the data frame is treated as the target.
	#' @param height numeric height for the network graph's frame area.
	#' @param width numeric width for the network graph's frame area.
	#' @param file a character string of the file name to save the resulting graph. If a file name is given a standalone webpage is created, i.e. with a header and footer. If \code{file = NULL} then
	# Get data from a Google Sheet that has been published to the web as CSV.
	# library() stops with an error if RCurl is not installed, instead of
	# returning FALSE and failing later at getURL().
	library(RCurl)
	# Point libcurl at the CA bundle shipped with RCurl. cacert.pem is a single
	# bundle file, so it is passed as `cainfo`; `capath` expects a directory.
	# NOTE(review): ssl.verifypeer = FALSE switches off certificate verification
	# for RCurl requests that use these default options. It is kept as in the
	# original; confirm whether it is still needed.
	options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"), ssl.verifypeer = FALSE))
	# In the Google spreadsheet: File -> Publish to the web -> get link to
	# publish to web -> get the CSV link.
	goog <- "https://docs.google.com/spreadsheet/pub?key=0As7CmPqGXTzldFRsVi1VZ2EyNXJ1ZEV5SG5GSExwRHc&single=true&gid=5&output=csv"
	# Parse the downloaded CSV text directly. `text =` lets read.csv manage its
	# own connection, so no open textConnection is left behind.
	data <- read.csv(text = getURL(goog), stringsAsFactors = FALSE)

	# extract just data for plotting: pH, SOM, CaCO3, MS-LF, MS-FD
	plotting_data <- na.omit(data[,c('Sample.number',
	# load required libraries
	library(tm)
	library(ggplot2)
	library(lsa)

	# 1. Prepare mock data
	text <- c("transporting food by cars will cause global warming. so we should go local.",
	"we should try to convince our parents to stop using cars because it will cause global warming.",
	"some food, such as mongo, requires a warm weather to grow. so they have to be transported to canada.",
	"a typical electronic circuit can be built with a battery, a bulb, and a switch.",