benmarwick / grainAnalysis.R
Created December 28, 2011 00:16 — forked from Sharpie/grainAnalysis.R
R code related to graphical analysis of a sieved soil sample.
# Load Data
grainData <- read.csv('grainSize.csv', check.names=F, na.strings='--' )
# Calculate Derived Sample Values
grainData[['Phi Diameter']] <- -log2( grainData[['Grain Diameter']] )
totalWeight <- sum( grainData[['Sample Weight']], na.rm=TRUE )  # na.strings above can yield NAs
grainData[["Percent Retained"]] <- grainData[['Sample Weight']] / totalWeight
grainData[["Cumulative Percent"]] <- cumsum( grainData[["Percent Retained"]] )
grainData[['Percent Finer']] <- 1 - grainData[['Cumulative Percent']]
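To see the grading curve these columns describe, here is a minimal plotting sketch (assumes ggplot2 and scales are installed; not part of the original gist):
# sketch: grading curve, percent finer by weight against phi diameter
library(ggplot2)
library(scales)
ggplot(grainData, aes(x = `Phi Diameter`, y = `Percent Finer`)) +
  geom_line() + geom_point() +
  scale_y_continuous(labels = percent) +
  labs(x = "Phi diameter", y = "Percent finer by weight")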
benmarwick / TAGS_Stats.R
Created January 22, 2012 04:27 — forked from psychemedia/TAGS_Stats.R
Tools for processing and visualising data from a TAGS archive
require(stringr)
require(RCurl)
require(ggplot2)
# query a Google Spreadsheet via the Google Visualization API and return a data frame
gsqAPI <- function(key, query, gid=0){
  read.csv( paste( sep="", 'http://spreadsheets.google.com/tq?', 'tqx=out:csv',
                   '&tq=', curlEscape(query), '&key=', key, '&gid=', gid) )
}
# strip the leading '@' from screen names
trim <- function (x) sub('@','',x)
twCounts <- function(df){
  print("Counting @'d users")
  to.count <- data.frame(table(df$to))
  to.count   # assumed completion: return the counts (the gist preview cuts off here)
}
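A typical call, sketched with a hypothetical spreadsheet key, pulls the archive into a data frame before counting:
# sketch: fetch a TAGS archive and count mentions ('KEY' is a hypothetical spreadsheet key)
df <- gsqAPI(key = 'KEY', query = 'select *')
df$to <- trim(df$to)   # strip '@' so screen names compare cleanly
head(twCounts(df))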
# @author: Michael J Bommarito II
# @date: Feb 20, 2011
# @email: michael.bommarito@gmail.com
# @packages: gridExtra, ggplot2
library(gridExtra)
library(ggplot2)
library(plyr)   # provides ldply(), used by the functions below
setwd('/data/workspace/blog/cn220/')
# candidate numbers of clusters: from 2 up to a quarter of the rows (at least 3)
ks.default <- function(rows) seq(2, max(3, rows %/% 4))
many_kmeans <- function(x, ks = ks.default(nrow(x)), ...) {
ldply(seq_along(ks), function(i) {
cl <- kmeans(x, centers = ks[i], ...)
data.frame(obs = seq_len(nrow(x)), i = i, k = ks[i], cluster = cl$cluster)
})
}
all_hclust <- function(x, ks = ks.default(nrow(x)), point.dist = "euclidean", cluster.dist = "ward") {
  # assumed completion (the preview cuts off here): cut one tree at each candidate k
  cl <- hclust(dist(x, method = point.dist), method = cluster.dist)
  ldply(seq_along(ks), function(i) {
    data.frame(obs = seq_len(nrow(x)), i = i, k = ks[i], cluster = cutree(cl, k = ks[i]))
  })
}
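A sketch of how these helpers might be driven, using a built-in dataset rather than the blog's cn220 data:
# sketch: k-means over a range of k on standardised demo data
x  <- scale(USArrests)
km <- many_kmeans(x, ks = 2:6, nstart = 25)
table(km$k, km$cluster)   # cluster sizes for each candidate k
hc <- all_hclust(x, ks = 2:6)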
benmarwick / ggFactoPlot.R
Created March 20, 2012 18:49
FactoMineR PCA plot with ggplot2
# Plotting the output of FactoMineR's PCA using ggplot2
#
# load libraries
library(FactoMineR)
library(ggplot2)
library(scales)
library(grid)
library(plyr)
library(gridExtra)
#
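The core of the gist's approach, as a self-contained sketch (decathlon ships with FactoMineR; the plot is illustrative, not the gist's full figure):
# sketch: PCA with FactoMineR, individual scores plotted with ggplot2
data(decathlon)
res.pca <- PCA(decathlon[, 1:10], graph = FALSE)
scores  <- data.frame(res.pca$ind$coord)   # Dim.1 ... Dim.5
ggplot(scores, aes(Dim.1, Dim.2, label = rownames(scores))) +
  geom_hline(yintercept = 0, colour = "grey70") +
  geom_vline(xintercept = 0, colour = "grey70") +
  geom_text(size = 3) +
  labs(x = "Dimension 1", y = "Dimension 2")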
benmarwick / battleship_CAseriation.R
Last active April 2, 2019 14:41
ggbattleship - battleship curves with R and ggplot2, and some other methods
# From http://cainarchaeology.weebly.com/r-package-for-seriation-via-ca.html
library(CAseriation)
data("perfect_seriation")
#loads the sample dataset
check.ca.plot(perfect_seriation,1,2)
#plot the Correspondence Analysis scatterplot of the first 2 dimensions in order #to inspect data structure (e.g., seeking for the horseshoe effect)
sort.table(perfect_seriation,1)
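For the ggplot2 battleship curves the title mentions, one sketch is to melt the sorted table to long format and map tile width to frequency (reshape2 is an assumption; the gist may do this differently):
# sketch: battleship-style plot from an incidence table
library(ggplot2)
library(reshape2)
tbl_long <- melt(as.matrix(perfect_seriation), varnames = c("context", "type"))
ggplot(tbl_long, aes(x = type, y = context, width = value / max(value))) +
  geom_tile(height = 0.8, fill = "grey30") +
  labs(x = NULL, y = NULL)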
benmarwick / JSTOR2MALLET.r
Last active January 24, 2018 22:29
R code to take JSTOR DfR wordcount CSV files and convert them to bag-of-words txt files ready for input to MALLET
# set working directory, ie. location of JSTOR DfR CSV
# files on the computer
setwd("C:\\some directory with JSTOR DfR CSV files")
# create a list of all the CSV files (case-insensitive extension match)
myFiles <- list.files(pattern = "\\.csv$", ignore.case = TRUE)
# read in all the CSV files to an R data object
myData <- lapply(myFiles, read.csv)
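From here, each document's counts expand into a bag of words and get written out as txt; a sketch assuming the DfR columns are named WORDCOUNTS (the word) and WEIGHT (its count), so check your files:
# sketch: write one bag-of-words txt per CSV for MALLET
for (i in seq_along(myData)) {
  d   <- myData[[i]]
  bag <- rep(as.character(d$WORDCOUNTS), times = d$WEIGHT)   # repeat each word by its count
  writeLines(paste(bag, collapse = " "),
             sub("\\.csv$", ".txt", myFiles[i], ignore.case = TRUE))
}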
benmarwick / R2MALLET.r
Last active April 12, 2021 10:27
R code to operate MALLET entirely from within R. Set variables, send commands to the Windows command console, and get MALLET's results back into R for further analysis.
# Set working directory
dir <- "C:\\" # adjust to suit
setwd(dir)
# configure variables and filenames for MALLET
## here using MALLET's built-in example data and
## variables from http://programminghistorian.org/lessons/topic-modeling-and-mallet
# folder containing txt files for MALLET to work on
importdir <- "C:\\mallet-2.0.7\\sample-data\\web\\en"
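The rest of the pattern is pasting a command string and sending it to the Windows console; a sketch with illustrative paths and the import options from the Programming Historian lesson linked above:
# sketch: build a MALLET import command and run it with shell()
MALLET_HOME <- "C:\\mallet-2.0.7"   # adjust to suit
cmd <- paste("bin\\mallet import-dir --input", importdir,
             "--output tutorial.mallet --keep-sequence --remove-stopwords")
shell(paste0("cd ", MALLET_HOME, " && ", cmd))   # Windows-only; use system() elsewhere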
benmarwick / parallel-topicmodels.r
Last active March 18, 2019 00:09
Speed test of different methods of parallel processing to generate topic models with different numbers of topics. Coded for a single Windows 7 laptop with a four-core processor (i.e., not a networked cluster) and data from the topicmodels package.
# Speed tests of different parallel and non-parallel methods
# for iterating over different numbers of topics with
# topicmodels
# clear workspace and stop any previous cluster instances
rm(list = ls(all.names = TRUE))
gc()
library(snowfall)   # provides sfStop() and the sf* cluster functions used below
sfStop()
library(topicmodels)
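One of the methods under test, snowfall (hence the sfStop() above), looks roughly like this as a sketch; AssociatedPress ships with topicmodels, and cpus = 4 matches the four-core laptop:
# sketch: fit LDA models for several values of k in parallel with snowfall
data("AssociatedPress", package = "topicmodels")
dtm <- AssociatedPress[1:500, ]        # small slice to keep the demo quick
ks  <- c(10, 20, 30, 40)
sfInit(parallel = TRUE, cpus = 4)      # one worker per core
sfLibrary(topicmodels)
sfExport("dtm")
models <- sfLapply(ks, function(k) LDA(dtm, k = k))
sfStop()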