benmarwick / grainAnalysis.R
Created December 28, 2011 00:16 — forked from Sharpie/grainAnalysis.R
R code related to graphical analysis of a sieved soil sample.
# Load Data
grainData <- read.csv('grainSize.csv', check.names=F, na.strings='--' )
# Calculate Derived Sample Values
grainData[['Phi Diameter']] <- -log2( grainData[['Grain Diameter']] )
totalWeight <- sum( grainData[['Sample Weight']], na.rm=TRUE )  # na.strings above can yield NAs
grainData[["Percent Retained"]] <- grainData[['Sample Weight']] / totalWeight
grainData[["Cumulative Percent"]] <- cumsum( grainData[["Percent Retained"]] )
grainData[['Percent Finer']] <- 1 - grainData[['Cumulative Percent']]
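To see the grading curve these columns describe, here is a minimal plotting sketch (assumes ggplot2 and scales are installed; not part of the original gist):
# sketch: grading curve, percent finer by weight against phi diameter
library(ggplot2)
library(scales)
ggplot(grainData, aes(x = `Phi Diameter`, y = `Percent Finer`)) +
  geom_line() + geom_point() +
  scale_y_continuous(labels = percent) +
  labs(x = "Phi diameter", y = "Percent finer by weight")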
benmarwick / TAGS_Stats.R
Created January 22, 2012 04:27 — forked from psychemedia/TAGS_Stats.R
Tools for processing and visualising data from a TAGS archive
require(stringr)
require(RCurl)
require(ggplot2)
# query a Google Spreadsheet via the Google Visualization API and return a data frame
gsqAPI <- function(key, query, gid=0){
  read.csv( paste( sep="", 'http://spreadsheets.google.com/tq?', 'tqx=out:csv',
                   '&tq=', curlEscape(query), '&key=', key, '&gid=', gid) )
}
# strip the leading '@' from screen names
trim <- function (x) sub('@','',x)
twCounts <- function(df){
  print("Counting @'d users")
  to.count <- data.frame(table(df$to))
  to.count   # assumed completion: return the counts (the gist preview cuts off here)
}
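A typical call, sketched with a hypothetical spreadsheet key, pulls the archive into a data frame before counting:
# sketch: fetch a TAGS archive and count mentions ('KEY' is a hypothetical spreadsheet key)
df <- gsqAPI(key = 'KEY', query = 'select *')
df$to <- trim(df$to)   # strip '@' so screen names compare cleanly
head(twCounts(df))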
# @author: Michael J Bommarito II
# @date: Feb 20, 2011
# @email: michael.bommarito@gmail.com
# @packages: gridExtra, ggplot2
library(gridExtra)
library(ggplot2)
library(plyr)   # provides ldply(), used by the functions below
setwd('/data/workspace/blog/cn220/')
# candidate numbers of clusters: from 2 up to a quarter of the rows (at least 3)
ks.default <- function(rows) seq(2, max(3, rows %/% 4))
many_kmeans <- function(x, ks = ks.default(nrow(x)), ...) {
ldply(seq_along(ks), function(i) {
cl <- kmeans(x, centers = ks[i], ...)
data.frame(obs = seq_len(nrow(x)), i = i, k = ks[i], cluster = cl$cluster)
})
}
all_hclust <- function(x, ks = ks.default(nrow(x)), point.dist = "euclidean", cluster.dist = "ward") {
  # assumed completion (the preview cuts off here): cut one tree at each candidate k
  cl <- hclust(dist(x, method = point.dist), method = cluster.dist)
  ldply(seq_along(ks), function(i) {
    data.frame(obs = seq_len(nrow(x)), i = i, k = ks[i], cluster = cutree(cl, k = ks[i]))
  })
}
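A sketch of how these helpers might be driven, using a built-in dataset rather than the blog's cn220 data:
# sketch: k-means over a range of k on standardised demo data
x  <- scale(USArrests)
km <- many_kmeans(x, ks = 2:6, nstart = 25)
table(km$k, km$cluster)   # cluster sizes for each candidate k
hc <- all_hclust(x, ks = 2:6)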
benmarwick / ggFactoPlot.R
Created March 20, 2012 18:49
FactoMineR PCA plot with ggplot2
# Plotting the output of FactoMineR's PCA using ggplot2
#
# load libraries
library(FactoMineR)
library(ggplot2)
library(scales)
library(grid)
library(plyr)
library(gridExtra)
#
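The core of the gist's approach, as a self-contained sketch (decathlon ships with FactoMineR; the plot is illustrative, not the gist's full figure):
# sketch: PCA with FactoMineR, individual scores plotted with ggplot2
data(decathlon)
res.pca <- PCA(decathlon[, 1:10], graph = FALSE)
scores  <- data.frame(res.pca$ind$coord)   # Dim.1 ... Dim.5
ggplot(scores, aes(Dim.1, Dim.2, label = rownames(scores))) +
  geom_hline(yintercept = 0, colour = "grey70") +
  geom_vline(xintercept = 0, colour = "grey70") +
  geom_text(size = 3) +
  labs(x = "Dimension 1", y = "Dimension 2")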
benmarwick / battleship_CAseriation.R
Last active April 2, 2019 14:41
ggbattleship - battleship curves with R and ggplot2, and some other methods
# From http://cainarchaeology.weebly.com/r-package-for-seriation-via-ca.html
library(CAseriation)
data("perfect_seriation")
#loads the sample dataset
check.ca.plot(perfect_seriation,1,2)
#plot the Correspondence Analysis scatterplot of the first 2 dimensions in order #to inspect data structure (e.g., seeking for the horseshoe effect)
sort.table(perfect_seriation,1)
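For the ggplot2 battleship curves the title mentions, one sketch is to melt the sorted table to long format and map tile width to frequency (reshape2 is an assumption; the gist may do this differently):
# sketch: battleship-style plot from an incidence table
library(ggplot2)
library(reshape2)
tbl_long <- melt(as.matrix(perfect_seriation), varnames = c("context", "type"))
ggplot(tbl_long, aes(x = type, y = context, width = value / max(value))) +
  geom_tile(height = 0.8, fill = "grey30") +
  labs(x = NULL, y = NULL)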
benmarwick / JSTOR2MALLET.r
Last active January 24, 2018 22:29
R code to take JSTOR DfR wordcount CSV files and convert them to bag-of-words txt files ready for input to MALLET
# set working directory, ie. location of JSTOR DfR CSV
# files on the computer
setwd("C:\\some directory with JSTOR DfR CSV files")
# create a list of all the CSV files (case-insensitive extension match)
myFiles <- list.files(pattern = "\\.csv$", ignore.case = TRUE)
# read in all the CSV files to an R data object
myData <- lapply(myFiles, read.csv)
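From here, each document's counts expand into a bag of words and get written out as txt; a sketch assuming the DfR columns are named WORDCOUNTS (the word) and WEIGHT (its count), so check your files:
# sketch: write one bag-of-words txt per CSV for MALLET
for (i in seq_along(myData)) {
  d   <- myData[[i]]
  bag <- rep(as.character(d$WORDCOUNTS), times = d$WEIGHT)   # repeat each word by its count
  writeLines(paste(bag, collapse = " "),
             sub("\\.csv$", ".txt", myFiles[i], ignore.case = TRUE))
}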
benmarwick / R2MALLET.r
Last active April 12, 2021 10:27
R code to operate MALLET entirely from within R. Set variables, send commands to the Windows command console, and get MALLET's results back into R for further analysis.
# Set working directory
dir <- "C:\\" # adjust to suit
setwd(dir)
# configure variables and filenames for MALLET
## here using MALLET's built-in example data and
## variables from http://programminghistorian.org/lessons/topic-modeling-and-mallet
# folder containing txt files for MALLET to work on
importdir <- "C:\\mallet-2.0.7\\sample-data\\web\\en"
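The rest of the pattern is pasting a command string and sending it to the Windows console; a sketch with illustrative paths and the import options from the Programming Historian lesson linked above:
# sketch: build a MALLET import command and run it with shell()
MALLET_HOME <- "C:\\mallet-2.0.7"   # adjust to suit
cmd <- paste("bin\\mallet import-dir --input", importdir,
             "--output tutorial.mallet --keep-sequence --remove-stopwords")
shell(paste0("cd ", MALLET_HOME, " && ", cmd))   # Windows-only; use system() elsewhere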
benmarwick / parallel-topicmodels.r
Last active March 18, 2019 00:09
Speed test of different methods of parallel processing to generate topic models with different numbers of topics. Coded for a single Windows 7 laptop with a four-core processor (i.e., not a networked cluster) and data from the topicmodels package.
# Speed tests of different parallel and non-parallel methods
# for iterating over different numbers of topics with
# topicmodels
# clear workspace and stop any previous cluster instances
rm(list = ls(all.names = TRUE))
gc()
library(snowfall)   # provides sfStop() and the sf* cluster functions used below
sfStop()
library(topicmodels)
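One of the methods under test, snowfall (hence the sfStop() above), looks roughly like this as a sketch; AssociatedPress ships with topicmodels, and cpus = 4 matches the four-core laptop:
# sketch: fit LDA models for several values of k in parallel with snowfall
data("AssociatedPress", package = "topicmodels")
dtm <- AssociatedPress[1:500, ]        # small slice to keep the demo quick
ks  <- c(10, 20, 30, 40)
sfInit(parallel = TRUE, cpus = 4)      # one worker per core
sfLibrary(topicmodels)
sfExport("dtm")
models <- sfLapply(ks, function(k) LDA(dtm, k = k))
sfStop()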