inkhorn

## scotland.R
# ****Introduction****

# Data analysis is like an interview.  In any interview, the interviewer hopes to use a series of
# questions in order to discover a story.  The questions the interviewer asks, of course, are
# subjectively chosen.  As such, the story that one interviewer gets out of an interviewee might
# be fairly different from the story that another interviewer gets out of the same person.  In the
# same way, the commands (and thus the analysis) below are not the only way of analyzing the data.
# When you understand what the commands are doing, you might decide to take a different approach
# to analyzing the data.  Please do so, and be sure to share what you find!

## process recipes.py
import os

# Collect the full text of every file in the current directory whose name
# contains '.txt'.
# The recipes come in 3 txt files consisting of 1 recipe per line, the
# cuisine of the recipe as the first entry in the line, and all subsequent
# ingredient entries separated by a tab.
rfiles = os.listdir('.')
rc = []
for f in rfiles:
    if '.txt' in f:
        # The context manager closes each handle as soon as it has been read,
        # instead of leaving one open file per recipe file.
        with open(f, 'r') as infile:
            rc.append(infile.read())

## first nations libraries.r
library(plyr)
library(ggplot2)
library(ggmap)

# Load the 2010 Ontario library statistics and add a 0/1 indicator column that
# is 1 where the service type is "First Nations Library".
# NOTE(review): isFN is appended after the existing columns; confirm it does
# not fall inside the 143:265 range that is written below.
libraries = read.csv(file = "ontario_library_stats_2010.csv")
libraries[["isFN"]] = ifelse(libraries$Library.Service.Type == "First Nations Library", 1, 0)

# 'Proportionate' versions of the variables: each of columns 20-142 divided by
# column 13, stored in columns 143-265 and named after the source column with
# a ".P" suffix.
libraries[, 143:265] = sapply(libraries[, 20:142], "/", libraries[, 13])
names(libraries)[143:265] = paste0(names(libraries)[20:142], ".P")

## cellphone analysis.R
library(jsonlite)

# Parse the cell phone records from JSON into a data frame.
cp = fromJSON(txt = "Cell Phone Data.txt", simplifyDataFrame = TRUE)

# Positions of the columns that get converted to numeric.
num.atts = c(4, 9, 11:16, 18, 22)

cp[, num.atts] = sapply(cp[, num.atts], as.numeric)

# Vertical pixel count divided by horizontal pixel count.
cp$aspect.ratio = cp$att_pixels_y / cp$att_pixels_x

# "Yes" when the name matches smart/iphone/blackberry (ignoring case) or the
# screen size is at least 4; "No" otherwise.
cp$isSmartPhone = ifelse(
  grepl("smart|iphone|blackberry", cp$name, ignore.case = TRUE) | cp$att_screen_size >= 4,
  "Yes",
  "No"
)

## recipe analysis.R
# Read the combined recipe file, one element per line.
recipes = readLines('recipes combined.tsv')

# Once read into R, the \t (tab) characters are replaced with spaces so that
# the text is more acceptable to the tm package.  gsub is already vectorized
# over its input, so no row-wise apply over a matrix is needed (and an empty
# file simply yields an empty character vector).
recipes.new = gsub('\t', ' ', recipes)

# Build a corpus with one document per line, then its document-term matrix.
# NOTE(review): Corpus/VectorSource/DocumentTermMatrix presumably come from
# tm; no library(tm) call is visible in this snippet -- confirm it is loaded.
recipes.corpus = Corpus(VectorSource(recipes.new))
recipes.dtm = DocumentTermMatrix(recipes.corpus)

## gist:2151594
# Assuming the input is a stored binomial GLM object
Concordance = function(GLM.binomial) {
  outcome_and_fitted_col = cbind(GLM.binomial$y, GLM.binomial$fitted.values)
  # get a subset of outcomes where the event actually happened
  ones = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 1,]
  # get a subset of outcomes where the event didn't actually happen
  zeros = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 0,]
  # Equate the length of the event and non-event tables
  if (length(ones[,1])>length(zeros[,1])) {ones = ones[1:length(zeros[,1]),]}
    else {zeros = zeros[1:length(ones[,1]),]}

## lengthby.r
# Count the non-missing entries of y within each group defined by x.
#   y: values to count; NA entries are not counted
#   x: grouping passed straight to tapply as its INDEX argument
# Returns the per-group counts as produced by tapply.
LengthBy = function(y, x) {
  is.observed = !is.na(y)
  tapply(is.observed, x, sum)
}

## dfsample.r
# Draw n rows at random (without replacement) from a data frame.
#   df.in: data frame to sample from
#   n:     number of rows to draw; sample() raises an error when n is larger
#          than the number of rows available
# Returns a data frame holding the sampled rows.
df.sample = function(df.in, n) {
  picked.rows = sample(nrow(df.in), size = n)
  # drop = FALSE keeps a single-column input as a data frame; without it the
  # result silently collapses to a bare vector.
  return(df.in[picked.rows, , drop = FALSE])
}

## df_sample_exIDs.r
# Draw n random rows from main.df, skipping every row whose ID already appears
# in an earlier sample.
# This function assumes that you're going to input ID1.name and ID2.name as strings.
#   main.df:    data frame to sample from
#   sample1.df: earlier sample whose IDs are excluded
#   n:          number of rows to draw
#   ID1.name:   name of the ID column in main.df
#   ID2.name:   name of the ID column in sample1.df
# Returns a data frame of n rows of main.df whose ID1.name values do not occur
# in sample1.df[, ID2.name].
df.sample.exIDs = function(main.df, sample1.df, n, ID1.name, ID2.name) {
  already.sampled = main.df[, ID1.name] %in% sample1.df[, ID2.name]
  # drop = FALSE on both subsets keeps a single-column main.df as a data frame
  # instead of collapsing it to a vector (which would make nrow() return NULL).
  main.ID1.notin.ID2 = main.df[!already.sampled, , drop = FALSE]
  sample2.df = main.ID1.notin.ID2[sample(nrow(main.ID1.notin.ID2), size = n), , drop = FALSE]
  return(sample2.df)
}

## fmatchresults
Call:
glm(formula = Probable.Match. ~ First.Name.Match + Spouse.First.Name.Match:Spouse.Last.Name.Match +
    Parenthetical + Ampersand, family = binomial(logit), data = fuzzy.matching)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-2.9371  -0.2437  -0.1136  -0.0462   3.3885

Coefficients:
                                               Estimate Std. Error z value Pr(>|z|)
	# **Introduction**

	# Data analysis is like an interview. In any interview, the interviewer hopes to use a series of
	# questions in order to discover a story. The questions the interviewer asks, of course, are
	# subjectively chosen. As such, the story that one interviewer gets out of an interviewee might
	# be fairly different from the story that another interviewer gets out of the same person. In the
	# same way, the commands (and thus the analysis) below are not the only way of analyzing the data.
	# When you understand what the commands are doing, you might decide to take a different approach
	# to analyzing the data. Please do so, and be sure to share what you find!
	import os

	# Collect the full text of every file in the current directory whose name
	# contains '.txt'.
	# The recipes come in 3 txt files consisting of 1 recipe per line, the
	# cuisine of the recipe as the first entry in the line, and all subsequent
	# ingredient entries separated by a tab.
	rfiles = os.listdir('.')
	rc = []
	for f in rfiles:
	    if '.txt' in f:
	        # The context manager closes each handle as soon as it has been read.
	        with open(f, 'r') as infile:
	            rc.append(infile.read())
	library(plyr)
	library(ggplot2)
	library(ggmap)

	# Load the 2010 Ontario library statistics and add a 0/1 indicator column that
	# is 1 where the service type is "First Nations Library".
	# NOTE(review): isFN is appended after the existing columns; confirm it does
	# not fall inside the 143:265 range that is written below.
	libraries = read.csv(file = "ontario_library_stats_2010.csv")
	libraries[["isFN"]] = ifelse(libraries$Library.Service.Type == "First Nations Library", 1, 0)

	# 'Proportionate' versions of the variables: each of columns 20-142 divided by
	# column 13, stored in columns 143-265 and named after the source column with
	# a ".P" suffix.
	libraries[, 143:265] = sapply(libraries[, 20:142], "/", libraries[, 13])
	names(libraries)[143:265] = paste0(names(libraries)[20:142], ".P")
	library(jsonlite)

	# Parse the cell phone records from JSON into a data frame.
	cp = fromJSON(txt = "Cell Phone Data.txt", simplifyDataFrame = TRUE)

	# Positions of the columns that get converted to numeric.
	num.atts = c(4,9,11,12,13,14,15,16,18,22)

	cp[,num.atts] = sapply(cp[,num.atts], function (x) as.numeric(x))

	# Vertical pixel count divided by horizontal pixel count.
	cp$aspect.ratio = cp$att_pixels_y / cp$att_pixels_x

	# "Yes" when the name matches smart/iphone/blackberry (ignoring case) or the
	# screen size is at least 4; "No" otherwise.  The pipes are plain `|`: a
	# backslash-escaped pipe is not a valid R string escape or operator.
	cp$isSmartPhone = ifelse(grepl("smart|iphone|blackberry", cp$name, ignore.case=TRUE) == TRUE | cp$att_screen_size >= 4, "Yes", "No")
	# Read the combined recipe file, one element per line.
	recipes = readLines('recipes combined.tsv')

	# Once read into R, the \t (tab) characters are replaced with spaces so that
	# the text is more acceptable to the tm package.  gsub is already vectorized
	# over its input, so no row-wise apply over a matrix is needed.
	recipes.new = gsub('\t', ' ', recipes)

	# Build a corpus with one document per line, then its document-term matrix.
	# NOTE(review): Corpus/VectorSource/DocumentTermMatrix presumably come from
	# tm; no library(tm) call is visible in this snippet -- confirm it is loaded.
	recipes.corpus = Corpus(VectorSource(recipes.new))
	recipes.dtm = DocumentTermMatrix(recipes.corpus)
	# Assuming the input is a stored binomial GLM object
	Concordance = function(GLM.binomial) {
	outcome_and_fitted_col = cbind(GLM.binomial$y, GLM.binomial$fitted.values)
	# get a subset of outcomes where the event actually happened
	ones = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 1,]
	# get a subset of outcomes where the event didn't actually happen
	zeros = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 0,]
	# Equate the length of the event and non-event tables
	if (length(ones[,1])>length(zeros[,1])) {ones = ones[1:length(zeros[,1]),]}
	else {zeros = zeros[1:length(ones[,1]),]}
	# Draw n rows at random (without replacement) from a data frame.
	#   df.in: data frame to sample from
	#   n:     number of rows to draw; sample() raises an error when n is larger
	#          than the number of rows available
	# Returns a data frame holding the sampled rows.
	df.sample = function(df.in, n) {
	  picked.rows = sample(nrow(df.in), size = n)
	  # drop = FALSE keeps a single-column input as a data frame; without it the
	  # result silently collapses to a bare vector.
	  return(df.in[picked.rows, , drop = FALSE])
	}
	# Draw n random rows from main.df, skipping every row whose ID already appears
	# in an earlier sample.
	# This function assumes that you're going to input ID1.name and ID2.name as strings.
	#   main.df:    data frame to sample from
	#   sample1.df: earlier sample whose IDs are excluded
	#   n:          number of rows to draw
	#   ID1.name:   name of the ID column in main.df
	#   ID2.name:   name of the ID column in sample1.df
	# Returns a data frame of n rows of main.df whose ID1.name values do not occur
	# in sample1.df[, ID2.name].
	df.sample.exIDs = function(main.df, sample1.df, n, ID1.name, ID2.name) {
	  already.sampled = main.df[, ID1.name] %in% sample1.df[, ID2.name]
	  # drop = FALSE on both subsets keeps a single-column main.df as a data frame
	  # instead of collapsing it to a vector (which would make nrow() return NULL).
	  main.ID1.notin.ID2 = main.df[!already.sampled, , drop = FALSE]
	  sample2.df = main.ID1.notin.ID2[sample(nrow(main.ID1.notin.ID2), size = n), , drop = FALSE]
	  return(sample2.df)
	}
	Call:
	glm(formula = Probable.Match. ~ First.Name.Match + Spouse.First.Name.Match:Spouse.Last.Name.Match +
	Parenthetical + Ampersand, family = binomial(logit), data = fuzzy.matching)

	Deviance Residuals:
	Min 1Q Median 3Q Max
	-2.9371 -0.2437 -0.1136 -0.0462 3.3885

	Coefficients:
	Estimate Std. Error z value Pr(>|z|)