inkhorn

## toronto.casino.glm.r
Call:
glm(formula = casino$Q6 == "City of Toronto" ~ GoBigorGoHome +
    TechnicalDetails + Soc.Env.Issues, family = binomial(logit),
    data = casino)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-3.6426  -0.4745  -0.1156   0.4236   3.4835

Coefficients:

## toronto_casino.r
library(ff)
library(ffbase)
library(stringr)
library(ggplot2)
library(ggthemes)
library(reshape2)
library(RgoogleMaps)

# Loading 2 copies of the same data set so that I can convert one and have the original for its text values
casino = read.csv("/home/inkhorn/Downloads/casino_survey_results20130325.csv")

## stack multiple copies of the same data type together.r
# Here's where I extract the database IDs and repeat them 50 times to make the column long enough for
# my new long-form dataset (596,100 rows)

client.data.new = rep(client.data[,1],50)

for (i in 2:32){
# for each column in the first 31 after the ID column, find the 49 matching columns
# to the right and stack them using melt

    stacked.data = melt(client.data, id.vars="CnBio_ID", measure.vars=seq(i,(i+(31*49)),31), value.name=names(client.data)[i])

## penultimax.r
penultimax = function(invector) {
   # If the vector starts off as only having 1 or 0 numbers, return NA
  if (length(invector) <= 1) {
    return(NA)
  }
  first.max = safe.max(invector)
   #Once we get the max, take it out of the vector and make newvector
  newvector = invector[!invector == first.max]
   #If newvector now has nothing in it, return NA
  if (length(newvector) == 0) {

## crossbarminmax.r
# Crossbar plot of S.Trial.3 by Sex (minimum / median / maximum), with the
# minimum and maximum for smokers overlaid as points.
# ddply (plyr), melt (reshape2) and ggplot2 must already be attached.

# Tab-separated data with a header row, read from the clipboard.
scents <- read.table("clipboard", header = TRUE, sep = "\t")

# One row per Sex; quantile() names the columns "0%", "50%" and "100%".
strial3.by.sex.wide <- ddply(
  scents, "Sex",
  function(x) quantile(x$S.Trial.3, c(0, .5, 1), na.rm = TRUE)
)

# Min and max for smokers only, melted to long form (Sex, Percentile, Time).
strial3.by.sex.smokers <- melt(
  ddply(
    subset(scents, Smoker == "Y"), "Sex",
    function(x) quantile(x$S.Trial.3, c(0, 1), na.rm = TRUE)
  ),
  variable.name = "Percentile", value.name = "Time"
)

# Every `+` must END a line: a line that begins with `+` starts a new
# expression, so the theme and the y-axis title used to be silently dropped
# from the plot. opts()/theme_text() are replaced by their current ggplot2
# equivalents theme()/element_text().
ggplot() +
  geom_crossbar(
    data = strial3.by.sex.wide,
    aes(x = Sex, y = `50%`, ymin = `0%`, ymax = `100%`),
    fill = "#bcc927", width = .75
  ) +
  # stat = "identity" is already the default for geom_point and is not an
  # aesthetic, so it no longer appears inside aes().
  geom_point(data = strial3.by.sex.smokers, aes(x = Sex, y = Time), size = 3) +
  theme(
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10, hjust = 1),
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, angle = 90, face = "bold")
  ) +
  scale_y_continuous(name = "Time to Completion")

## dedupe_records_w_less_info.r
# Positions of the name/contact indicator columns: each one holds a 1 when
# the corresponding field has something in it and a 0 when it is empty.
# That is columns 5 through 26 with column 22 left out; as.numeric() keeps
# the indices stored as doubles, exactly as a hand-typed list would be.
bio_cols <- as.numeric(c(5:21, 23:26))

# Row numbers of every record whose ID has already occurred further up
dupe_id_rows <- which(duplicated(big.dataset$ID))

## chisq_mining.r
# Column positions in big.dataset of the variables to test against DV_3lvls
testvars = c(6,7,9,10,11,12,13,14,16, 17,18,19,20,21,23,24,25,26,384,375,376,386,385,387,388)

# Result accumulator: starts as NULL and gains one row per tested column
resultlist = c()
for (i in testvars) {
  # Chi-squared statistic for column i against big.dataset$DV_3lvls
  xsq = chisq.test(big.dataset[,i], big.dataset$DV_3lvls)$statistic
  # Name of the column being tested
  varname = names(big.dataset)[i]
  # Cross-tabulation of DV_3lvls by column i
  tab = xtabs(~DV_3lvls + big.dataset[,i], data=big.dataset)
  # Append the three pieces as one row (columns: chisq, testvar, xtab).
  # NOTE(review): rbind inside the loop re-copies resultlist on every pass;
  # fine for a couple of dozen columns, worth preallocating for many more.
  resultlist = rbind(resultlist, list(chisq=xsq, testvar=varname, xtab=tab))
}

## safe.max.r
# Largest non-missing value of `invector`, or NA when there is nothing to
# take a maximum of.
#
# Args:
#   invector: an atomic vector (typically numeric); may contain NAs.
#
# Returns:
#   max(invector, na.rm = TRUE) when at least one element is non-missing;
#   NA when every element is NA or when the vector has length zero.
#
# Dividing the NA count by length(invector) gives 0/0 = NaN for an empty
# vector, and `if (NaN == 1)` stops with "missing value where TRUE/FALSE
# needed". all() over an empty logical vector is TRUE, so the single test
# below covers both the all-NA and the empty case.
safe.max <- function(invector) {
  if (all(is.na(invector))) {
    return(NA)
  }
  max(invector, na.rm = TRUE)
}

## fmatchresults
Call:
glm(formula = Probable.Match. ~ First.Name.Match + Spouse.First.Name.Match:Spouse.Last.Name.Match +
    Parenthetical + Ampersand, family = binomial(logit), data = fuzzy.matching)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-2.9371  -0.2437  -0.1136  -0.0462   3.3885

Coefficients:
                                               Estimate Std. Error z value Pr(>|z|)

## df_sample_exIDs.r
# Draw a random sample of `n` rows from `main.df`, leaving out every row
# whose ID already appears in `sample1.df`.
#
# Args:
#   main.df:    data frame to sample from.
#   sample1.df: data frame holding the IDs to exclude.
#   n:          number of rows to draw (sampled without replacement).
#   ID1.name:   name of the ID column in main.df, given as a string.
#   ID2.name:   name of the ID column in sample1.df, given as a string.
#
# Returns:
#   A data frame with `n` rows of main.df whose ID1.name values do not occur
#   in sample1.df's ID2.name column.
df.sample.exIDs <- function(main.df, sample1.df, n, ID1.name, ID2.name) {
  taken.ids <- sample1.df[, ID2.name]
  is.eligible <- !(main.df[, ID1.name] %in% taken.ids)
  # drop = FALSE keeps a data frame even when main.df has a single column;
  # without it the subset collapses to a vector, nrow() returns NULL and
  # the sample() call below fails.
  eligible.df <- main.df[is.eligible, , drop = FALSE]
  chosen.rows <- sample(nrow(eligible.df), size = n)
  eligible.df[chosen.rows, , drop = FALSE]
}
	Call:
	glm(formula = casino$Q6 == "City of Toronto" ~ GoBigorGoHome +
	TechnicalDetails + Soc.Env.Issues, family = binomial(logit),
	data = casino)

	Deviance Residuals:
	Min 1Q Median 3Q Max
	-3.6426 -0.4745 -0.1156 0.4236 3.4835

	Coefficients:
	library(ff)
	library(ffbase)
	library(stringr)
	library(ggplot2)
	library(ggthemes)
	library(reshape2)
	library(RgoogleMaps)

	# Loading 2 copies of the same data set so that I can convert one and have the original for its text values
	casino = read.csv("/home/inkhorn/Downloads/casino_survey_results20130325.csv")
	# Here's where I extract the database IDs and repeat them 50 times to make the column long enough for
	# my new long-form dataset (596,100 rows)

	client.data.new = rep(client.data[,1],50)

	for (i in 2:32){
	# for each column in the first 31 after the ID column, find the 49 matching columns
	# to the right and stack them using melt

	stacked.data = melt(client.data, id.vars="CnBio_ID", measure.vars=seq(i,(i+(31*49)),31), value.name=names(client.data)[i])
	penultimax = function(invector) {
	# If the vector starts off as only having 1 or 0 numbers, return NA
	if (length(invector) <= 1) {
	return(NA)
	}
	first.max = safe.max(invector)
	#Once we get the max, take it out of the vector and make newvector
	newvector = invector[!invector == first.max]
	#If newvector now has nothing in it, return NA
	if (length(newvector) == 0) {
	# Crossbar plot of S.Trial.3 by Sex (minimum / median / maximum), with the
	# minimum and maximum for smokers overlaid as points.
	# ddply (plyr), melt (reshape2) and ggplot2 must already be attached.

	# Tab-separated data with a header row, read from the clipboard.
	scents <- read.table("clipboard", header = TRUE, sep = "\t")

	# One row per Sex; quantile() names the columns "0%", "50%" and "100%".
	strial3.by.sex.wide <- ddply(
	  scents, "Sex",
	  function(x) quantile(x$S.Trial.3, c(0, .5, 1), na.rm = TRUE)
	)

	# Min and max for smokers only, melted to long form (Sex, Percentile, Time).
	strial3.by.sex.smokers <- melt(
	  ddply(
	    subset(scents, Smoker == "Y"), "Sex",
	    function(x) quantile(x$S.Trial.3, c(0, 1), na.rm = TRUE)
	  ),
	  variable.name = "Percentile", value.name = "Time"
	)

	# Every `+` must END a line: a line that begins with `+` starts a new
	# expression, so the theme and the y-axis title used to be silently dropped
	# from the plot. opts()/theme_text() are replaced by their current ggplot2
	# equivalents theme()/element_text().
	ggplot() +
	  geom_crossbar(
	    data = strial3.by.sex.wide,
	    aes(x = Sex, y = `50%`, ymin = `0%`, ymax = `100%`),
	    fill = "#bcc927", width = .75
	  ) +
	  # stat = "identity" is already the default for geom_point and is not an
	  # aesthetic, so it no longer appears inside aes().
	  geom_point(data = strial3.by.sex.smokers, aes(x = Sex, y = Time), size = 3) +
	  theme(
	    legend.title = element_text(size = 10, face = "bold"),
	    legend.text = element_text(size = 10),
	    axis.text.x = element_text(size = 10),
	    axis.text.y = element_text(size = 10, hjust = 1),
	    axis.title.x = element_text(size = 12, face = "bold"),
	    axis.title.y = element_text(size = 12, angle = 90, face = "bold")
	  ) +
	  scale_y_continuous(name = "Time to Completion")
	# Positions of the name/contact indicator columns: each one holds a 1 when
	# the corresponding field has something in it and a 0 when it is empty.
	# That is columns 5 through 26 with column 22 left out; as.numeric() keeps
	# the indices stored as doubles, exactly as a hand-typed list would be.
	bio_cols <- as.numeric(c(5:21, 23:26))

	# Row numbers of every record whose ID has already occurred further up
	dupe_id_rows <- which(duplicated(big.dataset$ID))
	# Column positions in big.dataset of the variables to test against DV_3lvls
	testvars = c(6,7,9,10,11,12,13,14,16, 17,18,19,20,21,23,24,25,26,384,375,376,386,385,387,388)

	# Result accumulator: starts as NULL and gains one row per tested column
	resultlist = c()
	for (i in testvars) {
	# Chi-squared statistic for column i against big.dataset$DV_3lvls
	xsq = chisq.test(big.dataset[,i], big.dataset$DV_3lvls)$statistic
	# Name of the column being tested
	varname = names(big.dataset)[i]
	# Cross-tabulation of DV_3lvls by column i
	tab = xtabs(~DV_3lvls + big.dataset[,i], data=big.dataset)
	# Append the three pieces as one row (columns: chisq, testvar, xtab).
	# NOTE(review): rbind inside the loop re-copies resultlist on every pass;
	# fine for a couple of dozen columns, worth preallocating for many more.
	resultlist = rbind(resultlist, list(chisq=xsq, testvar=varname, xtab=tab))
	}
	# Largest non-missing value of `invector`, or NA when there is nothing to
	# take a maximum of.
	#
	# Args:
	#   invector: an atomic vector (typically numeric); may contain NAs.
	#
	# Returns:
	#   max(invector, na.rm = TRUE) when at least one element is non-missing;
	#   NA when every element is NA or when the vector has length zero.
	#
	# Dividing the NA count by length(invector) gives 0/0 = NaN for an empty
	# vector, and `if (NaN == 1)` stops with "missing value where TRUE/FALSE
	# needed". all() over an empty logical vector is TRUE, so the single test
	# below covers both the all-NA and the empty case.
	safe.max <- function(invector) {
	  if (all(is.na(invector))) {
	    return(NA)
	  }
	  max(invector, na.rm = TRUE)
	}
	Call:
	glm(formula = Probable.Match. ~ First.Name.Match + Spouse.First.Name.Match:Spouse.Last.Name.Match +
	Parenthetical + Ampersand, family = binomial(logit), data = fuzzy.matching)

	Deviance Residuals:
	Min 1Q Median 3Q Max
	-2.9371 -0.2437 -0.1136 -0.0462 3.3885

	Coefficients:
	Estimate Std. Error z value Pr(>\|z\|)
	# Draw a random sample of `n` rows from `main.df`, leaving out every row
	# whose ID already appears in `sample1.df`.
	#
	# Args:
	#   main.df:    data frame to sample from.
	#   sample1.df: data frame holding the IDs to exclude.
	#   n:          number of rows to draw (sampled without replacement).
	#   ID1.name:   name of the ID column in main.df, given as a string.
	#   ID2.name:   name of the ID column in sample1.df, given as a string.
	#
	# Returns:
	#   A data frame with `n` rows of main.df whose ID1.name values do not occur
	#   in sample1.df's ID2.name column.
	df.sample.exIDs <- function(main.df, sample1.df, n, ID1.name, ID2.name) {
	  taken.ids <- sample1.df[, ID2.name]
	  is.eligible <- !(main.df[, ID1.name] %in% taken.ids)
	  # drop = FALSE keeps a data frame even when main.df has a single column;
	  # without it the subset collapses to a vector, nrow() returns NULL and
	  # the sample() call below fails.
	  eligible.df <- main.df[is.eligible, , drop = FALSE]
	  chosen.rows <- sample(nrow(eligible.df), size = n)
	  eligible.df[chosen.rows, , drop = FALSE]
	}