inkhorn

## dedupe_records_w_less_info.r
# Positions of the name/contact indicator columns. Each of these fields has
# been coded 1 when the underlying field holds something and 0 when it is
# empty. The run is 5 through 26 with column 22 not included.
bio_cols = as.numeric(c(5:21, 23:26))

# Row indices of every record whose ID already appeared in an earlier row,
# i.e. the second and later occurrences of each repeated ID.
# duplicated() never yields NA, so its result can be passed to which() directly.
dupe_id_rows = which(duplicated(big.dataset$ID))

## chisq_mining.r
# Chi-squared screen: test each candidate column of `big.dataset` against the
# outcome `DV_3lvls` and keep, per column, the test statistic, the column name
# and the cross-tabulation.
testvars = c(6, 7, 9:14, 16:21, 23:26, 384, 375, 376, 386, 385, 387, 388)

# Build one row per tested column and bind them once at the end. This avoids
# re-copying the accumulated result on every iteration (rbind inside a loop)
# and keeps the per-column temporaries out of the global workspace.
# The index is still called `i` so the xtabs dimension label
# ("big.dataset[, i]") is the same as before.
# Result: a list-matrix with columns `chisq`, `testvar` and `xtab`.
resultlist = do.call(rbind, lapply(testvars, function(i) {
  list(
    chisq = chisq.test(big.dataset[, i], big.dataset$DV_3lvls)$statistic,
    testvar = names(big.dataset)[i],
    xtab = xtabs(~DV_3lvls + big.dataset[, i], data = big.dataset)
  )
}))

## safe.max.r
# Maximum of a vector that returns NA instead of -Inf (plus a warning) when
# there is nothing to take the maximum of.
#
# Args:
#   invector: an atomic vector; may contain NA values and may be empty.
#
# Returns:
#   NA when `invector` is empty or consists only of missing values, otherwise
#   max(invector, na.rm = TRUE).
safe.max = function(invector) {
  # all() is TRUE for zero-length input, so an empty vector is handled like an
  # all-NA one. The previous NA-proportion test computed 0/0 = NaN for empty
  # input and `if (NaN == 1)` is an error.
  if (all(is.na(invector))) {
    return(NA)
  }
  max(invector, na.rm = TRUE)
}

## fmatchresults
Call:
glm(formula = Probable.Match. ~ First.Name.Match + Spouse.First.Name.Match:Spouse.Last.Name.Match +
    Parenthetical + Ampersand, family = binomial(logit), data = fuzzy.matching)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-2.9371  -0.2437  -0.1136  -0.0462   3.3885

Coefficients:
                                               Estimate Std. Error z value Pr(>|z|)

## df_sample_exIDs.r
# Draw a random sample of `n` rows from `main.df`, excluding every row whose ID
# already occurs in a previously drawn sample.
#
# Args:
#   main.df:    data frame to sample from.
#   sample1.df: data frame holding the earlier sample whose IDs are excluded.
#   n:          number of rows to draw (without replacement).
#   ID1.name:   name of the ID column in `main.df`, given as a string.
#   ID2.name:   name of the ID column in `sample1.df`, given as a string.
#
# Returns:
#   `n` rows of `main.df`, none of whose `ID1.name` values appear in
#   `sample1.df[, ID2.name]`. sample() raises an error when fewer than `n`
#   eligible rows remain.
df.sample.exIDs = function(main.df, sample1.df, n, ID1.name, ID2.name) {
  # drop = TRUE guarantees plain vectors for %in%, whatever the table class.
  main.ids = main.df[, ID1.name, drop = TRUE]
  used.ids = sample1.df[, ID2.name, drop = TRUE]

  # drop = FALSE keeps the result a data frame even when there is only one
  # column; otherwise it collapses to a vector and nrow() below returns NULL.
  candidates = main.df[!main.ids %in% used.ids, , drop = FALSE]
  candidates[sample(nrow(candidates), size = n), , drop = FALSE]
}

## dfsample.r
# Draw a simple random sample of `n` rows (without replacement) from a data
# frame.
#
# Args:
#   df.in: data frame to sample from.
#   n:     number of rows to draw; sample() errors if it exceeds nrow(df.in).
#
# Returns:
#   The `n` selected rows of `df.in`, in the order they were drawn.
df.sample = function(df.in, n) {
  picked = sample(nrow(df.in), size = n)
  # drop = FALSE: a one-column data frame must stay a data frame rather than
  # collapsing to a bare vector.
  df.in[picked, , drop = FALSE]
}

## lengthby.r
# Count the non-missing values of `y` within each group of `x`.
#
# Args:
#   y: vector of values; NA entries are not counted.
#   x: grouping vector (or factor) of the same length as `y`.
#
# Returns:
#   The result of tapply(): one count per group, named by group.
LengthBy = function(y, x) {
  observed = !is.na(y)
  tapply(X = observed, INDEX = x, FUN = sum)
}

## gist:2151594
# Assuming the input is a stored binomial GLM object
Concordance = function(GLM.binomial) {
  outcome_and_fitted_col = cbind(GLM.binomial$y, GLM.binomial$fitted.values)
  # get a subset of outcomes where the event actually happened
  ones = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 1,]
  # get a subset of outcomes where the event didn't actually happen
  zeros = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 0,]
  # Equate the length of the event and non-event tables
  if (length(ones[,1])>length(zeros[,1])) {ones = ones[1:length(zeros[,1]),]}
    else {zeros = zeros[1:length(ones[,1]),]}
	# These column numbers represent fields with name/contact info that I've
	# marked with 1s and 0s depending on whether or not there's anything in
	# the field.

	bio_cols = c(5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26)

	# Now we get the row numbers of all the records with duplicate IDs

	dupe_id_rows = which(duplicated(big.dataset$ID) == TRUE)
	testvars = c(6,7,9,10,11,12,13,14,16, 17,18,19,20,21,23,24,25,26,384,375,376,386,385,387,388)

	resultlist = c()
	for (i in testvars) {
	xsq = chisq.test(big.dataset[,i], big.dataset$DV_3lvls)$statistic
	varname = names(big.dataset)[i]
	tab = xtabs(~DV_3lvls + big.dataset[,i], data=big.dataset)
	resultlist = rbind(resultlist, list(chisq=xsq, testvar=varname, xtab=tab))
	}
	safe.max = function(invector) {
	na.pct = sum(is.na(invector))/length(invector)
	if (na.pct == 1) {
	return(NA) }
	else {
	return(max(invector,na.rm=TRUE))
	}
	}
	Call:
	glm(formula = Probable.Match. ~ First.Name.Match + Spouse.First.Name.Match:Spouse.Last.Name.Match +
	Parenthetical + Ampersand, family = binomial(logit), data = fuzzy.matching)

	Deviance Residuals:
	Min 1Q Median 3Q Max
	-2.9371 -0.2437 -0.1136 -0.0462 3.3885

	Coefficients:
	Estimate Std. Error z value Pr(>\|z\|)
	# This function assumes that you're going to input ID1.name and ID2.name as strings.
	df.sample.exIDs = function(main.df, sample1.df, n, ID1.name, ID2.name) {
	main.ID1.notin.ID2 = main.df[!main.df[,ID1.name] %in% sample1.df[,ID2.name],]
	sample2.df = main.ID1.notin.ID2[sample(nrow(main.ID1.notin.ID2), size=n),]
	return(sample2.df)
	}
	df.sample = function(df.in, n) {
	return(df.in[sample(nrow(df.in), size=n),])
	}
	# Assuming the input is a stored binomial GLM object
	Concordance = function(GLM.binomial) {
	outcome_and_fitted_col = cbind(GLM.binomial$y, GLM.binomial$fitted.values)
	# get a subset of outcomes where the event actually happened
	ones = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 1,]
	# get a subset of outcomes where the event didn't actually happen
	zeros = outcome_and_fitted_col[outcome_and_fitted_col[,1] == 0,]
	# Equate the length of the event and non-event tables
	if (length(ones[,1])>length(zeros[,1])) {ones = ones[1:length(zeros[,1]),]}
	else {zeros = zeros[1:length(ones[,1]),]}