Josef Fruehwald (JoFrhwld)

#' SQL Load
#'
#' This is a function meant to be used along with \code{ldply} to read data in using \code{sqldf}.
#'
#' @param x the path to a file to be read
#' @param selection the columns to return. Defaults to \code{"*"}
#' @param condition conditions defining which data rows to load in SQL
#' @param file.format an argument to be passed to \code{sqldf}.
#' Defaults to assume a tab-delimited file with a header row.
#' See \code{?sqldf} for more info
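The preview ends before the function body. A minimal sketch of an implementation consistent with the documentation above, assuming read.csv.sql() from the sqldf package does the reading (the name sql.load and the exact call are assumptions, not the original code):

sql.load <- function(x, selection = "*", condition = "",
                     file.format = list(header = TRUE, sep = "\t")){
  require(sqldf)
  ## run the query directly against the file; file.format is unpacked here
  ## rather than passed through, which may differ from the original
  read.csv.sql(x,
               sql = paste("SELECT", selection, "FROM file", condition),
               header = file.format$header,
               sep = file.format$sep)
}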
JoFrhwld / dplyr_to_ggplot2.r
Last active August 29, 2015 14:01
This is either awesome or perverse.
library(plyr)    # for the baseball data set
library(dplyr)
library(ggplot2)

## summarise runs per year with dplyr, then pipe straight into ggplot()
baseball %>%
  group_by(year) %>%
  summarise(r = sum(r)) %>%
  ggplot(., aes(year, r)) +
  geom_point()
JoFrhwld / bootMer_ex.R
Created August 11, 2014 12:38
my bootMer
library(lme4)

mod <- lmer(F1_n ~ plt_vclass * Decade_c * freq_c +
              (plt_vclass + freq_c | File) + (Decade_c | word),
            data = ays_to_test)

boot_fun <- function(mod){
  # x is a named vector of fixed effects
  x <- fixef(mod)
  # out is a longer named vector; the preview is truncated here, so this
  # completion is an assumption: append the random-effect standard
  # deviations to the fixed effects
  out <- c(x, unlist(lapply(VarCorr(mod), attr, "stddev")))
  out
}
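Assuming, as above, that boot_fun returns a plain numeric vector, it would be handed to bootMer() roughly like this; nsim and the percentile intervals are illustrative, not from the gist.

b <- bootMer(mod, FUN = boot_fun, nsim = 1000)
## percentile bootstrap intervals for each element boot_fun returns
apply(b$t, 2, quantile, probs = c(0.025, 0.975))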
library(babynames)
library(dplyr)
library(ggplot2)

## lifetables ships with babynames; x is age in years and lx is the number
## still alive at age x out of 100,000 born in the cohort year
lifetables %>%
  mutate(decade = year) %>%
  group_by(decade) %>%
  mutate(prob_alive = lx / 100000,
         study_year = year + x) -> prob_people
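An illustrative use of the result (not part of the truncated gist): plot each birth cohort's survival probability against calendar year.

ggplot(prob_people, aes(study_year, prob_alive, color = factor(decade))) +
  geom_line()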
JoFrhwld / cmu_n.py
Last active August 29, 2015 14:15
Non-initial onset /n/
from nltk.corpus import cmudict
import re

the_dict = cmudict.dict()

# join each ARPABET pronunciation into one space-separated string
# (string.join() was Python 2 only; " ".join() works in both versions)
the_dict2 = {word: [" ".join(x) for x in entries]
             for word, entries in the_dict.items()}

# the preview is truncated here; this filter is an assumption, keeping words
# with an /n/ between two vowels, i.e. the onset of a non-initial syllable
# (ARPABET vowels end in a stress digit 0-2)
two_n = {word: entries
         for word, entries in the_dict2.items()
         if any(re.search(r"[0-2] N [A-Z]+[0-2]", entry)
                for entry in entries)}
JoFrhwld / talk_gist.md
Last active August 29, 2015 14:17
Big Data and Sociolinguistics
- As datasets grow in size, it's going to become trivial to find "significant" effects (i.e. non-zero ones); the simulation sketch after this list illustrates the point.
  - That isn't a problem that can be fixed by just shrinking α down.
- We need to ask ourselves:
  1. Are the effects we're observing large enough to be interesting?
  2. How big did we expect them to be?
- To answer (2), we need an articulated theory that can make quantitative predictions.
- I walk through two examples where I try to predict effect sizes given background theory.
- Link: https://jofrhwld.github.io/papers/plc39_2015/
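A quick R illustration of the first point (a sketch, not from the talk): with a million observations, a true effect of just 0.01 still comes out overwhelmingly "significant".

set.seed(1)
n <- 1e6
x <- rnorm(n)
y <- 0.01 * x + rnorm(n)                 # the true effect is tiny but non-zero
summary(lm(y ~ x))$coefficients["x", ]   # t is around 10, p effectively zero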
scored_vclass <- function(df, class, vowels, dims = c("F1", "F2")){
  require(MASS)
  ## keep only the vowel classes of interest; the character round-trip
  ## drops now-unused factor levels
  df <- df[df[, class] %in% vowels, ]
  df[, class] <- as.character(df[, class])
  df[, class] <- as.factor(df[, class])
  for(i in dims){
    ## truncated in the gist preview; presumably each acoustic dimension
    ## is scored in turn (require(MASS) hints at lda() or Mahalanobis)
  }
}
JoFrhwld / clip.border.R
Created May 24, 2010 18:22
Maps Functions
clip.border <- function(deldir.obj, border){
  ## deldir.obj = output of deldir::deldir
  ## border = list of borders defined in data frames,
  ##          importantly with columns called "x" and "y"
  require(gpclib)

  ## bord will be a gpc polygon of all of the regions given to "border"
  bord <- as(matrix(ncol = 2), "gpc.poly")
  for(i in seq(along = border)){
    b <- as(border[[i]], "gpc.poly")
    ## the preview is truncated here; unioning the pieces is an assumption,
    ## as is the rest of the function, which presumably clips the deldir
    ## tiles against bord
    bord <- union(bord, b)
  }
  bord
}
JoFrhwld / philly.motion.R
Created October 31, 2011 22:10
Making a google motion chart
##########
## Block 1: Preparing the data
library(plyr)
library(reshape)

## Create a Subsystem column: collapse the fine-grained vowel classes in
## VClass into broad subsystems by reassigning the factor levels
all_philly$Subsystem <- all_philly$VClass
levels(all_philly$Subsystem) <- c("Vhr", "Vw", "V", "misc", "Vh", "Vh", "Vhr", "Vw", "Vy",
                                  "Vy", "V", "Vy", "Vy", "Vy", "V", "Vw", "Vy", "Vy", "Vhr",
library(purrr)
library(dplyr)
library(data.table)

meas_files <- Sys.glob("DataDirectory/speakers/*/*.txt")

## read each speaker's file, tagging rows with an id taken from the file
## name (the original pattern "(*).txt" is an invalid regex; "(.*)\\.txt"
## is what's intended)
meas_files %>%
  map(~ fread(.)[, list(idstring = gsub("(.*)\\.txt",
                                        "\\1",
                                        basename(.))
                        ## truncated in the preview; the original list()
                        ## presumably continues with measurement columns
                        )])
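A hedged sketch of how a pipeline like this typically finishes, assuming the goal is one combined table across speakers (all_meas and the rbindlist() step are this sketch's choices, not the gist's):

all_meas <- meas_files %>%
  map(~ fread(.)[, idstring := gsub("(.*)\\.txt", "\\1", basename(.))]) %>%
  rbindlist()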