Josef Fruehwald JoFrhwld

## terror.R
#' rvest for scraping 538
library(rvest)
library(magrittr)

#' scrape the forecast
five38 <- read_html("http://projects.fivethirtyeight.com/2016-election-forecast/?ex_cid=rrpromo#plus")

#' I'd prefer to be using the polls-plus forecast here, but
#' I can only seem to get the polls-only one
clinton <- five38 %>%

## zero_crossings.R
#' Find zero crossings in an fd object
#'
#' @import fda
#' @import magrittr
#'
#' @param fd an fd object
#' @param Lfdobj the derivative (0, 1, 2)
#' @param slope The slope of interest at the zero crossing
#' @param eps The prediction granularity
#' @param min Localize the zero crossing search to be greater than min

## list2fd.R
list2fd <- function(list, basis){
  if(class(list[[1]]) == "fdSmooth"){
    coef_list <- lapply(list, function(x)x$fd$coefs)
  }else if(class(list[[1]]) == "fd"){
    coef_list <- lapply(list, function(x)x$coefs)
  }
  n_coefs <- unlist(lapply(coef_list, length))

  if(!all(n_coefs == max(n_coefs))) stop()


## global_means.R
library(purrr)
library(dplyr)
library(data.table)

meas_files <- Sys.glob("DataDirectory/speakers/*/*.txt")

meas_files %>%
  map(~fread(.)[,list(idstring =  gsub("(*).txt",
                                       "\\1",
                                       basename(.)),

## talk_gist.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                JoFrhwld
                / talk_gist.md
            
            
              Last active
              August 29, 2015 14:17
            
              
                Big Data and Sociolinguistics
              
          
As datasets grow in size, it's going to become trivial to find "significant" effects (i.e. non-zero).

That isn't a problem that can be fixed by just shrinking α down.


We need to ask ourselves:

1. Are the effects we're observing large enough to be interesting?
2. How big did we expect them to be?


To answer (2), we need an articulated theory that can make quantitative predictions.
I walk through two examples where I try to predict effect sizes given background theory.
link: https://jofrhwld.github.io/papers/plc39_2015/


## cmu_n.py
from nltk.corpus import cmudict
import string
import re

# Load the CMU pronouncing dictionary and flatten every pronunciation of
# every word into a single space-separated string.
# str.join replaces string.join(), which was removed in Python 3.
the_dict = cmudict.dict()
the_dict2 = {word: [" ".join(x) for x in entries]
             for word, entries in the_dict.items()}

two_n = {word: entries

## person_entropy.R
library(babynames)
library(dplyr)
library(ggplot2)

# Build prob_people from lifetables: group by a `decade` column copied from
# `year`, then add prob_alive (lx scaled by 100000) and study_year (year + x).
# NOTE(review): x is presumably an age column -- confirm against lifetables.
prob_people <- lifetables %>%
  group_by(decade = year) %>%
  mutate(
    prob_alive = lx / 100000,
    study_year = year + x
  )

## bootMer_ex.R
library(lme4)

# Fit a linear mixed-effects model of F1_n using the rows of ays_to_test.
# Fixed effects: plt_vclass, Decade_c and freq_c with all of their interactions.
# Random effects: intercept plus plt_vclass and freq_c slopes by File, and
# intercept plus a Decade_c slope by word.
mod <-  lmer(F1_n ~ plt_vclass * Decade_c * freq_c + (plt_vclass + freq_c| File) + (Decade_c|word),
             data = ays_to_test)

boot_fun <- function(mod){
  # x is a named vector
  x <- fixef(mod)

  #out is a longer named vector

## dplyr_to_ggplot2.r
# plyr is attached first so that dplyr, attached afterwards, takes precedence
# for the verbs both packages define (e.g. summarise, mutate).
library(plyr)
library(dplyr)
library(ggplot2)


# Sum `r` within each year of `baseball` and pipe the summary straight into
# ggplot2 as a scatter plot of the yearly totals.
# NOTE(review): `baseball` is presumably the dataset shipped with plyr -- confirm.
baseball %>%
  group_by(year) %>%
  summarise(r = sum(r)) %>%
  ggplot(., aes(year, r)) +
    geom_point()

## sql_load.R
#' SQL Load
#'
#' This is a function meant to be used along with ldply to read data in using sqldf.
#'
#' @param x the path to a file to be read
#' @param selection the columns to return. Defaults to \code{"*"}
#' @param condition conditions defining which data rows to load in SQL
#' @param file.format an argument to be passed to \code{sqldf}.
#' Defaults to assume a tab-delimited file with a header row.
#' See \code{?sqldf} for more info
	#' rvest for scraping 538
	library(rvest)
	library(magrittr)

	#' scrape the forecast
	five38 <- read_html("http://projects.fivethirtyeight.com/2016-election-forecast/?ex_cid=rrpromo#plus")

	#' I'd prefer to be using the polls-plus forecast here, but
	#' I can only seem to get the polls-only one
	clinton <- five38 %>%
	#' Find zero crossings in an fd object
	#'
	#' @import fda
	#' @import magrittr
	#'
	#' @param fd an fd object
	#' @param Lfdobj the derivative (0, 1, 2)
	#' @param slope The slope of interest at the zero crossing
	#' @param eps The prediction granularity
	#' @param min Localize the zero crossing search to be greater than min
	list2fd <- function(list, basis){
	if(class(list[[1]]) == "fdSmooth"){
	coef_list <- lapply(list, function(x)x$fd$coefs)
	}else if(class(list[[1]]) == "fd"){
	coef_list <- lapply(list, function(x)x$coefs)
	}
	n_coefs <- unlist(lapply(coef_list, length))

	if(!all(n_coefs == max(n_coefs))) stop()
	library(purrr)
	library(dplyr)
	library(data.table)

	# Collect every .txt file found one directory level below
	# DataDirectory/speakers (one sub-directory per wildcard match).
	meas_files <- Sys.glob("DataDirectory/speakers/*/*.txt")

	meas_files %>%
	map(~fread(.)[,list(idstring = gsub("(*).txt",
	"\\1",
	basename(.)),
	from nltk.corpus import cmudict
	import string
	import re

	# Load the CMU pronouncing dictionary and flatten every pronunciation of
	# every word into a single space-separated string.
	# str.join replaces string.join(), which was removed in Python 3.
	the_dict = cmudict.dict()
	the_dict2 = {word: [" ".join(x) for x in entries]
	             for word, entries in the_dict.items()}

	two_n = {word: entries
	library(babynames)
	library(dplyr)
	library(ggplot2)

	# Build prob_people from lifetables: group by a `decade` column copied from
	# `year`, then add prob_alive (lx scaled by 100000) and study_year (year + x).
	# NOTE(review): x is presumably an age column -- confirm against lifetables.
	prob_people <- lifetables %>%
	  group_by(decade = year) %>%
	  mutate(
	    prob_alive = lx / 100000,
	    study_year = year + x
	  )
	library(lme4)

	# Fit a linear mixed-effects model of F1_n using the rows of ays_to_test.
	# Fixed effects: plt_vclass, Decade_c and freq_c with all of their interactions.
	# Random effects: intercept plus plt_vclass and freq_c slopes by File, and
	# intercept plus a Decade_c slope by word.
	# The grouping bars must be plain `|`; a backslash before them is not valid R.
	mod <- lmer(F1_n ~ plt_vclass * Decade_c * freq_c + (plt_vclass + freq_c | File) + (Decade_c | word),
	            data = ays_to_test)

	boot_fun <- function(mod){
	# x is a named vector
	x <- fixef(mod)

	#out is a longer named vector
	# plyr is attached first so that dplyr, attached afterwards, takes precedence
	# for the verbs both packages define (e.g. summarise, mutate).
	library(plyr)
	library(dplyr)
	library(ggplot2)


	# Sum `r` within each year of `baseball` and pipe the summary straight into
	# ggplot2 as a scatter plot of the yearly totals.
	# NOTE(review): `baseball` is presumably the dataset shipped with plyr -- confirm.
	baseball %>%
	  group_by(year) %>%
	  summarise(r = sum(r)) %>%
	  ggplot(., aes(year, r)) +
	    geom_point()
	#' SQL Load
	#'
	#' This is a function meant to be used along with ldply to read data in using sqldf.
	#'
	#' @param x the path to a file to be read
	#' @param selection the columns to return. Defaults to \code{"*"}
	#' @param condition conditions defining which data rows to load in SQL
	#' @param file.format an argument to be passed to \code{sqldf}.
	#' Defaults to assume a tab-delimited file with a header row.
	#' See \code{?sqldf} for more info