Matt Parker mmparker

## data_indeed_searches.md

      
              1 file
            
          
              0 forks
            
          
              1 comment
            
          
              0 stars
            
          
                mmparker
                / data_indeed_searches.md
            
            
              Last active
              August 29, 2015 14:16
            
              
                Data Analyst/Scientist/Etc. Indeed Searches
              
          
    No SAS, no ads, and no post-docs, please

("statistical programmer" or "statistical programming") -SAS

data scientist -marketing -advertising

applied data (analysis or analyst) -SAS -postdoc -postdoctoral -"post-doctoral"

  
## transform_example.r
# This is a quick script to illustrate how to go about transforming variables
# for statistical analysis and the effects of some basic transformations.
# I'm by no means an expert on transformations, so be sure to read up on
# how to best apply the transformations!

# These two packages are for demonstrating the transformations -
# not necessary for the transformations themselves.
library(reshape2)
library(ggplot2)

## review_random.r


get_random_subset <- function(df, id, n_ids = 1, select = TRUE) {

    # Pick one ID to review
    random_id <- sample(df[ , id], size = n_ids)

    # Print it
    df[df[ , id] %in% random_id, select]


## equals_vs_in.r


# Broadly speaking, it's safer to use %in% instead of == when building a
# logical vector for indexing in R: == yields NA wherever the element is NA,
# and indexing with an NA returns an NA - which is probably not the
# intuitive behavior. Here's what I mean:
x <- c("a", "b", NA, "c")

# Indexing with == (the NA comparison carries through into the result)
x[x == "a"]

## calc_percent.r

# Calculate percent of responses by department

# The input here is a subset of kano_questions - all of the
# responses related to one item
x_by_dept <- ddply(subset(x, !is.na(department)),  # Dropping that person with no department
                   .var = "department", # I'm going to split x by department
                   .fun = function(y) {   # Writing a custom, nameless function to apply
                                          # to each chunk of x


## lists_to_df.r
# Dummy list: two records that share the same fields (a, b, c)
z <- list(
  list(a = 1, b = 2, c = 3),
  list(a = 4, b = 5, c = 6)
)


# To stack every value into a single column - discarding the list names but
# forcing all values to one type - flatten the nested list, then coerce.
data.frame(values = as.numeric(unlist(z, use.names = FALSE)))


## date_diffs.r

library(zoo)


# Some sequence of Dates
x <- as.Date("2014-01-01") + c(0, 1, 2, 5, 10)


data.frame(x,
           basediff = c(NA, diff(x)),

## applytorows.r
# Sample data: two numeric columns plus one character column
X <- data.frame(x = c(1, 2, 3), y = c(4, 5, 6), etc = letters[1:3])

# Arbitrary stand-in for a function that can't be vectorized (i.e. pretend
# there is no pmax): returns the single largest value across both arguments.
max.fun <- function(a, b) {
  both <- c(a, b)
  max(both)
}

## printable_table.css
/* When printing, force a page break after every table row. */
@media print {

  tr {
    /* Legacy property, kept for older print engines. */
    page-break-after: always;
    /* Modern replacement for page-break-after. */
    break-after: page;
    /* NOTE(review): presumably set so the row is laid out as a block box
       and the forced break applies - confirm before removing. */
    display: block;
  }

}

## calc_qtr_end.r
# Calculate the date of the last day of a given quarter by pasting
# together the first day of the quarter's final month, adding one month,
# and subtracting a day.
# Very elegance
# Such vectorized
calc_qtr_end <- function(year, qtr) {

    require(lubridate) # Easiest way to add a month to a date

    (as.Date(paste(year, qtr * 3, "01", sep = "-")) %m+% months(1)) - 1
	# This is a quick script to illustrate how to go about transforming variables
	# for statistical analysis and the effects of some basic transformations.
	# I'm by no means an expert on transformations, so be sure to read up on
	# how to best apply the transformations!

	# These two packages are for demonstrating the transformations -
	# not necessary for the transformations themselves.
	library(reshape2)
	library(ggplot2)


	get_random_subset <- function(df, id, n_ids = 1, select = TRUE) {

	# Pick one ID to review
	random_id <- sample(df[ , id], size = n_ids)

	# Print it
	df[df[ , id] %in% random_id, select]


	# Broadly speaking, it's safer to %in% instead of == when using a logical
	# vector in R because R's indexing will return an NA - which is
	# probably not the intuitive behavior. Here's what I mean:
	x <- c("a", "b", NA, "c")

	# Indexing with ==
	x[x == "a"]

	# Calculate percent of responses by department

	# The input here is a subset of kano_questions - all of the
	# responses related to one item
	x_by_dept <- ddply(subset(x, !is.na(department)), # Dropping that person with no department
	.var = "department", # I'm going to split x by department
	.fun = function(y) { # Writing a custom, nameless function to apply
	# to each chunk of x
	# Dummy list
	z <- list(list('a' = 1, 'b' = 2, 'c' = 3), list('a' = 4, 'b' = 5, 'c' = 6))


	# If you just want to stack all of the values, not keeping any of the list name data
	# but ensuring they're all of one type
	data.frame(values = as.numeric(unlist(z)))

	library(zoo)


	# Some sequence of Dates
	x <- as.Date("2014-01-01") + c(0, 1, 2, 5, 10)


	data.frame(x,
	basediff = c(NA, diff(x)),
	# Sample data
	X <- data.frame(
	x = c(1, 2, 3),
	y = c(4, 5, 6),
	etc = c("a", "b", "c")
	)

	# Arbitrary stand-in for function that can't be vectorized (no pmax)
	max.fun <- function(a, b) { max(c(a, b)) }
	# Calculate the date of the last day of a given quarter by pasting
	# together its first day, adding three months, and subtracting a day.
	# Very elegance
	# Such vectorized
	calc_qtr_end <- function(year, qtr) {

	require(lubridate) # Easiest way to add a month to a date

	(as.Date(paste(year, qtr * 3, "01", sep = "-")) %m+% months(1)) - 1