# agalea91/R-tips.R

## R-tips.R
### Miscellaneous tips for using R

# Column selection with select()
# Negating a group of names drops those columns and keeps everything else
wage_red <- select(wage, -c(sibs, brthord, meduc, feduc))
# Listing names keeps only those columns
wage_subset <- select(wage, hours, lwage)

# Confidence interval from a quantile function ("q" + distribution name)
# NOTE(review): assumes lm.wage_1 is a fitted lm whose second coefficient is the
# slope of interest -- confirm against where lm.wage_1 is created
coef_table <- summary(lm.wage_1)$coefficients
# Index by row and column name: linear positions [2] and [4] only land on the
# slope's estimate and standard error when the model has exactly two coefficients
beta_iq_mean <- coef_table[2, "Estimate"]
beta_iq_std <- coef_table[2, "Std. Error"]
# Residual degrees of freedom come from the fit instead of a hard-coded 933
# qt = Student-t quantiles; qnorm etc. can be used the same way
qt(p = c(0.025, 0.975), df = df.residual(lm.wage_1)) * beta_iq_std + beta_iq_mean

# Alternative confidence interval via confint() (unverified tip, worth trying)
# BPM_pred_lwage is the predict() result created in the bas.lm section of this file
# The argument is spelled `parm`, as in the stats::confint generic; `param` is
# not a formal argument and would not be matched to it
ci <- confint(BPM_pred_lwage, parm = "pred")
# Row of the interval table belonging to the largest fitted value
top_y <- which.max(BPM_pred_lwage$fit)
ci[top_y, ]

# Show several plots side by side: split the device into 1 row x 3 columns
# par() returns the previous settings; keep them so the layout can be undone
# later with par(old_par)
old_par <- par(mfrow = c(1, 3))

# Model fitting and prediction with bas.lm from the BAS library
# Regress lwage on every other column except wage, after dropping incomplete rows
lm.wage <- bas.lm(
  formula = lwage ~ . - wage,
  data = na.omit(wage),
  prior = "BIC",
  modelprior = uniform()
)
lm.wage
summary(lm.wage)
# Predict from the model fitted above; the original referred to `bma_lwage`,
# which this file never defines
# estimator: "BPM" = best predictive model; alternatives are "MPM" (median
# probability model) and "HPM" (highest probability model)
BPM_pred_lwage <- predict(lm.wage, estimator = "BPM", se.fit = TRUE)
# Names of the variables in the selected model
# NOTE(review): the +1 presumably converts 0-based bestmodel indices to
# positions in namesx -- confirm against the BAS documentation
lm.wage$namesx[BPM_pred_lwage$bestmodel + 1]

# Print a data frame in an easier-to-read layout by transposing it: t(<data frame>)
# The original placeholder text was not valid R and kept the file from parsing;
# the built-in iris data stands in for your own data frame here
t(head(iris))

# Find the feature row that corresponds to the largest element
# (the same argmax-then-index trick works for numpy arrays in Python)
X <- na.omit(wage) # data frame containing the features
y <- BPM_pred_lwage$fit # predicted values of y for the training features
top_y <- which.max(y) # position of the largest prediction
t(X[top_y, ]) # transposed for readability

# Handy dplyr verbs
# filter(): keep the rows that satisfy a condition on a column
df.diamonds_ideal <- diamonds %>% filter(cut == "Ideal")
# select(): keep a chosen set of columns
df.diamonds_ideal <- df.diamonds_ideal %>%
  select(carat, cut, color, price, clarity)
# mutate(): add a derived column
df.diamonds_ideal <- df.diamonds_ideal %>%
  mutate(price_per_carat = price / carat)
# arrange(): sort by a column, ascending
df.disordered_data %>% arrange(num_var)
# arrange() with desc(): sort from largest to smallest
df.disordered_data %>% arrange(desc(num_var))
# Chaining: each step's output is passed on as the next step's input
df.diamonds_ideal_chained <- diamonds %>%
  filter(cut == "Ideal") %>%
  select(carat, cut, color, price, clarity) %>%
  mutate(price_per_carat = price / carat)

# Pair plot, the counterpart of seaborn's pairplot
library(ggplot2)
library(GGally)
# Aesthetics are passed through `mapping = aes(...)`; supplying colour/alpha as
# top-level ggpairs() arguments is not accepted as aesthetics by current GGally
ggpairs(iris, mapping = aes(colour = Species, alpha = 0.4))
	### Miscellaneous tips for using R

	# Column selection with select()
	# Negating a group of names drops those columns and keeps everything else
	wage_red <- select(wage, -c(sibs, brthord, meduc, feduc))
	# Listing names keeps only those columns
	wage_subset <- select(wage, hours, lwage)

	# Confidence interval from a quantile function ("q" + distribution name)
	# NOTE(review): assumes lm.wage_1 is a fitted lm whose second coefficient is the
	# slope of interest -- confirm against where lm.wage_1 is created
	coef_table <- summary(lm.wage_1)$coefficients
	# Index by row and column name: linear positions [2] and [4] only land on the
	# slope's estimate and standard error when the model has exactly two coefficients
	beta_iq_mean <- coef_table[2, "Estimate"]
	beta_iq_std <- coef_table[2, "Std. Error"]
	# Residual degrees of freedom come from the fit instead of a hard-coded 933
	# qt = Student-t quantiles; qnorm etc. can be used the same way
	qt(p = c(0.025, 0.975), df = df.residual(lm.wage_1)) * beta_iq_std + beta_iq_mean

	# Alternative confidence interval via confint() (unverified tip, worth trying)
	# BPM_pred_lwage is the predict() result created in the bas.lm section of this file
	# The argument is spelled `parm`, as in the stats::confint generic; `param` is
	# not a formal argument and would not be matched to it
	ci <- confint(BPM_pred_lwage, parm = "pred")
	# Row of the interval table belonging to the largest fitted value
	top_y <- which.max(BPM_pred_lwage$fit)
	ci[top_y, ]

	# Show several plots side by side: split the device into 1 row x 3 columns
	# par() returns the previous settings; keep them so the layout can be undone
	# later with par(old_par)
	old_par <- par(mfrow = c(1, 3))

	# Model fitting and prediction with bas.lm from the BAS library
	# Regress lwage on every other column except wage, after dropping incomplete rows
	lm.wage <- bas.lm(
	  formula = lwage ~ . - wage,
	  data = na.omit(wage),
	  prior = "BIC",
	  modelprior = uniform()
	)
	lm.wage
	summary(lm.wage)
	# Predict from the model fitted above; the original referred to `bma_lwage`,
	# which this file never defines
	# estimator: "BPM" = best predictive model; alternatives are "MPM" (median
	# probability model) and "HPM" (highest probability model)
	BPM_pred_lwage <- predict(lm.wage, estimator = "BPM", se.fit = TRUE)
	# Names of the variables in the selected model
	# NOTE(review): the +1 presumably converts 0-based bestmodel indices to
	# positions in namesx -- confirm against the BAS documentation
	lm.wage$namesx[BPM_pred_lwage$bestmodel + 1]

	# Print a data frame in an easier-to-read layout by transposing it: t(<data frame>)
	# The original placeholder text was not valid R and kept the file from parsing;
	# the built-in iris data stands in for your own data frame here
	t(head(iris))

	# Find the feature row that corresponds to the largest element
	# (the same argmax-then-index trick works for numpy arrays in Python)
	X <- na.omit(wage) # data frame containing the features
	y <- BPM_pred_lwage$fit # predicted values of y for the training features
	top_y <- which.max(y) # position of the largest prediction
	t(X[top_y, ]) # transposed for readability

	# Handy dplyr verbs
	# filter(): keep the rows that satisfy a condition on a column
	df.diamonds_ideal <- diamonds %>% filter(cut == "Ideal")
	# select(): keep a chosen set of columns
	df.diamonds_ideal <- df.diamonds_ideal %>%
	  select(carat, cut, color, price, clarity)
	# mutate(): add a derived column
	df.diamonds_ideal <- df.diamonds_ideal %>%
	  mutate(price_per_carat = price / carat)
	# arrange(): sort by a column, ascending
	df.disordered_data %>% arrange(num_var)
	# arrange() with desc(): sort from largest to smallest
	df.disordered_data %>% arrange(desc(num_var))
	# Chaining: each step's output is passed on as the next step's input
	df.diamonds_ideal_chained <- diamonds %>%
	  filter(cut == "Ideal") %>%
	  select(carat, cut, color, price, clarity) %>%
	  mutate(price_per_carat = price / carat)

	# Pair plot, the counterpart of seaborn's pairplot
	library(ggplot2)
	library(GGally)
	# Aesthetics are passed through `mapping = aes(...)`; supplying colour/alpha as
	# top-level ggpairs() arguments is not accepted as aesthetics by current GGally
	ggpairs(iris, mapping = aes(colour = Species, alpha = 0.4))