# normundsneimanis/prog.r

## prog.r
# R Programming
# Ex 2: R as a calculator
# Sum
5 + 5

# Difference
5 - 5

# Product
3 * 5

# Quotient (the parentheses make the addition happen first)
(5 + 5) / 2

# Power
2^5

# Remainder after division
28 %% 6


# Ex 4
# Store 42 in x
x <- 42

# Evaluating a name at top level prints its value
x


# Ex 5
# Number of apples and oranges
my_apples <- 5
my_oranges <- 6

# Print the combined count
my_apples + my_oranges

# Keep the combined count in my_fruit (assigned with `<-`, like every other
# assignment in this script, instead of `=`)
my_fruit <- my_apples + my_oranges


# Ex 6
# Set and print the number of apples
my_apples <- 5
my_apples

# Set and print the number of oranges
my_oranges <- 6
my_oranges

# Total amount of fruit, stored and printed
my_fruit <- my_apples + my_oranges
my_fruit


# Ex 7
# A numeric value
my_numeric <- 42

# A character value: the quotation marks make it a string
my_character <- "forty-two"

# A logical value
my_logical <- FALSE


# Ex 9
# One value of each basic type
var1 <- TRUE
var2 <- 0.3
var3 <- "i"

# logical -> character
var1_char <- as.character(var1)

# Check that var1_char really is a character vector
is.character(var1_char)

# numeric -> logical
var2_log <- as.logical(var2)

# Show the class of var2_log
class(var2_log)

# character -> numeric; "i" is not a number, so R yields NA with a warning
var3_num <- as.numeric(var3)


# Ex 10
# One vector per basic type
numeric_vector <- c(1, 10, 49)
character_vector <- c("a", "b", "c")
boolean_vector <- c(TRUE, FALSE, TRUE)

# First element of character_vector
character_vector[1]

# Element-wise comparison: TRUE wherever the value exceeds 10
larger_than_ten <- numeric_vector > 10
larger_than_ten

# Ex 13
# Rebuild the vector and its logical mask
numeric_vector <- c(1, 10, 49)
larger_than_ten <- numeric_vector > 10

# Keep only the elements flagged TRUE by the mask
numeric_vector[larger_than_ten]

# Ex 14
# Build a 5 x 4 matrix holding 1..20, filled row by row, and store it in m
# (assigned with `<-` for consistency with the rest of the script)
m <- matrix(1:20, byrow = TRUE, nrow = 5, ncol = 4)

# Print m
m


# Ex 15
# Character vector describing whether each person is a student
student_status <- c("student", "not student", "student", "not student")

# Convert student_status to a factor
categorical_student <- factor(student_status)

# Ex 17
# First six rows of mtcars
head(mtcars)

# Compact description of the columns of mtcars
str(mtcars)

# Number of rows and columns of mtcars
dim(mtcars)

# Ex 18
# Planet names
planets <- c(
  "Mercury", "Venus", "Earth", "Mars",
  "Jupiter", "Saturn", "Uranus", "Neptune"
)

# Planet type: four terrestrial planets followed by four gas giants
type <- rep(c("Terrestrial planet", "Gas giant"), each = 4)

# Diameters
diameter <- c(0.382, 0.949, 1, 0.532, 11.209, 9.449, 4.007, 3.883)

# Rotation values
rotation <- c(58.64, -243.02, 1, 1.03, 0.41, 0.43, -0.72, 0.67)

# Ring flags: FALSE for the first four planets, TRUE for the last four
rings <- rep(c(FALSE, TRUE), each = 4)

# Combine the vectors into planet_df; the column names come from the
# variable names passed in
planet_df <- data.frame(planets, type, diameter, rotation, rings)

# Row 1, columns 2 and 3
planet_df[1, 2:3]

# The whole diameter column (the third column)
planet_df$diameter

# Ex 20
# Integers 1 to 10
my_vector <- 1:10

# 3 x 3 matrix holding 1 to 9
my_matrix <- matrix(1:9, nrow = 3)

# First 10 rows of the built-in data frame mtcars
my_df <- mtcars[1:10, ]

# Bundle the three objects into one list
my_list <- list(my_vector, my_matrix, my_df)

# Print the list
my_list


# Ex 21
# Integers 1 to 10
my_vector <- 1:10

# 3 x 3 matrix holding 1 to 9
my_matrix <- matrix(1:9, nrow = 3)

# First 10 rows of the built-in data frame mtcars
my_df <- mtcars[1:10, ]

# Bundle the three objects into one list
my_list <- list(my_vector, my_matrix, my_df)

# Second element, selected with single brackets (the result is still a list)
my_list[2]

# First column of the third component of my_list
my_list[[3]][, 1]


###############################################################################
# Ex 1

# Open the documentation for mean()
help(mean)

# Show the arguments accepted by mean()
args(mean)

# Grades without missing values
grades <- c(8.5, 7, 9, 5.5, 6)

# Mean of grades, argument matched by name
mean(x = grades)

# Mean of grades, argument matched by position
mean(grades)

# Grades with one missing value
grades <- c(8.5, 7, 9, NA, 6)

# With the NA left in, the mean is NA
mean(grades)

# With the NA removed, the mean is taken over the remaining four grades
mean(grades, trim = 0, na.rm = TRUE)

# multiply_a_b: product of its two arguments.
#   a, b: the values to multiply; they are combined with `*`.
#   Returns a * b.
multiply_a_b <- function(a, b) {
  a * b
}

# Multiply 3 by 7 and keep the product in result
result <- multiply_a_b(3, 7)

##
# Read the comma-separated mtcars file from the course server into cars
cars <- read.csv(
  file = "http://s3.amazonaws.com/assets.datacamp.com/course/uva/mtcars.csv"
)

# Show the first 6 rows
head(cars)


##
# Read the semicolon-separated variant, naming the separator explicitly
cars <- read.csv(
  file = "http://s3.amazonaws.com/assets.datacamp.com/course/uva/mtcars_semicolon.csv",
  sep = ";"
)

# Show the first 6 rows
head(cars)

##
# Report the current working directory
getwd()

##
# Set the working directory to "." (the directory the session is already in)
setwd(".")

##
# List the files found in the working directory
list.files()

# Read the local semicolon-separated cars.csv into cars
cars <- read.csv(file = "cars.csv", sep = ";")

# Show the first 6 rows
head(cars)

## Ex 10
# Install ggplot2 only when it is missing, so re-running the script does not
# download and reinstall the package every time
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

# Attach ggplot2 with library(), which stops with an error when the package
# cannot be found
library(ggplot2)

# Attach ggplot2 with require(), which returns FALSE (with a warning) instead
# of stopping when the package cannot be found
require(ggplot2)

##
###############################################################################
# Show the structure (columns and their types) of mtcars
str(mtcars)


# Levels of the am column
# NOTE(review): levels() only returns values for a factor; in the built-in
# datasets::mtcars, am is numeric, so this presumably prints NULL unless the
# course environment supplied a version with am as a factor - confirm
levels(mtcars$am)


##

# Work on a copy so that mtcars itself is left untouched
mtcars2 <- mtcars

# Label every car "high" when its mpg is at least 20 and "low" otherwise
mtcars2$mpgcategory <- ifelse(mtcars$mpg >= 20, "high", "low")

# Factor version of the labels
mtcars2$mpgfactor <- as.factor(mtcars2$mpgcategory)


##
# Count the cars per value of am to see how many have a manual transmission
table(mtcars$am)
# Answer, entered as a plain number
13

##
# Store the frequency table of am in "height"
height <- table(mtcars$am)
# Draw one bar per value of am
barplot(height)

## Ex 8
# Bar heights: the frequency table of am
height <- table(mtcars$am)
# Labels for the two bars
barnames <- c("automatic", "manual")
# Same bar plot, now with bar labels and a y-axis title
barplot(height, names.arg = barnames, ylab = "number of cars")

##
# Histogram of the carb column of mtcars, titled "Carburetors"
hist(mtcars$carb, main = "Carburetors")

##
# Same histogram with the y axis fixed to 0-20, an x-axis label and red bars
hist(
  mtcars$carb,
  main = "Carburetors",
  ylim = c(0, 20),
  xlab = "Number of Carburetors",
  col = "red"
)

##
# Mean miles per gallon
mean(mtcars$mpg)
# Median miles per gallon
median(mtcars$mpg)


##
# Frequency table of carb, most frequent value first
sort(table(mtcars$carb), decreasing = TRUE)

##
# Smallest mpg
x <- min(mtcars$mpg)
# Largest mpg
y <- max(mtcars$mpg)
# Range of mpg: largest minus smallest
y - x

##
# Quartile answers, entered as plain numbers
# NOTE(review): these appear to be quartiles of mtcars$qsec - confirm
# Second quartile
17.71
# First quartile
16.8925


##
# Box plot of qsec
boxplot(mtcars$qsec)
# Interquartile range of qsec
IQR(mtcars$qsec)


##
# Outlier threshold answers, entered as plain numbers
# NOTE(review): presumably Q1 - 1.5 * IQR and Q3 + 1.5 * IQR of qsec - confirm
# Threshold below the first quartile
13.88125
# Threshold above the third quartile
21.91125


##
# Spread of horsepower: interquartile range, then standard deviation
IQR(mtcars$hp)
sd(mtcars$hp)
# Spread of miles per gallon: interquartile range, then standard deviation
IQR(mtcars$mpg)
sd(mtcars$mpg)


##
# z-scores of mpg: distance from the mean in units of standard deviation
(mtcars$mpg - mean(mtcars$mpg)) / sd(mtcars$mpg)


## Calculate Pearson's r
# Two paired variables
bars <- c(2, 4, 1.5, 2, 3)
happiness <- c(7, 3, 8, 8, 6)

# Standardize each variable: subtract its mean and divide by its standard
# deviation
ZBars <- (bars - mean(bars)) / sd(bars)
ZHappiness <- (happiness - mean(happiness)) / sd(happiness)

# r is the sum of the z-score products divided by n - 1. The divisor is
# derived from the data instead of being hard-coded as 4, so the formula stays
# correct if observations are added or removed.
r <- sum(ZBars * ZHappiness) / (length(bars) - 1)
r

##
# Plot height and weight of the "women" dataset. Make the title "Heights and Weights"
# weight is passed first, so it is drawn on the x axis and height on the y axis
plot(women$weight, women$height, main = "Heights and Weights")

##
# Make a contingency table of tobacco consumption and education
# NOTE(review): `smoking` is not created anywhere in this file; presumably it
# was preloaded by the course environment - confirm before running
table(smoking$tobacco, smoking$student)

##
# Calculate the correlation between var1 and var2
# NOTE(review): the only var1/var2 assigned in this file are the scalars TRUE
# and 0.3 from Ex 9; presumably the exercise supplied its own numeric vectors -
# confirm before running
cor(var1, var2)

##
# Predictions of line 1: the values 1 to 10
y1 <- as.numeric(1:10)

# Predictions of line 2: the values 2 to 11
y2 <- as.numeric(2:11)

# Observed values of y
y <- c(3, 2, 1, 4, 5, 10, 8, 7, 6, 9)

# Sum of squared errors of line 1
sum((y1 - y)^2)

# Sum of squared errors of line 2
sum((y2 - y)^2)

##
# Data: money (1 to 10) and the matching prosocial scores
money <- as.numeric(1:10)
prosocial <- c(3, 2, 1, 4, 5, 10, 8, 7, 6, 9)
# Fit prosocial on money and print the regression coefficients
lm(prosocial ~ money)

##
# Scatterplot of prosocial against money
plot(money, prosocial, xlab = "Money", ylab = "Prosocial Behavior")
# Keep the fitted model in "line"
line <- lm(prosocial ~ money)
# Draw the fitted regression line on the plot
abline(line)

##
# Scatterplot again
plot(money, prosocial, xlab = "Money", ylab = "Prosocial Behavior")
# Fitted regression line
line <- lm(prosocial ~ money)
abline(line)
# Flat line (slope 0) at the mean of the dependent variable
abline(a = mean(prosocial), b = 0)

##
# R squared: the squared correlation between prosocial and money
cor(prosocial, money)^2


##
# Data: 17 paired observations of money and education
money <- c(
  4, 3, 2, 2, 8, 1, 1, 2, 3,
  4, 5, 6, 7, 9, 9, 8, 12
)
education <- c(
  3, 4, 6, 9, 3, 3, 1, 2, 1,
  4, 5, 7, 10, 8, 7, 6, 9
)

# Correlation between the two variables
cor(money, education)

# Regress money on education and keep the fitted model in "line"
line <- lm(money ~ education)

# Print the regression coefficients
line

# Scatterplot with education on the x axis and money on the y axis
plot(education, money, main = "My Scatterplot")

# Overlay the fitted regression line
abline(line)

###############################################################################
# Sample space: the possible three-letter sequences of A and E
# NOTE(review): "EEE" is absent here, while the probabilities below multiply
# fixed per-draw chances of 3/5 (A) and 2/5 (E) - confirm which sampling
# scheme the exercise intended
samplespace <- c("AAA", "AAE", "AEA", "AEE", "EAA", "EAE", "EEA")

# Probability of the sequence AAE
aae <- (3 / 5) * (3 / 5) * (2 / 5)
# Probability of the sequence EAE
eae <- (2 / 5) * (3 / 5) * (2 / 5)
# Probability of AAA or EEA: the two sequence probabilities added together
aaaeea <- ((3 / 5) * (3 / 5) * (3 / 5)) + ((2 / 5) * (2 / 5) * (3 / 5))

################################################################################
# Probability distribution: outcomes 0 to 5 with their probabilities
data <- data.frame(outcome = 0:5, probs = c(0.1, 0.2, 0.3, 0.2, 0.1, 0.1))

# Bar plot of the probability distribution. The bar labels are passed as
# names.arg, spelled out in full instead of relying on partial matching of
# the abbreviation `names`.
barplot(height = data$probs, names.arg = data$outcome)

##
# Simulate 10000 standard-normal draws with a fixed seed.
# Note that this replaces the data frame stored in `data` above.
set.seed(11225)
data <- rnorm(10000)

# Open the documentation of dnorm()
help(dnorm)

# Standard-normal density evaluated at every simulated value
density <- dnorm(data)

# Plot the density values (y) against the simulated values (x)
plot(x = data, y = density)

##
# Probability distribution used by the calculations below. It is rebuilt here
# because `data` is overwritten earlier in this script (data <- rnorm(10000)),
# after which data$outcome and data$probs would fail.
data <- data.frame(outcome = 0:5, probs = c(0.1, 0.2, 0.3, 0.2, 0.1, 0.1))

# Probability that x is smaller than or equal to two
prob <- (0.1 + 0.2 + 0.3)

# Probability that x is 0, smaller than or equal to one, smaller than or equal
# to two, and smaller than or equal to three
cumsum(c(0.1, 0.2, 0.3, 0.2))

# Expected value: every outcome weighted by its probability
expected_score <- sum(data$outcome * data$probs)

# Print expected_score
expected_score

##
# Mean of the probability mass function
expected_score <- sum(data$outcome * data$probs)

# Variance: probability-weighted squared distance from the mean
variance <- sum((data$outcome - expected_score)^2 * data$probs)

# Standard deviation: square root of the variance
std <- sqrt(variance)

##
# Probability of a value below 20 under a normal distribution with mean 25
# and sd 5 (hair length in centimeters), rounded to two decimals
round(pnorm(q = 20, mean = 25, sd = 5), 2)

##
# 85th percentile of the same distribution, rounded to two decimals
round(qnorm(p = 0.85, mean = 25, sd = 5), 2)

##
# z value of 38 for mean 25 and sd 5
z_value <- (38 - 25) / 5

# Mean of a binomial with 25 trials and success probability 0.2
mean_chance <- 25 * 0.2

# Standard deviation of that binomial: sqrt(n * p * (1 - p))
std_chance <- sqrt(mean_chance * (1 - 0.2))

##
# Probability of exactly 5 correct answers out of 25
five_correct <- dbinom(x = 5, size = 25, prob = 0.2)

# Probability of at least 5 correct answers: the upper tail above 4
atleast_five_correct <- pbinom(q = 4, size = 25, prob = 0.2, lower.tail = FALSE)

##
# 60th percentile of the number of correct answers
qbinom(p = 0.60, size = 25, prob = 0.2)

###############################################################################
# Draw one sample of 100 values from scandinavia_data with a fixed seed
# NOTE(review): scandinavia_data is not created in this file; presumably it
# was preloaded by the course environment - confirm
set.seed(11225)
first_sample <- sample(x = scandinavia_data, size = 100)

# Average of the sample
mean(first_sample)

##
# Start from an empty vector
new_number <- NULL

# Repeat ten times: store i at position i of new_number.
# Once the loop has finished, new_number is printed.
for (i in seq_len(10)) {
  new_number[i] <- i
}

print(new_number)

##
# Fix the seed so the same samples are drawn as in the solution code
set.seed(11225)

# Preallocate room for the 500 sample means instead of growing the vector
# inside the loop
sample_means <- numeric(500)

# Draw 500 samples of size 200 from scandinavia_data and record each mean
# NOTE(review): scandinavia_data is not created in this file; presumably it
# was preloaded by the course environment - confirm
for (i in seq_len(500)) {
  samp <- sample(scandinavia_data, 200)
  sample_means[i] <- mean(samp)
}

# Mean of scandinavia_data
mean(scandinavia_data)

# Mean of the sample means
mean(sample_means)

##
# Standard deviation of scandinavia_data, stored and printed
population_sd <- sd(scandinavia_data)
print(population_sd)

# Standard deviation of the sampling distribution for samples of size 200:
# the population sd divided by sqrt(200)
sampling_sd <- population_sd / sqrt(200)
print(sampling_sd)

##
# Preallocate room for the 1000 sample means instead of growing the vector
# inside the loop
sample_means <- numeric(1000)

# Draw 1000 samples of size 200 from household_income and record each mean
# NOTE(review): household_income is not created in this file; presumably it
# was preloaded by the course environment - confirm
for (i in seq_len(1000)) {
  samp <- sample(household_income, 200)
  sample_means[i] <- mean(samp)
}

# Histogram of household_income
hist(household_income)

# Histogram of the sample means
hist(sample_means)

##
# z score of an observation of 32 (beard length in millimeters) given a mean
# of 25 and a standard deviation of 3.47
z_score <- (32 - 25) / 3.47

# Print z_score
z_score

##
# Area under the standard normal curve to the left of 2.02
# (the lower tail is the default)
pnorm(2.02)

# Area under the curve to the right of 2.02
pnorm(2.02, lower.tail = FALSE)

##
# Mean of scandinavia_data
# NOTE(review): scandinavia_data is not created in this file; presumably it
# was preloaded by the course environment - confirm
population_mean <- mean(scandinavia_data)

# Standard deviation of scandinavia_data
population_sd <- sd(scandinavia_data)

# Standard deviation of the sampling distribution for samples of size 50
sampling_sd <- population_sd / sqrt(50)

# z score of a sample mean of 26
z_score <- (26 - population_mean) / sampling_sd

# Upper-tail probability for that z score (lower.tail must be FALSE)
pnorm(q = z_score, lower.tail = FALSE)

##
# Sample proportion
proportion_hipsters <- 0.10

# Standard deviation of the sampling distribution of a proportion for
# n = 200: sqrt(p * (1 - p) / n)
sample_sd <- sqrt((proportion_hipsters * (1 - proportion_hipsters)) / 200)

##
# Same standard deviation, written out with the literal proportion
sample_sd <- sqrt((0.10 * (1 - 0.10)) / 200)

# Upper-tail probability of 0.13 under a normal distribution with mean 0.10
# and that standard deviation
pnorm(q = 0.13, mean = 0.10, sd = sample_sd, lower.tail = FALSE)

##


###############################################################################
# Cut-off: the 0.95 quantile of a normal distribution with mean 25 and
# standard error 3.5 / sqrt(40)
# accepted solutions that leave the quantile unrounded
cut_off_3 <- qnorm(0.95, mean = 25, sd = round(3.5 / sqrt(40), 2))
cut_off_4 <- qnorm(0.95, mean = 25, sd = 3.5 / sqrt(40))
# the preferred option: rounded standard error and rounded quantile
cut_off <- round(cut_off_3, 2)
# accepted solution: exact standard error, rounded quantile
cut_off_2 <- round(cut_off_4, 2)

# Print cut_off
cut_off

##
# Lower cut-off: the 0.025 quantile of a normal distribution with mean 25 and
# standard error 3.5 / sqrt(40)
# accepted solutions that leave the quantile unrounded
lower_cut_off2 <- qnorm(0.025, mean = 25, sd = round(3.5 / sqrt(40), 2))
lower_cut_off4 <- qnorm(0.025, mean = 25, sd = 3.5 / sqrt(40))
# the preferred option: rounded standard error and rounded quantile
lower_cut_off <- round(lower_cut_off2, 2)
# accepted solution: exact standard error, rounded quantile
lower_cut_off3 <- round(lower_cut_off4, 2)

# Upper cut-off: the 0.975 quantile of the same distribution
# accepted solutions that leave the quantile unrounded
upper_cut_off2 <- qnorm(0.975, mean = 25, sd = round(3.5 / sqrt(40), 2))
upper_cut_off4 <- qnorm(0.975, mean = 25, sd = 3.5 / sqrt(40))
# the preferred option: rounded standard error and rounded quantile
upper_cut_off <- round(upper_cut_off2, 2)
# accepted solution: exact standard error, rounded quantile
upper_cut_off3 <- round(upper_cut_off4, 2)

# Print lower_cut_off
lower_cut_off

# Print upper_cut_off
upper_cut_off

##
# z score of 25.95 against a mean of 25 with standard error 3.5 / sqrt(40)
# accepted solutions that leave the z score unrounded
z_value2 <- (25.95 - 25) / round(3.5 / sqrt(40), 2)
z_value4 <- (25.95 - 25) / (3.5 / sqrt(40))
# the preferred option: rounded standard error and rounded z score
z_value <- round(z_value2, 2)
# accepted solution: exact standard error, rounded z score
z_value3 <- round(z_value4, 2)

# p value: upper-tail probability of z_value
# accepted solution: unrounded
p_value2 <- pnorm(z_value, lower.tail = FALSE)
# the preferred option: rounded to two decimals
p_value <- round(p_value2, 2)

# Print p_value
p_value

##
# Same z score as above
# accepted solutions that leave the z score unrounded
z_value2 <- (25.95 - 25) / round(3.5 / sqrt(40), 2)
z_value4 <- (25.95 - 25) / (3.5 / sqrt(40))
# the preferred option: rounded standard error and rounded z score
z_value <- round(z_value2, 2)
# accepted solution: exact standard error, rounded z score
z_value3 <- round(z_value4, 2)

# p value: the upper-tail probability of z_value, doubled
# accepted solution: unrounded
p_value2 <- pnorm(z_value, lower.tail = FALSE) * 2
# the preferred option: doubled first, then rounded
p_value <- round(p_value2, 2)
# accepted solution: rounded first, then doubled
p_value3 <- round(pnorm(z_value, lower.tail = FALSE), 2) * 2

# Print p_value
p_value

##
# Probability of answering 12 or more of 25 questions correctly when the
# student is merely guessing (success probability 0.20): the upper tail
# above 11
# accepted solution: unrounded
p_value2 <- pbinom(11, size = 25, prob = 0.20, lower.tail = FALSE)
# the preferred option: rounded to two decimals
p_value <- round(p_value2, 2)

# Print the rounded probability
p_value

# Either "accepted" or "rejected"
conclusion <- "rejected"


##
# Mean, stored in average
average <- 0.20

# Standard error for a proportion of 0.20 with n = 25
# accepted solution: unrounded
se2 <- sqrt((0.20 * 0.80) / 25)
# the preferred option: rounded to two decimals
se <- round(se2, 2)

# z value of the observed proportion 12 / 25 against 0.2
# accepted solutions that leave the z value unrounded
z_value2 <- (((12 / 25) - 0.2) / se)
z_value4 <- (((12 / 25) - 0.2) / se2)
# the preferred option: rounded se, rounded z value
z_value <- round(z_value2, 2)
# accepted solution: exact se, rounded z value
z_value3 <- round(z_value4, 2)

# p value: upper-tail probability, once per z value variant; each unrounded
# value is computed first and its rounded counterpart derived from it
p_value2 <- pnorm(z_value, lower.tail = FALSE)
# the preferred option
p_value <- round(p_value2, 2)
# other accepted solutions
p_value4 <- pnorm(z_value2, lower.tail = FALSE)
p_value3 <- round(p_value4, 2)
p_value6 <- pnorm(z_value3, lower.tail = FALSE)
p_value5 <- round(p_value6, 2)
p_value8 <- pnorm(z_value4, lower.tail = FALSE)
p_value7 <- round(p_value8, 2)

# Print p_value
p_value

##
# Critical cut-off value: the 0.95 quantile of a t distribution with 49
# degrees of freedom
# accepted solution: unrounded
cut_off2 <- qt(0.95, df = 49)
# the preferred option: rounded to two decimals
cut_off <- round(cut_off2, 2)

# Print cut_off
cut_off

##
# Standard error: 5 / sqrt(50)
# accepted solution: unrounded
se2 <- 5 / sqrt(50)
# the preferred option: rounded to two decimals
se <- round(se2, 2)

# t value of 186.5 against 185
# accepted solutions that leave the t value unrounded
t_value2 <- (186.5 - 185) / se
t_value4 <- (186.5 - 185) / se2
# the preferred option: rounded se, rounded t value
t_value <- round(t_value2, 2)
# accepted solution: exact se, rounded t value
t_value3 <- round(t_value4, 2)

# p value: upper-tail probability with 49 degrees of freedom, once per
# t value variant; each unrounded value is computed first and its rounded
# counterpart derived from it
p_value2 <- pt(t_value, df = 49, lower.tail = FALSE)
# the preferred option
p_value <- round(p_value2, 2)
# other accepted solutions
p_value4 <- pt(t_value2, df = 49, lower.tail = FALSE)
p_value3 <- round(p_value4, 2)
p_value6 <- pt(t_value3, df = 49, lower.tail = FALSE)
p_value5 <- round(p_value6, 2)
p_value8 <- pt(t_value4, df = 49, lower.tail = FALSE)
p_value7 <- round(p_value8, 2)

# Print p_value
p_value

##
# t value: the 0.975 quantile of a t distribution with 49 degrees of freedom
# accepted solution: unrounded
t_value2 <- qt(0.975, df = 49)
# the preferred option: rounded to two decimals
t_value <- round(t_value2, 2)

# 95% confidence interval as a two-element vector (lower, upper): 186.5 plus
# and minus the t value times 0.71
# accepted solutions that leave the bounds unrounded
conf_interval2 <- 186.5 + c(-1, 1) * t_value * 0.71
conf_interval3 <- 186.5 + c(-1, 1) * t_value2 * 0.71
# the preferred option: rounded t value, rounded bounds
conf_interval <- round(conf_interval2, 2)
# accepted solution: exact t value, rounded bounds
conf_interval4 <- round(conf_interval3, 2)

# Print conf_interval
conf_interval