# Source: GitHub gist BroVic/b175746d6452c5a016bf62c2ea979ad3
# Created: August 3, 2019 22:31
# NOTE (from the GitHub page): this file contains bidirectional Unicode text
# that may be interpreted or compiled differently than what appears below. To
# review, open the file in an editor that reveals hidden Unicode characters.
# titanic.R
## An R script for a simple exploration of the Titanic dataset
## In RGui, press CTRL + R to run the current line (or selection) of a script
## Download the dataset into the working directory before you start
# Load and inspect the data ----

# Show the current working directory; 'train.csv' is read relative to it.
getwd()
# If you need to change it, use 'setwd()'.

# List the files in the working directory. 'train.csv' ought to be there.
list.files()

# Stop early with a clear message if the dataset is missing.
if (!file.exists("train.csv")) {
  stop("'train.csv' was not found in ", getwd(), call. = FALSE)
}

# Import the data in train.csv into R as a data frame.
df <- read.csv("train.csv")

# Check out the data: its dimensions (rows, columns) and its structure.
dim(df)
str(df)

# Check the variable (column) names.
colnames(df)
# Examine the variable 'Embarked' by printing every value in the column.
df$Embarked

# Make a frequency table: the count of each distinct value.
table(df$Embarked)
# Summary statistics for 'Age' ----

# Create a variable holding the 'Age' column.
age <- df$Age

# Check the objects in the workspace; 'age' is now among them.
ls()

# Compute the arithmetic mean of the ages.
mean(age)

# Something is wrong! Check whether there are missing values, i.e. NAs.
anyNA(age)

# Call mean() again, this time removing the NAs.
mean(age, na.rm = TRUE)

# Get the minimum, maximum, range, and Tukey's five-number summary.
min(age)
max(age)
range(age)
fivenum(age)
## When NAs are present, min(), max() and range() return NA. fivenum() does
## not, because its 'na.rm' argument already defaults to TRUE.

# Check the function signature, i.e. its arguments.
args(min) # This also has the 'na.rm' argument used in 'mean' earlier.
min(age, na.rm = TRUE)
args(range)
range(age, na.rm = TRUE)

# Frequency of each distinct age, then the median with NAs removed.
table(age)
median(age, na.rm = TRUE)
# Work on the 'Sex' variable ----

# Convert explicitly to a factor. Since R 4.0.0, read.csv() keeps strings as
# character vectors by default, and plot() needs a factor to draw the bar
# chart and box plot below. On older R versions the column is already a
# factor and this call leaves it effectively unchanged.
sex <- factor(df$Sex)
ls()
table(sex)
anyNA(sex)

# Draw plots
plot(age) # By default a scatterplot is drawn with numerical data
plot(sex) # By default, a bar chart is drawn with categorical data
plot(sex, age) # By default the 2 variables will give a box-and-whiskers plot

# Now, customise the plot, step-by-step for easy understanding.
# With two colours, they are applied to the boxes in factor-level order.
plot(sex, age, col = "red")
plot(sex, age, col = "red", main = "Plot of sex vs. age")
plot(sex, age, col = c("pink", "blue"), main = "Plot of sex vs. age")
plot(
  sex,
  age,
  col = c("pink", "blue"),
  main = "Plot of sex vs. age",
  xlab = "Sex",
  ylab = "Age"
)
# Draw a histogram
hist(age)

# This is the same chart but now customized.
# Note the styling used to accommodate the many arguments!
# NOTE(review): 'ylim' caps the y-axis at 100, so any bar taller than that
# would be cut off -- confirm against the actual bin counts.
hist(
  age,
  col = "blue",
  main = "Distribution of Passengers' Age",
  ylab = "No. of Passengers",
  xlab = "Age",
  ylim = c(0, 100)
)
# (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)