Created
May 3, 2015 07:24
-
-
Save gowrishankarin/0d57bee0892a33e78346 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Course Project | |
# Coursera Getting and Cleaning Data | |
# Johns Hopkins University | |
# Author: Ben McKibben | |
# 2015-04-24 | |
# https://github.com/bmckibben/Tidy_Data_Project/blob/master/run_analysis.R | |
# Requirements | |
# | |
# 1. Merges the training and the test sets to create one data set. | |
# 2. Extracts only the measurements on the mean and standard deviation for each measurement. | |
# 3. Uses descriptive activity names to name the activities in the data set | |
# 4. Appropriately labels the data set with descriptive variable names.
# 5. Creates a second, independent tidy data set with the average of each variable for each activity and each subject. | |
#============================================================================= | |
# STEP ZERO: Prepare environment | |
#============================================================================= | |
# Make sure every package the script relies on is installed, then attach it.
# Packages are always attached in the order listed, so the search path does
# not depend on which packages happened to be installed already: reshape2 is
# attached after data.table, which keeps reshape2's melt()/dcast() ahead of
# the same-named data.table functions.
# library() is used for attaching so that a package that still cannot be
# loaded after the install attempt stops the script with an error.
required_packages <- c("data.table", "reshape2", "plyr")
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
#============================================================================= | |
# STEP ONE: Merges the training and the test sets to create one data set. | |
#============================================================================= | |
# Read one split ("test" or "train") of the UCI HAR data and return it as a
# single data frame.
#
# Arguments:
#   split    - name of the split; used as the sub-folder name, as the file
#              name suffix and as the value of the "set" column.
#   data_dir - folder that contains the unzipped data set.
#
# Returns a data frame whose columns are, in order: the subject ids, a "set"
# column holding `split`, the activity codes, and all measurement columns.
# cbind() is used (not merge) so the rows keep their original file order.
#
# The measurement file is named "X_<split>.txt" with an upper-case X, while
# the activity file is "y_<split>.txt" with a lower-case y; the exact case
# matters on case-sensitive file systems.
read_har_split <- function(split, data_dir = "./UCI HAR Dataset") {
  split_file <- function(prefix) {
    file.path(data_dir, split, paste0(prefix, "_", split, ".txt"))
  }
  subject <- read.table(split_file("subject"))
  measurements <- read.table(split_file("X"))
  activity <- read.table(split_file("y"))
  cbind(subject, set = split, activity, measurements)
}

# Stack the test rows on top of the train rows to get the full data set.
# All intermediate tables live inside read_har_split(), so there is nothing
# left to remove from the workspace afterwards.
c3pd <- rbind(read_har_split("test"), read_har_split("train"))
#============================================================================= | |
# STEP TWO: Extracts only the measurements on the mean and standard deviation | |
# for each measurement. | |
#============================================================================= | |
# Name the columns: the three identifier columns come first, followed by the
# feature names taken from the second column of features.txt.
idCols <- c("subject", "set", "activity")
featureNames <- read.table(
  "./UCI HAR Dataset/features.txt",
  stringsAsFactors = FALSE
)[[2]]
names(c3pd) <- c(idCols, featureNames)

# Keep the identifier columns plus every column whose name contains
# "-mean()" or "-std()"; all other measurement columns are discarded.
colNames <- names(c3pd)
keepFeature <- grepl("-(mean|std)\\(\\)", colNames)
selectedCols <- c(idCols, colNames[keepFeature])
c3pd <- c3pd[, selectedCols]
#============================================================================= | |
# STEP THREE: Uses descriptive activity names to name the activities in the data set | |
#============================================================================= | |
# Replace the numeric codes in the activity column with their text
# equivalents. The position of a label in the vector is the code it replaces
# (1 = "walking" ... 6 = "laying"); the mapping is hard-coded here.
activityLabels <- c(
  "walking",
  "walking upstairs",
  "walking downstairs",
  "sitting",
  "standing",
  "laying"
)
for (code in seq_along(activityLabels)) {
  c3pd$activity[c3pd$activity == code] <- activityLabels[code]
}
#============================================================================= | |
# STEP FOUR: Appropriately labels the data set with descriptive variable names.
#============================================================================= | |
# Translate the raw feature names into readable variable names.
# Each rule is c(pattern, replacement); the rules are applied in order to
# every column name:
#   - a leading "t" becomes "time", a leading "f" becomes "freq"
#   - "mean()", "std()" and "meanFreq()" become "Mean", "Std" and "MeanFreq";
#     a dash directly before or after them is removed as well
#   - the doubled token "BodyBody" is collapsed to "Body"
# "[(][)]" matches the literal "()" without needing backslash escapes.
renameRules <- list(
  c("^t", "time"),
  c("^f", "freq"),
  c("-?mean[(][)]-?", "Mean"),
  c("-?std[(][)]-?", "Std"),
  c("-?meanFreq[(][)]-?", "MeanFreq"),
  c("BodyBody", "Body")
)
colNames <- names(c3pd)
for (rule in renameRules) {
  colNames <- gsub(pattern = rule[1], replacement = rule[2], x = colNames)
}
names(c3pd) <- colNames
#============================================================================= | |
# STEP FIVE: Creates a second, independent tidy data set with the average of | |
# each variable for each activity and each subject. | |
#============================================================================= | |
# Build the second data set: one row per subject/activity pair holding the
# mean of every remaining measurement column, then write it to
# ./tidy_data.txt.
# melt() and dcast() are called through reshape2:: because data.table, which
# this script also attaches, exports functions with the same names.
id_labels <- c("subject", "set", "activity")
data_labels <- setdiff(colnames(c3pd), id_labels)

# Long format: one row per identifier combination and measured variable.
melt_data <- reshape2::melt(
  c3pd,
  id.vars = id_labels,
  measure.vars = data_labels
)

# Back to wide format, averaging all values that share subject and activity.
# "set" is not part of the formula, so it does not appear in the result.
tidy_data <- reshape2::dcast(
  melt_data,
  subject + activity ~ variable,
  fun.aggregate = mean
)

write.table(tidy_data, file = "./tidy_data.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment