tehp/mw.R

## mw.R
library("ggplot2")
library("depmixS4")
library("lubridate")
library("chron")
library("dplyr")
library("corrplot")
getwd()
df <- read.table("TrainData.txt", header = T, sep = ",")
df$Date <- as.Date(df$Date, format="%d/%m/%Y")
df$Time <- format(as.POSIXlt(strptime(df$Time, "%H:%M:%S"), format="%H:%M:%S"), "%H:%M:%S")
#Data exploration
correlationMatrix <- cor(df[3:9], method="pearson", use = "complete.obs")
corrplot(correlationMatrix, type = "upper")
#Determine a single observation time window during a weekday and a weekend day
#that shows a clearly recognizable electricity consumption pattern over a time period of several hours
weekday_window_data <- df[wday(df$Date) == 2, ]
weekend_window_data <- df[wday(df$Date) == 7, ]

### monday and saturday subsets for 7 to 9 am ###
monday <- subset(df, wday(df$Date) == 2)
saturday <- subset(df, wday(df$Date) == 7)
monday <- monday[complete.cases(monday),]
saturday <- saturday[complete.cases(saturday),]
mondayW <- subset(monday, (monday$Time >= '07:00:00' & monday$Time <= "09:00:00"))
saturdayW <- subset(saturday, (saturday$Time >= '07:00:00' & saturday$Time <= "09:00:00"))

get_ntimes <- function(training) {
  counts <- table(training$Date)
  as.vector(counts)
}
training_weekday <- mondayW[mondayW$Date <= "2008-12-31",]
testing_weekday <- setdiff(mondayW, training_weekday)
training_weekend <- saturdayW[saturdayW$Date <= "2008-12-31",]
testing_weekend <- setdiff(saturdayW, training_weekend)
weekday_ntimes = get_ntimes(training_weekday)
weekend_ntimes = get_ntimes(training_weekend)

# Rolling Average
# Moving window week
mw = df[df$Date >= "2009-11-01" & df$Date <= "2009-11-07",]

# MW days
mw_sat <- subset(mw, wday(mw$Date) == 2)
mw_mon <- subset(mw, wday(mw$Date) == 7)

# MW time
mw_sat_window <- mw_sat[mw_sat$Time >= '07:00:00' & mw_sat$Time <= "09:00:00",]
mw_mon_window <- mw_mon[mw_mon$Time >= '07:00:00' & mw_mon$Time <= "09:00:00",]

# Plot the unsmoothed data (gray)
x <- (strptime(mw_sat_window$Time, '%H:%M:%S'))
y <- mw_sat_window$Global_active_power
plot(x, y, type="l", col=grey(.5), main="Global_active_power MA from 7am-9am, 2009-11-02",sub = "(2 sided MA, 10 levels)", xlab="Time", ylab="Global_active_power",)

f20 <- rep(1/10,10)
f20

y_sym <- stats::filter(y, f20, sides=2)
lines(x, y_sym, col="blue")

legend("topleft", inset=.01,
       c("Moving Average","Original Data"), fill=c("blue","grey"), horiz=FALSE)

legend("topright", inset=.01,
       c("Minor Anomaly","Major Anomaly", "Extreme Anomaly"), fill=c("yellow","orange","red"), horiz=FALSE)


stddev <- sd(y_sym, na.rm = TRUE)
threshold_minor <- 1*stddev
threshold_major <- 2*stddev
threshold_extreme <- 4*stddev

minor_occurances <- 0
major_occurances <- 0
extreme_occurances <- 0

for(i in 1:length(y_sym)) {
  if (is.na(y_sym[i])) {
    # print("na")
  } else {
    difference <- abs(y[i] - y_sym[i])
    if (difference >= threshold_minor) {
      if (difference >= threshold_major) {
        if (difference >= threshold_extreme) {
          points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "red", pch = 19)
          extreme_occurances = extreme_occurances + 1;
          next
        }
        points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "orange", pch = 19)
        major_occurances = major_occurances + 1;
        next
      }
      points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "yellow", pch = 19)
      minor_occurances = minor_occurances + 1;
      next
    }
    next
  }
}

print(minor_occurances)
print(major_occurances)
print(extreme_occurances)

# =======

# Plot the unsmoothed data (gray)
x <- (strptime(mw_mon_window$Time, '%H:%M:%S'))
y <- mw_mon_window$Global_active_power
plot(x, y, type="l", col=grey(.5), main="Global_active_power MA from 7am-9am, 2009-11-07",sub = "(2 sided MA, 10 levels)", xlab="Time", ylab="Global_active_power",)

f20 <- rep(1/10,10)
f20

y_sym <- stats::filter(y, f20, sides=2)
lines(x, y_sym, col="blue")

legend("topleft", inset=.01,
       c("Moving Average","Original Data"), fill=c("blue","grey"), horiz=FALSE)

legend("topright", inset=.01,
       c("Minor Anomaly","Major Anomaly", "Extreme Anomaly"), fill=c("yellow","orange","red"), horiz=FALSE)


stddev <- sd(y_sym, na.rm = TRUE)
threshold_minor <- 1*stddev
threshold_major <- 2*stddev
threshold_extreme <- 4*stddev

minor_occurances <- 0
major_occurances <- 0
extreme_occurances <- 0

for(i in 1:length(y_sym)) {
  if (is.na(y_sym[i])) {
    # print("na")
  } else {
    difference <- abs(y[i] - y_sym[i])
    if (difference >= threshold_minor) {
      if (difference >= threshold_major) {
        if (difference >= threshold_extreme) {
          points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "red", pch = 19)
          extreme_occurances = extreme_occurances + 1;
          next
        }
        points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "orange", pch = 19)
        major_occurances = major_occurances + 1;
        next
      }
      points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "yellow", pch = 19)
      minor_occurances = minor_occurances + 1;
      next
    }
    next
  }
}

print(minor_occurances)
print(major_occurances)
print(extreme_occurances)
	library("ggplot2")
	library("depmixS4")
	library("lubridate")
	library("chron")
	library("dplyr")
	library("corrplot")
	getwd()
	df <- read.table("TrainData.txt", header = T, sep = ",")
	df$Date <- as.Date(df$Date, format="%d/%m/%Y")
	df$Time <- format(as.POSIXlt(strptime(df$Time, "%H:%M:%S"), format="%H:%M:%S"), "%H:%M:%S")
	#Data exploration
	correlationMatrix <- cor(df[3:9], method="pearson", use = "complete.obs")
	corrplot(correlationMatrix, type = "upper")
	#Determine a single observation time window during a weekday and a weekend day
	#that shows a clearly recognizable electricity consumption pattern over a time period of several hours
	weekday_window_data <- df[wday(df$Date) == 2, ]
	weekend_window_data <- df[wday(df$Date) == 7, ]

	### monday and saturday subsets for 7 to 9 am ###
	monday <- subset(df, wday(df$Date) == 2)
	saturday <- subset(df, wday(df$Date) == 7)
	monday <- monday[complete.cases(monday),]
	saturday <- saturday[complete.cases(saturday),]
	mondayW <- subset(monday, (monday$Time >= '07:00:00' & monday$Time <= "09:00:00"))
	saturdayW <- subset(saturday, (saturday$Time >= '07:00:00' & saturday$Time <= "09:00:00"))

	get_ntimes <- function(training) {
	counts <- table(training$Date)
	as.vector(counts)
	}
	training_weekday <- mondayW[mondayW$Date <= "2008-12-31",]
	testing_weekday <- setdiff(mondayW, training_weekday)
	training_weekend <- saturdayW[saturdayW$Date <= "2008-12-31",]
	testing_weekend <- setdiff(saturdayW, training_weekend)
	weekday_ntimes = get_ntimes(training_weekday)
	weekend_ntimes = get_ntimes(training_weekend)

	# Rolling Average
	# Moving window week
	mw = df[df$Date >= "2009-11-01" & df$Date <= "2009-11-07",]

	# MW days
	mw_sat <- subset(mw, wday(mw$Date) == 2)
	mw_mon <- subset(mw, wday(mw$Date) == 7)

	# MW time
	mw_sat_window <- mw_sat[mw_sat$Time >= '07:00:00' & mw_sat$Time <= "09:00:00",]
	mw_mon_window <- mw_mon[mw_mon$Time >= '07:00:00' & mw_mon$Time <= "09:00:00",]

	# Plot the unsmoothed data (gray)
	x <- (strptime(mw_sat_window$Time, '%H:%M:%S'))
	y <- mw_sat_window$Global_active_power
	plot(x, y, type="l", col=grey(.5), main="Global_active_power MA from 7am-9am, 2009-11-02",sub = "(2 sided MA, 10 levels)", xlab="Time", ylab="Global_active_power",)

	f20 <- rep(1/10,10)
	f20

	y_sym <- stats::filter(y, f20, sides=2)
	lines(x, y_sym, col="blue")

	legend("topleft", inset=.01,
	c("Moving Average","Original Data"), fill=c("blue","grey"), horiz=FALSE)

	legend("topright", inset=.01,
	c("Minor Anomaly","Major Anomaly", "Extreme Anomaly"), fill=c("yellow","orange","red"), horiz=FALSE)


	stddev <- sd(y_sym, na.rm = TRUE)
	threshold_minor <- 1*stddev
	threshold_major <- 2*stddev
	threshold_extreme <- 4*stddev

	minor_occurances <- 0
	major_occurances <- 0
	extreme_occurances <- 0

	for(i in 1:length(y_sym)) {
	if (is.na(y_sym[i])) {
	# print("na")
	} else {
	difference <- abs(y[i] - y_sym[i])
	if (difference >= threshold_minor) {
	if (difference >= threshold_major) {
	if (difference >= threshold_extreme) {
	points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "red", pch = 19)
	extreme_occurances = extreme_occurances + 1;
	next
	}
	points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "orange", pch = 19)
	major_occurances = major_occurances + 1;
	next
	}
	points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "yellow", pch = 19)
	minor_occurances = minor_occurances + 1;
	next
	}
	next
	}
	}

	print(minor_occurances)
	print(major_occurances)
	print(extreme_occurances)

	# =======

	# Plot the unsmoothed data (gray)
	x <- (strptime(mw_mon_window$Time, '%H:%M:%S'))
	y <- mw_mon_window$Global_active_power
	plot(x, y, type="l", col=grey(.5), main="Global_active_power MA from 7am-9am, 2009-11-07",sub = "(2 sided MA, 10 levels)", xlab="Time", ylab="Global_active_power",)

	f20 <- rep(1/10,10)
	f20

	y_sym <- stats::filter(y, f20, sides=2)
	lines(x, y_sym, col="blue")

	legend("topleft", inset=.01,
	c("Moving Average","Original Data"), fill=c("blue","grey"), horiz=FALSE)

	legend("topright", inset=.01,
	c("Minor Anomaly","Major Anomaly", "Extreme Anomaly"), fill=c("yellow","orange","red"), horiz=FALSE)


	stddev <- sd(y_sym, na.rm = TRUE)
	threshold_minor <- 1*stddev
	threshold_major <- 2*stddev
	threshold_extreme <- 4*stddev

	minor_occurances <- 0
	major_occurances <- 0
	extreme_occurances <- 0

	for(i in 1:length(y_sym)) {
	if (is.na(y_sym[i])) {
	# print("na")
	} else {
	difference <- abs(y[i] - y_sym[i])
	if (difference >= threshold_minor) {
	if (difference >= threshold_major) {
	if (difference >= threshold_extreme) {
	points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "red", pch = 19)
	extreme_occurances = extreme_occurances + 1;
	next
	}
	points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "orange", pch = 19)
	major_occurances = major_occurances + 1;
	next
	}
	points(strptime(mw_sat_window[i, "Time"], '%H:%M:%S'), y[i], col = "yellow", pch = 19)
	minor_occurances = minor_occurances + 1;
	next
	}
	next
	}
	}

	print(minor_occurances)
	print(major_occurances)
	print(extreme_occurances)