kfeoktistoff/complete.R

## complete.R
## Write a function that reads a directory full of files and reports the number of completely observed cases in each data file.
## The function should return a data frame where the first column is the name of the file and the second column is the number
## of complete cases. A prototype of this function follows

complete <- function(directory, id = 1:332) {
    ## Count the completely observed rows in each monitor's CSV file.
    ##
    ## 'directory' is a character vector of length 1 indicating
    ## the location of the CSV files
    ## 'id' is an integer vector indicating the monitor ID numbers
    ## to be used
    ##
    ## Returns a data frame with one row per element of 'id', in the
    ## same order, of the form:
    ## id nobs
    ## 1 117
    ## 2 1041
    ## ...
    ## where 'id' is the monitor ID number and 'nobs' is the number of
    ## rows of that monitor's file with no NA in any column.

    ## source() defines into the global environment, so that is where we
    ## look before falling back to loading the helper from disk.
    if (!exists("obsFileName", envir = globalenv(), mode = "function",
                inherits = FALSE)) {
        source("obsFileName.R")
    }

    ## One count per monitor; vapply() avoids re-copying a growing data
    ## frame with rbind() on every iteration and pins the result type.
    nobs <- vapply(id, function(monitor) {
        readings <- read.csv(obsFileName(directory, monitor))
        ## Summing the logical mask counts complete rows directly.
        sum(complete.cases(readings))
    }, integer(1))

    data.frame(id = id, nobs = nobs)
}

## corr.R
## Write a function that takes a directory of data files and a threshold
## for complete cases and calculates the correlation between sulfate and
## nitrate for monitor locations where the number of completely observed
## cases (on all variables) is greater than the threshold. The function
## should return a vector of correlations for the monitors that meet the
## threshold requirement. If no monitors meet the threshold requirement,
## then the function should return a numeric vector of length 0.

corr <- function(directory, threshold = 0) {
    ## Correlation between sulfate and nitrate for every monitor whose
    ## number of complete cases exceeds a threshold.
    ##
    ## 'directory' is a character vector of length 1 indicating
    ## the location of the CSV files
    ## 'threshold' is a numeric vector of length 1 indicating the
    ## number of completely observed rows (on all variables) that a
    ## monitor must exceed for its correlation to be computed; the
    ## default is 0
    ##
    ## Returns a numeric vector of correlations, one per qualifying
    ## monitor, or a numeric vector of length 0 if none qualifies.

    ## source() defines into the global environment, so only hit the disk
    ## when a helper has not been loaded there yet.
    if (!exists("complete", envir = globalenv(), mode = "function",
                inherits = FALSE)) {
        source("complete.R")
    }
    if (!exists("obsFileName", envir = globalenv(), mode = "function",
                inherits = FALSE)) {
        source("obsFileName.R")
    }

    ## Monitors 1 through 332 are always scanned.
    observations <- complete(directory, 1:332)
    selected <- observations$id[observations$nobs > threshold]

    ## vapply() returns numeric(0) when nothing is selected and avoids
    ## growing the result with c() inside a loop.
    vapply(selected, function(monitor) {
        readings <- read.csv(obsFileName(directory, monitor))
        ## "complete.obs" drops rows where either pollutant is NA.
        cor(readings$sulfate, readings$nitrate, use = "complete.obs")
    }, numeric(1))
}

## obsFileName.R
## Return relative path to csv file by detector number

obsFileName <- function(directory, obs) {
    ## Build the path "<directory>/<obs>.csv", with the detector number
    ## left-padded with zeros to at least three digits (1 -> "001",
    ## 42 -> "042", 123 -> "123").
    ##
    ## 'directory' is a character vector of length 1 indicating
    ## the location of the CSV files
    ## 'obs' is a whole-number detector ID, or a vector of them
    ##
    ## Returns a character vector with one path per element of 'obs'.
    ## The value is returned visibly; ending the function on an
    ## assignment would hide it when called at the console.
    sprintf("%s/%03d.csv", directory, obs)
}

## pollutantmean.R
## Write a function named 'pollutantmean' that calculates the mean of a pollutant
## (sulfate or nitrate) across a specified list of monitors. The function
## 'pollutantmean' takes three arguments: 'directory', 'pollutant', and 'id'.
## Given a vector of monitor ID numbers, 'pollutantmean' reads those monitors'
## particulate matter data from the directory specified in the 'directory' argument
## and returns the mean of the pollutant across all of the monitors,
## ignoring any missing values coded as NA

pollutantmean <- function(directory, pollutant, id = 1:332) {
    ## Mean of one pollutant column pooled across a set of monitors.
    ##
    ## 'directory' is a character vector of length 1 indicating
    ## the location of the CSV files
    ## 'pollutant' is a character vector of length 1 indicating
    ## the name of the pollutant for which we will calculate the
    ## mean; typically "sulfate" or "nitrate". The column is looked up
    ## by name, so a file without that column contributes no values.
    ## 'id' is an integer vector indicating the monitor ID numbers
    ## to be used
    ##
    ## Returns the mean of the pollutant across all monitors listed
    ## in the 'id' vector (ignoring NA values); NaN when no values
    ## were collected.

    stopifnot(is.character(pollutant), length(pollutant) == 1L)

    ## source() defines into the global environment, so only hit the disk
    ## when the helper has not been loaded there yet.
    if (!exists("obsFileName", envir = globalenv(), mode = "function",
                inherits = FALSE)) {
        source("obsFileName.R")
    }

    ## Collect each monitor's column once, then flatten, instead of
    ## re-copying a growing vector on every iteration.
    readings <- lapply(id, function(monitor) {
        read.csv(obsFileName(directory, monitor))[[pollutant]]
    })

    ## c(numeric(), ...) keeps the result numeric(0) rather than NULL
    ## when nothing was read, so mean() yields NaN as before.
    allData <- c(numeric(), unlist(readings, use.names = FALSE))

    mean(allData, na.rm = TRUE)
}
	## Write a function that reads a directory full of files and reports the number of completely observed cases in each data file.
	## The function should return a data frame where the first column is the name of the file and the second column is the number
	## of complete cases. A prototype of this function follows

	complete <- function(directory, id = 1:332) {
	    ## Count the completely observed rows in each monitor's CSV file.
	    ##
	    ## 'directory' is a character vector of length 1 indicating
	    ## the location of the CSV files
	    ## 'id' is an integer vector indicating the monitor ID numbers
	    ## to be used
	    ##
	    ## Returns a data frame with one row per element of 'id', in the
	    ## same order, of the form:
	    ## id nobs
	    ## 1 117
	    ## 2 1041
	    ## ...
	    ## where 'id' is the monitor ID number and 'nobs' is the number of
	    ## rows of that monitor's file with no NA in any column.

	    ## source() defines into the global environment, so that is where we
	    ## look before falling back to loading the helper from disk.
	    if (!exists("obsFileName", envir = globalenv(), mode = "function",
	                inherits = FALSE)) {
	        source("obsFileName.R")
	    }

	    ## One count per monitor; vapply() avoids re-copying a growing data
	    ## frame with rbind() on every iteration and pins the result type.
	    nobs <- vapply(id, function(monitor) {
	        readings <- read.csv(obsFileName(directory, monitor))
	        ## Summing the logical mask counts complete rows directly.
	        sum(complete.cases(readings))
	    }, integer(1))

	    data.frame(id = id, nobs = nobs)
	}
	## Write a function that takes a directory of data files and a threshold
	## for complete cases and calculates the correlation between sulfate and
	## nitrate for monitor locations where the number of completely observed
	## cases (on all variables) is greater than the threshold. The function
	## should return a vector of correlations for the monitors that meet the
	## threshold requirement. If no monitors meet the threshold requirement,
	## then the function should return a numeric vector of length 0.

	corr <- function(directory, threshold = 0) {
	    ## Correlation between sulfate and nitrate for every monitor whose
	    ## number of complete cases exceeds a threshold.
	    ##
	    ## 'directory' is a character vector of length 1 indicating
	    ## the location of the CSV files
	    ## 'threshold' is a numeric vector of length 1 indicating the
	    ## number of completely observed rows (on all variables) that a
	    ## monitor must exceed for its correlation to be computed; the
	    ## default is 0
	    ##
	    ## Returns a numeric vector of correlations, one per qualifying
	    ## monitor, or a numeric vector of length 0 if none qualifies.

	    ## source() defines into the global environment, so only hit the disk
	    ## when a helper has not been loaded there yet.
	    if (!exists("complete", envir = globalenv(), mode = "function",
	                inherits = FALSE)) {
	        source("complete.R")
	    }
	    if (!exists("obsFileName", envir = globalenv(), mode = "function",
	                inherits = FALSE)) {
	        source("obsFileName.R")
	    }

	    ## Monitors 1 through 332 are always scanned.
	    observations <- complete(directory, 1:332)
	    selected <- observations$id[observations$nobs > threshold]

	    ## vapply() returns numeric(0) when nothing is selected and avoids
	    ## growing the result with c() inside a loop.
	    vapply(selected, function(monitor) {
	        readings <- read.csv(obsFileName(directory, monitor))
	        ## "complete.obs" drops rows where either pollutant is NA.
	        cor(readings$sulfate, readings$nitrate, use = "complete.obs")
	    }, numeric(1))
	}
	## Return relative path to csv file by detector number

	obsFileName <- function(directory, obs) {
	    ## Build the path "<directory>/<obs>.csv", with the detector number
	    ## left-padded with zeros to at least three digits (1 -> "001",
	    ## 42 -> "042", 123 -> "123").
	    ##
	    ## 'directory' is a character vector of length 1 indicating
	    ## the location of the CSV files
	    ## 'obs' is a whole-number detector ID, or a vector of them
	    ##
	    ## Returns a character vector with one path per element of 'obs'.
	    ## The value is returned visibly; ending the function on an
	    ## assignment would hide it when called at the console.
	    sprintf("%s/%03d.csv", directory, obs)
	}
	## Write a function named 'pollutantmean' that calculates the mean of a pollutant
	## (sulfate or nitrate) across a specified list of monitors. The function
	## 'pollutantmean' takes three arguments: 'directory', 'pollutant', and 'id'.
	## Given a vector of monitor ID numbers, 'pollutantmean' reads those monitors'
	## particulate matter data from the directory specified in the 'directory' argument
	## and returns the mean of the pollutant across all of the monitors,
	## ignoring any missing values coded as NA

	pollutantmean <- function(directory, pollutant, id = 1:332) {
	    ## Mean of one pollutant column pooled across a set of monitors.
	    ##
	    ## 'directory' is a character vector of length 1 indicating
	    ## the location of the CSV files
	    ## 'pollutant' is a character vector of length 1 indicating
	    ## the name of the pollutant for which we will calculate the
	    ## mean; typically "sulfate" or "nitrate". The column is looked up
	    ## by name, so a file without that column contributes no values.
	    ## 'id' is an integer vector indicating the monitor ID numbers
	    ## to be used
	    ##
	    ## Returns the mean of the pollutant across all monitors listed
	    ## in the 'id' vector (ignoring NA values); NaN when no values
	    ## were collected.

	    stopifnot(is.character(pollutant), length(pollutant) == 1L)

	    ## source() defines into the global environment, so only hit the disk
	    ## when the helper has not been loaded there yet.
	    if (!exists("obsFileName", envir = globalenv(), mode = "function",
	                inherits = FALSE)) {
	        source("obsFileName.R")
	    }

	    ## Collect each monitor's column once, then flatten, instead of
	    ## re-copying a growing vector on every iteration.
	    readings <- lapply(id, function(monitor) {
	        read.csv(obsFileName(directory, monitor))[[pollutant]]
	    })

	    ## c(numeric(), ...) keeps the result numeric(0) rather than NULL
	    ## when nothing was read, so mean() yields NaN as before.
	    allData <- c(numeric(), unlist(readings, use.names = FALSE))

	    mean(allData, na.rm = TRUE)
	}