khufkens/ghent_air_quality_vmm.R

## ghent_air_quality_vmm.R
# Ghent air pollution analysis
# parametric t-tests
# non-parametric Mann-Whitney U test

# download and collate the data
# pdftools::pdf_text() returns one string per page; the table parsed below is
# taken from the second page
pdf_url <- "https://klimaat.stad.gent/sites/default/files/nota_circulatieplangent_3.pdf"
pdf_pages <- pdftools::pdf_text(pdf_url)
if (length(pdf_pages) < 2L) {
  stop("expected at least 2 pages in ", pdf_url, call. = FALSE)
}
pdf_data <- paste0(pdf_pages[2], collapse = " ")

# layout of the table inside the token stream: 20 rows of 6 cells, starting
# at the 10th token (i.e. tokens 10 to 129)
col_names <- c(
  "meetplaatscode",
  "straat",
  "voor",
  "na",
  "verschil",
  "verschil_perc"
)
n_rows <- 20L
first_token <- 10L
last_token <- first_token + n_rows * length(col_names) - 1L

# some regular expression moving about:
# every comma becomes a point (so decimal commas parse with as.numeric) and
# every newline becomes two spaces, so that a line break always counts as a
# cell separator when splitting on runs of two or more whitespace characters
subset <- gsub(",", ".", pdf_data)
subset <- gsub("\n", "  ", subset)
subset <- unlist(strsplit(subset, "\\s{2,}"))

# indexing past the end would silently pad the table with NA cells
if (length(subset) < last_token) {
  stop(
    "expected at least ", last_token, " tokens on page 2, found ",
    length(subset), "; the PDF layout may have changed",
    call. = FALSE
  )
}
subset <- subset[first_token:last_token]

# recast into a data frame, one row per group of six consecutive tokens
df <- data.frame(
  matrix(subset, nrow = n_rows, ncol = length(col_names), byrow = TRUE),
  stringsAsFactors = FALSE
)

# assign column names
names(df) <- col_names

# convert to numeric
num_cols <- c("voor", "na", "verschil")
df[num_cols] <- lapply(df[num_cols], as.numeric)

# a cell that failed to parse usually means the tokens shifted; the tests
# below would otherwise drop such rows without saying which ones
bad_rows <- which(!stats::complete.cases(df[num_cols]))
if (length(bad_rows) > 0) {
  warning(
    "non-numeric values in row(s) ", paste(bad_rows, collapse = ", "),
    " of the parsed table",
    call. = FALSE
  )
}

# test for normality of the differences (result kept in s_test)
s_test <- shapiro.test(df$verschil)

# constant added to the "na" values in the second test of each pair
# NOTE(review): presumably a reference change to compare against - confirm
offset <- 3.7

# t-test on the difference
# NOTE(review): each row holds "voor" and "na" for the same meetplaatscode,
# so the samples look paired, yet these calls run unpaired (Welch) tests -
# confirm whether paired = TRUE was intended
t.test(df$voor, df$na)
t.test(df$voor, df$na + offset)

# non-parametric alternative: Wilcoxon rank-sum (Mann-Whitney U) test
wilcox.test(df$voor, df$na)
wilcox.test(df$voor, df$na + offset)
	# Ghent air pollution analysis
	# parametric t-tests
	# non-parametric Mann-Whitney U test

	# download and collate the data
	# pdftools::pdf_text() returns one string per page; the table parsed below
	# is taken from the second page
	pdf_url <- "https://klimaat.stad.gent/sites/default/files/nota_circulatieplangent_3.pdf"
	pdf_pages <- pdftools::pdf_text(pdf_url)
	if (length(pdf_pages) < 2L) {
	  stop("expected at least 2 pages in ", pdf_url, call. = FALSE)
	}
	pdf_data <- paste0(pdf_pages[2], collapse = " ")

	# layout of the table inside the token stream: 20 rows of 6 cells, starting
	# at the 10th token (i.e. tokens 10 to 129)
	col_names <- c(
	  "meetplaatscode",
	  "straat",
	  "voor",
	  "na",
	  "verschil",
	  "verschil_perc"
	)
	n_rows <- 20L
	first_token <- 10L
	last_token <- first_token + n_rows * length(col_names) - 1L

	# some regular expression moving about:
	# every comma becomes a point (so decimal commas parse with as.numeric).
	# Every newline becomes TWO spaces: the split below only breaks on runs of
	# two or more whitespace characters, so a single space would glue the last
	# cell of one line to the first cell of the next and shift the whole table
	subset <- gsub(",", ".", pdf_data)
	subset <- gsub("\n", "  ", subset)
	subset <- unlist(strsplit(subset, "\\s{2,}"))

	# indexing past the end would silently pad the table with NA cells
	if (length(subset) < last_token) {
	  stop(
	    "expected at least ", last_token, " tokens on page 2, found ",
	    length(subset), "; the PDF layout may have changed",
	    call. = FALSE
	  )
	}
	subset <- subset[first_token:last_token]

	# recast into a data frame, one row per group of six consecutive tokens
	df <- data.frame(
	  matrix(subset, nrow = n_rows, ncol = length(col_names), byrow = TRUE),
	  stringsAsFactors = FALSE
	)

	# assign column names
	names(df) <- col_names

	# convert to numeric
	num_cols <- c("voor", "na", "verschil")
	df[num_cols] <- lapply(df[num_cols], as.numeric)

	# a cell that failed to parse usually means the tokens shifted; the tests
	# below would otherwise drop such rows without saying which ones
	bad_rows <- which(!stats::complete.cases(df[num_cols]))
	if (length(bad_rows) > 0) {
	  warning(
	    "non-numeric values in row(s) ", paste(bad_rows, collapse = ", "),
	    " of the parsed table",
	    call. = FALSE
	  )
	}

	# test for normality of the differences (result kept in s_test)
	s_test <- shapiro.test(df$verschil)

	# constant added to the "na" values in the second test of each pair
	# NOTE(review): presumably a reference change to compare against - confirm
	offset <- 3.7

	# t-test on the difference
	# NOTE(review): each row holds "voor" and "na" for the same meetplaatscode,
	# so the samples look paired, yet these calls run unpaired (Welch) tests -
	# confirm whether paired = TRUE was intended
	t.test(df$voor, df$na)
	t.test(df$voor, df$na + offset)

	# non-parametric alternative: Wilcoxon rank-sum (Mann-Whitney U) test
	wilcox.test(df$voor, df$na)
	wilcox.test(df$voor, df$na + offset)