# davidski/sugar.R

## sugar.R
library(tm)      # text manipulation
library(stringi) # better string replacement

# Download a PDF, extract its text with tm::readPDF and reshape the table
# lines into a data frame named `out`.
#
# Objects left in the calling environment: uri, filename, pdf, dat, out
# (plus the helpers xpdf_tools, pdf_lines and con).

# specify the source url and the destination location
uri <- "http://www.actiononsugar.org/News%20Centre/Surveys%20/2016/170862.pdf"
filename <- "sugar.pdf"

# be kind, download only once
if (!file.exists(filename)) {
  download.file(uri, filename, mode = "wb")
}

# depends on having xpdf tools available in PATH; stop with an explicit
# message instead of failing later on an undefined `pdf` object
xpdf_tools <- Sys.which(c("pdfinfo", "pdftotext"))
if (!all(file.exists(xpdf_tools))) {
  stop(
    "xpdf tools not found in PATH: ",
    paste(names(xpdf_tools)[!file.exists(xpdf_tools)], collapse = ", "),
    call. = FALSE
  )
}

# "-table" is passed through to the text extraction step as a control option
pdf <- readPDF(control = list(text = "-table"))(
  elem = list(uri = filename),
  language = "en",
  id = "id1"
)

# the slice below is hard-coded to elements 13..274 of the extracted text, so
# make sure they exist rather than silently producing NA rows
pdf_lines <- content(pdf)
if (length(pdf_lines) < 274L) {
  stop(
    "expected at least 274 elements of extracted text, got ",
    length(pdf_lines),
    call. = FALSE
  )
}

# clean up the data and make it ready for CSV conversion:
#   1. replace literal commas so they cannot be mistaken for separators
#   2. turn every run of two or more whitespace characters into a comma
dat <- stri_replace_all_fixed(pdf_lines[13:274], ",", "-")
dat <- stri_replace_all_regex(dat, "[:space:]{2,}", ",")
dat <- dat[-168] # headers are repeated in the PDF content at this position

# textConnection() returns an already-open connection that read.csv() does not
# close, so close it explicitly once parsing is done (or has failed)
con <- textConnection(dat)
out <- tryCatch(
  read.csv(con, header = FALSE, stringsAsFactors = FALSE),
  finally = close(con)
)

# NOTE(review): "teaspooons" looks like a typo for "teaspoons"; kept as-is
# because code outside this file may refer to the column by this name.
colnames(out) <- c("cafe", "drink", "sugars", "teaspooons")