## christophergandrud/fed.speeches.parse.R

## fed.speeches.parse.R
#######################
## Open the text files, parse each one individually, and extract the text of the speeches

# Work from the folder holding the downloaded pages: the bare file names
# returned by list.files() below are opened relative to it.
setwd("~/fed.text.indv/")

# List the files to parse. `pattern` is a regular expression, not a shell
# glob: a glob-style "*.txt" is unanchored and its "." matches any character,
# so it also picks up names such as "speech.txt.bak". Anchor on a literal
# ".txt" suffix instead.
files <- list.files(path = "~/fed.text.indv/", pattern = "\\.txt$")

# Names of the files that could not be parsed (likely empty because the
# website download failed).
missing <- NULL

# Folder the cleaned files are saved into. Create it if it is absent so the
# write at the end of the loop does not fail.
outpathB <- "~/fed.text.parsed/"
dir.create(outpathB, showWarnings = FALSE, recursive = TRUE)

# Repeated page furniture to strip from the extracted text. These are plain
# strings, so they are matched literally (fixed = TRUE) rather than as
# regular expressions.
boilerplate <- c(
    "\n",
    "Return to top",
    "Return to text",
    "Accessible Version",
    "Accessible version",
    "Speeches"
)

for (i in files) {

    # Parse the HTML and pull the text of every <p> node. The XML functions
    # are namespace-qualified so this works whether or not library(XML) has
    # been called. On failure the condition object is returned instead of
    # stopping the loop.
    marker <- tryCatch(
        unlist(
            XML::xpathSApply(XML::htmlParse(file = i), "//p", XML::xmlValue)
        ),
        error = function(e) e
    )

    # Record the unparseable file, report why, and move on to the next one.
    if (inherits(marker, "error")) {
        missing <- c(missing, i)
        message("Skipping ", i, ": ", conditionMessage(marker))
        next
    }

    # Remove the unwanted newlines and repeated text.
    for (junk in boilerplate) {
        marker <- gsub(junk, "", marker, fixed = TRUE)
    }

    # Collapse the paragraphs into a single character string.
    marker <- paste(marker, collapse = "")

    # Save as a new file named "parsed.<original name>". outpathB already
    # ends in "/", so no extra separator is inserted.
    write(marker, file = paste0(outpathB, "parsed.", i))
}
	#######################
	## Open the text files, parse each one individually, and extract the text of the speeches

	# NOTE(review): this section repeats the parse-and-save pass that appears
	# earlier in this file; running both does the same work twice and resets
	# `missing`. Confirm whether both copies are meant to be kept.

	# Work from the folder holding the downloaded pages: the bare file names
	# returned by list.files() below are opened relative to it.
	setwd("~/fed.text.indv/")

	# List the files to parse. `pattern` is a regular expression, not a shell
	# glob: a glob-style "*.txt" is unanchored and its "." matches any
	# character, so it also picks up names such as "speech.txt.bak". Anchor on
	# a literal ".txt" suffix instead.
	files <- list.files(path = "~/fed.text.indv/", pattern = "\\.txt$")

	# Names of the files that could not be parsed (likely empty because the
	# website download failed).
	missing <- NULL

	# Folder the cleaned files are saved into. Create it if it is absent so
	# the write at the end of the loop does not fail.
	outpathB <- "~/fed.text.parsed/"
	dir.create(outpathB, showWarnings = FALSE, recursive = TRUE)

	# Repeated page furniture to strip from the extracted text. These are
	# plain strings, so they are matched literally (fixed = TRUE) rather than
	# as regular expressions.
	boilerplate <- c(
		"\n",
		"Return to top",
		"Return to text",
		"Accessible Version",
		"Accessible version",
		"Speeches"
	)

	for (i in files) {

		# Parse the HTML and pull the text of every <p> node. The XML
		# functions are namespace-qualified so this works whether or not
		# library(XML) has been called. On failure the condition object is
		# returned instead of stopping the loop.
		marker <- tryCatch(
			unlist(
				XML::xpathSApply(XML::htmlParse(file = i), "//p", XML::xmlValue)
			),
			error = function(e) e
		)

		# Record the unparseable file, report why, and move on to the next.
		if (inherits(marker, "error")) {
			missing <- c(missing, i)
			message("Skipping ", i, ": ", conditionMessage(marker))
			next
		}

		# Remove the unwanted newlines and repeated text.
		for (junk in boilerplate) {
			marker <- gsub(junk, "", marker, fixed = TRUE)
		}

		# Collapse the paragraphs into a single character string.
		marker <- paste(marker, collapse = "")

		# Save as a new file named "parsed.<original name>". outpathB already
		# ends in "/", so no extra separator is inserted.
		write(marker, file = paste0(outpathB, "parsed.", i))
	}