Skip to content

Instantly share code, notes, and snippets.

@mrdwab
Created March 30, 2013 04:55
Show Gist options
  • Save mrdwab/5275438 to your computer and use it in GitHub Desktop.
Save mrdwab/5275438 to your computer and use it in GitHub Desktop.
Functions to get question and answer sets from Stack Overflow, and to extract the code blocks from those questions.
#'Download a Stack Overflow question page as a character vector
#'
#'Helper function for other Stack Overflow related functions. Downloads the
#'page as a character vector and extracts the portion that is needed for other
#'functions.
#'
#'@param qid The numeric question ID.
#'@return A character vector
#'
#'@author Ananda Mahto
#'
SOQuestionPage <- function(qid) {
T1 <- suppressWarnings(
readLines(paste("http://stackoverflow.com/q",
as.character(qid), sep = "/")))
cStart <- which(
grepl(
'<div itemscope itemtype="http://schema.org/Article">', T1,
fixed = TRUE)) + 3
cEnd <- which(grepl("<a name='new-answer'></a>", T1, fixed = TRUE)) - 1
T1[cStart:cEnd]
}
NULL
#'Clean text within \code{<pre><code>} tag blocks
#'
#'Text within \code{<pre><code>} tag blocks need to be "cleaned". For R code
#'that usually means changing \code{&lt;} and \code{&gt;} to \code{<} and
#'\code{>}.
#'
#'@param x The input character vector
#'@return A character vector
#'@author Ananda Mahto
#'
PreTagClean <- function(x) {
x <- gsub(".*<pre><code>", "", x)
x <- gsub("</code></pre>.*", "", x)
x <- gsub("&lt;", "<", x)
x <- gsub("&gt;", ">", x)
pattern <- "</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>"
x <- gsub(pattern, "\\1", x)
x
}
NULL
#'Create a list of code blocks from an input character vector
#'
#'A helper function for \code{\link{SOCodeBlocks}} to extract the lines in
#'the character vector corresponding to a code block.
#'
#'@param x The input character vector
#'@return A character vector of just the code blocks from a Stack Overflow question.
#'@author Ananda Mahto
#'
CodeList <- function(x) {
T1 <- data.frame(start = which(grepl("<pre><code>", x)),
end = which(grepl("</code></pre>", x)))
T1 <- data.frame(t(T1))
codeList <- lapply(T1, function(z) x[z[1]:z[2]])
codeList <- lapply(codeList, PreTagClean)
codeList
}
print.SOCodeBlocks <- function(x, ...) {
lapply(seq_along(x),
function(y) {
cat("\n>>> Block", y, "\n\n")
cat("", paste(x[[y]], "\n"))
cat("", paste(rep("-", 60), sep = "", collapse = ""), "\n")
})
invisible(x)
}
print.QAList <- function(x, ...) {
y <- lapply(x, PreTagClean)
z <- c(paste("Question : ", attr(x, "QTitle"),
"\n>>> Question ID : ", attr(x, "QID"),
"\n>>> Question Tags : ", attr(x, "QTags"), sep =""),
paste("Answer ID:", attr(x, "AnswerIDs")))
lapply(seq_along(y),
function(Y) {
cat("\n>>> ", z[Y], "\n\n")
cat("", paste(y[[Y]], "\n"))
cat("", paste(rep("-", 60), sep = "", collapse = ""), "\n")
})
invisible(x)
}
#'Print a list of question and answers from a Stack Overflow page
#'
#'Print a list of question and answers from a Stack Overflow page (entered by
#'question ID number).
#'
#'Currently hard-coded ot Stack Overflow, but may be generalized for other
#'Stack Exchange sites.
#'
#'@param qid The numeric question ID.
#'@return A list of the class \code{c("QAList", "list")}
#'@note The formatting is handled by the \code{print.QAList} print method. The
#'HTML is still viewable by removing \code{"QAList"} from the list class, or
#'by accessing the individual list items.
#'@author Ananda Mahto
#'@examples
#'
#'temp <- QAList(15332195)
#'temp
#'temp[[1]]
#'
#'\dontshow{rm(temp)}
#'
QAList <- function(qid) {
T1 <- SOQuestionPage(qid)
QTitle <- strsplit(T1[grepl("<h1 itemprop=\"name\">", T1)], ">|<")[[1]][5]
AnswerIDs <- gsub(".*answer-([0-9]+).*", "\\1",
T1[grepl("<div id=\"answer-", T1)])
QTags <- T1[which(grepl("<div class=\"post-taglist\">", T1)) + 1]
QTags <- paste(
strsplit(QTags, "rel=\"tag\">|</a>")[[1]][c(FALSE, TRUE)], collapse = ", ")
QAEnds <- which(grepl("<table class=\"fw\">", T1))
Question <- data.frame(
Start = which(
grepl("<div class=\"post-text\" itemprop=\"description\">", T1)) + 1,
End = QAEnds[1] - 6)
Answers <- data.frame(
Start = which(grepl("<td class=\"answercell\">", T1)) + 1,
End = QAEnds[-1] - 2)
ParsedPage <- apply(rbind(Question, Answers), 1, function(z) T1[z[1]:z[2]])
ParsedPage <- lapply(ParsedPage,
function(x) gsub("<div class=\"post-text\">", "", x))
attr(ParsedPage, "QTitle") <- QTitle
attr(ParsedPage, "QID") <- as.character(qid)
attr(ParsedPage, "QTags") <- QTags
attr(ParsedPage, "AnswerIDs") <- as.character(AnswerIDs)
class(ParsedPage) <- c("QAList", class(ParsedPage))
ParsedPage
}
#'Return a list of the code blocks at a Stack Overflow question page
#'
#'Return a list of the code blocks at a Stack Overflow question page (specified
#'by numeric question ID).
#'
#'Currently hard-coded ot Stack Overflow, but may be generalized for other
#'Stack Exchange sites.
#'
#'@param qid The numeric question ID
#'@return A list of the class c("SOCodeBlocks", "list")
#'@note The formatting is handled by the \code{print.SOCodeBlocks} print method.
#'The HTML is still viewable by removing "SOCodeBlocks" from the list class, or
#'by accessing the individual list items.
#'@author Ananda Mahto
#'@examples
#'
#'temp <- SOCodeBlocks(15332195)
#'temp
#'temp[[6]]
#'
#'\dontshow{rm(temp)}
#'
SOCodeBlocks <- function(qid) {
T1 <- QAList(qid)
T2 <- unlist(lapply(T1, CodeList), recursive=FALSE, use.names=FALSE)
class(T2) <- c("SOCodeBlocks", class(T2))
T2
}
@sebastian-c
Copy link

The last function seems like it could be really useful. Perhaps sourcing all code from a question? Consider the case where there are multiple code chunks in a reproducible example and several intermediate objects. I don't have the expertise, but that would also make a pretty neat Firefox/Chrome extension.

One possible improvement might be to read it in with with an HTML parser (XML package?). I don't know much about the problems, but this answer seems to have some adamant opinions.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment