Created
May 30, 2017 12:11
-
-
Save nutterb/d2e050dada608bb6213e61d0f8471b65 to your computer and use it in GitHub Desktop.
Execute queries to SQL Server that involve VARCHAR(MAX) variable types.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' @name query_varchar_max | |
#' @title Query a VARCHAR(MAX) Variable from SQL Server | |
#' | |
#' @importFrom RODBCext sqlExecute | |
#' | |
#' @description The RODBC driver to SQL Server (SQL Server Native Client 11.0) | |
#' reports the lenght of a VARCHAR(MAX) variable to be zero. This presents | |
#' difficulties in extracting long text values from the database. Often, the | |
#' ODBC will assume a length of 255 characters and truncate the text to that | |
#' many characters. The approach taken here searches the VARCHAR(MAX) variables | |
#' for the longest length, and extracts the data in segments to be pasted | |
#' together in R. | |
#' | |
#' @param channel A valid ODBC channel to a SQL Server database. | |
#' @param id A character vector of ID variables that may be used to merge the | |
#' data from this query into another dataset. | |
#' @param varchar_max a character vector of variable names that are to be | |
#' treated as if they are VARCHAR(MAX) variables. | |
#' @param from A single character string providing the remainder of the query | |
#' to be run, beginning with the \code{FROM} statement. | |
#' @param stringsAsFactors \code{logical(1)}. Should character strings returned | |
#' from the database be converted to factors? | |
#' @param ... Additional arguments to \code{sqlExecute} when running the full | |
#' query. | |
#' | |
#' @details \code{query_varchar_max} operates by determining how many columns of up to | |
#' 8000 characters each are required to export a complete VARCHAR(MAX) variable. | |
#' It then creates the necessary number of intermediate variables and queries the | |
#' data using the SQL Server \code{SUBSTRING} command, extracting the VARCHAR(MAX) | |
#' variable in increments of 8000 characters. After completing the query, | |
#' the intemediary variables are concatenated and removed from the data. | |
#' | |
#' The function makes accommodation for multi-part queries as far as [TABLE].[VARIABLE] | |
#' formats are concerned. It is not intended for use in [SCHEMA].[TABLE].[VARIABLE] | |
#' formats. This at least allows \code{from} to include joins for more complex | |
#' queries. Parameterized queries are also supported through \code{sqlExecute}. | |
#' | |
#' @export | |
query_varchar_max <- function(channel, id, varchar_max, from, | |
stringsAsFactors = FALSE, ...) | |
{ | |
coll <- checkmate::makeAssertCollection() | |
checkmate::assert_class(x = channel, | |
classes = "RODBC", | |
add = coll) | |
checkmate::assert_character(x = id, | |
add = coll) | |
checkmate::assert_character(x = varchar_max, | |
add = coll) | |
checkmate::assert_character(x = from, | |
len = 1, | |
add = coll) | |
checkmate::assert_logical(x = stringsAsFactors, | |
len = 1, | |
add = coll) | |
checkmate::reportAssertions(coll) | |
varchar_max_len <- | |
paste0( | |
sprintf("MAX(LEN(%s)) AS len_%s", | |
varchar_max, | |
sub("[.]", "_", varchar_max)), | |
collapse = ", " | |
) | |
varchar_len <- | |
unlist( | |
RODBCext::sqlExecute( | |
channel = channel, | |
query = sprintf("SELECT %s %s", | |
varchar_max_len, | |
from), | |
fetch = TRUE | |
) | |
) | |
varchar_max_cols <- | |
unlist( | |
mapply(expand_varchar_max, | |
varchar_max, | |
varchar_len, | |
SIMPLIFY = FALSE) | |
) | |
Prelim <- | |
RODBCext::sqlExecute( | |
channel = channel, | |
query = sprintf("SELECT %s, %s %s", | |
paste0(id, collapse = ", "), | |
paste0(varchar_max_cols, collapse = ", "), | |
from), | |
fetch = TRUE, | |
stringsAsFactors = stringsAsFactors, | |
... | |
) | |
var_stub_to_combine <- | |
unique( | |
sub( | |
"(part)(\\d{1,3})", | |
"\\1", | |
sub(".+AS ", "", varchar_max_cols) | |
) | |
) | |
col_to_combine <- | |
lapply(var_stub_to_combine, | |
grep, | |
names(Prelim)) | |
Prelim[sub(".+[.]", "", varchar_max)] <- | |
lapply(col_to_combine, | |
function(col) apply(Prelim[col], 1, paste0, collapse = "")) | |
Prelim[-unlist(col_to_combine)] | |
} | |
expand_varchar_max <- function(varchar_max, varchar_len) | |
{ | |
nvar <- varchar_len %/% 8000 + 1 | |
var_list <- vector("character", length = nvar) | |
for (i in seq_along(var_list)) | |
{ | |
var_list[i] <- | |
sprintf("SUBSTRING(%s, %s, %s) AS %s_part%s", | |
varchar_max, | |
1 + (i - 1) * 8000, | |
8000, | |
paste0(sub("[.]", "_", varchar_max)), | |
i) | |
} | |
var_list | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment