Skip to content

Instantly share code, notes, and snippets.

@kenthzhang
Last active April 27, 2017 00:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kenthzhang/e62d53780b74ffb41560401e771353d7 to your computer and use it in GitHub Desktop.
Save kenthzhang/e62d53780b74ffb41560401e771353d7 to your computer and use it in GitHub Desktop.
# Connect to Spark; the handle is kept in `cc` so it can be passed to
# rxSparkDisconnect() at the end of the script.
cc <- rxSparkConnect(reset = TRUE)

# Column metadata for the Hive table: ArrDelay is declared numeric and
# DayOfWeek a factor. Note that "Saturday" is not among the declared levels.
colInfo <- list(
  ArrDelay = list(type = "numeric"),
  DayOfWeek = list(
    type = "factor",
    levels = c(
      "Monday", "Sunday", "Wednesday",
      "Thursday", "Friday", "Tuesday"
    )
  )
)

# Data source over the "AirlineDemoSmall" Hive table, using the metadata above.
hiveData <- RxHiveData(table = "AirlineDemoSmall", colInfo = colInfo)
# Callback handed to rxExecBy() as `func`.
#   keys: key value(s) identifying the partition (not used here).
#   data: the partition's data.
# Returns the rxSummary() result for the formula `~.` (all columns) on `data`.
.Factor <- function(keys, data) {
  rxSummary(formula = ~., data = data)
}
# Partition the Hive data by DayOfWeek and apply .Factor to the partitions.
result <- rxExecBy(
  inData = hiveData,
  keys = "DayOfWeek",
  func = .Factor
)

# Partition result of "Saturday", which is not in the factor level list;
# its key prints as <NA> in the output below.
result[[4]]
#$keys
#$keys[[1]]
#[1] <NA>
#Levels: Monday Sunday Wednesday Thursday Friday Tuesday
#
#
#$result
#Call:
#rxSummary(formula = ~., data = data)
#
#Summary Statistics Results for: ~.
#Data: data (RxXdfData Data Source)
#File name: /dev/shm/MRS-sshuser/2729766045047878893/PXDF0
#Number of valid observations: 86159
#
# Name Mean StdDev Min Max ValidObs MissingObs
# ArrDelay 11.87533 45.245402 -73.000000 1370.00000 83851 2308
# CRSDepTime 13.15288 4.598508 0.083333 23.98333 86159 0
#
#Category Counts for DayOfWeek
#Number of categories: 6
#Number of valid observations: 0
#Number of missing observations: 86159
#
# DayOfWeek Counts
# Monday 0
# Sunday 0
# Wednesday 0
# Thursday 0
# Friday 0
# Tuesday 0
#
#$status
#$status[[1]]
#[1] "OK"
#
#$status[[2]]
#NULL
#
#$status[[3]]
#NULL
# Partition result of "Friday": this key is one of the declared factor levels,
# and the output below reports all 82987 DayOfWeek observations as valid.
result[[2]]
#$keys
#$keys[[1]]
#[1] Friday
#Levels: Monday Sunday Wednesday Thursday Friday Tuesday
#
#
#$result
#Call:
#rxSummary(formula = ~., data = data)
#
#Summary Statistics Results for: ~.
#Data: data (RxXdfData Data Source)
#File name: /dev/shm/MRS-sshuser/2397107652729461182/PXDF0
#Number of valid observations: 82987
#
# Name Mean StdDev Min Max ValidObs MissingObs
# ArrDelay 14.80433 41.792601 -78.000000 1490.00000 80142 2845
# CRSDepTime 13.50271 4.739651 0.083333 23.98333 82987 0
#
#Category Counts for DayOfWeek
#Number of categories: 6
#Number of valid observations: 82987
#Number of missing observations: 0
#
# DayOfWeek Counts
# Monday 0
# Sunday 0
# Wednesday 0
# Thursday 0
# Friday 82987
# Tuesday 0
#
#$status
#$status[[1]]
#[1] "OK"
#
#$status[[2]]
#NULL
#
#$status[[3]]
#NULL
# Disconnect using the handle `cc` returned by rxSparkConnect() at the top
# of the script.
rxSparkDisconnect(cc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment