Skip to content

Instantly share code, notes, and snippets.

@kenthzhang
Last active April 27, 2017 00:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kenthzhang/1388bd3d7bfdad71216663c4affef4bb to your computer and use it in GitHub Desktop.
Save kenthzhang/1388bd3d7bfdad71216663c4affef4bb to your computer and use it in GitHub Desktop.
RevoScaleR rxExecBy Guide
# Stage the sample airline CSV in HDFS, then open a Spark compute context.
# mustWork = TRUE makes system.file() stop with an error when the sample file
# is not installed, instead of returning "" and handing an empty path to the
# HDFS copy below. The local is named localCsv so it does not mask the name of
# base::source().
localCsv <- system.file(
  "SampleData/AirlineDemoSmall.csv",
  package = "RevoScaleR",
  mustWork = TRUE
)
myDir <- "/share/AirlineDemoSmall"
rxHadoopMakeDir(myDir)
rxHadoopCopyFromLocal(localCsv, myDir)
# Keep the connection handle in cc; it is passed to rxSparkDisconnect() at the
# end of the script.
cc <- rxSparkConnect(reset = TRUE)
# Summarise the CSV directly from HDFS.
# "M" is the missing-value marker in the file; strings are read as factors,
# which is why the output below includes category counts for DayOfWeek.
hdfsFileSystem <- RxHdfsFileSystem()
csvFile <- file.path(myDir, "AirlineDemoSmall.csv")
textData <- RxTextData(
  file = csvFile,
  missingValueString = "M",
  stringsAsFactors = TRUE,
  fileSystem = hdfsFileSystem
)
rxSummary(formula = ~., data = textData)
#Call:
#rxSummary(formula = ~., data = textData)
#Summary Statistics Results for: ~.
#Data: textData (RxTextData Data Source)
#File name: /share/AirlineDemoSmall/AirlineDemoSmall.csv
#Number of valid observations: 6e+05
#
# Name Mean StdDev Min Max ValidObs MissingObs
# ArrDelay 11.31794 40.688536 -86.000000 1490.00000 582628 17372
# CRSDepTime 13.48227 4.697566 0.016667 23.98333 600000 0
#
#Category Counts for DayOfWeek
#Number of categories: 7
#Number of valid observations: 6e+05
#Number of missing observations: 0
#
# DayOfWeek Counts
# Monday 97975
# Tuesday 77725
# Wednesday 78875
# Thursday 81304
# Friday 82987
# Saturday 86159
# Sunday 94975
# Export the text data to a composite XDF set under myDir in HDFS
xdfFile <- file.path(myDir, "AirlineDemoSmallXDF")
xdfData <- RxXdfData(
  file = xdfFile,
  createCompositeSet = TRUE,
  fileSystem = hdfsFileSystem
)
rxDataStep(inData = textData, outFile = xdfData, overwrite = TRUE)
# Re-create the text data source without stringsAsFactors = TRUE, so the
# exports that follow read it without the factor conversion used for the summary.
textData <- RxTextData(
  file = csvFile,
  missingValueString = "M",
  fileSystem = hdfsFileSystem
)
# Export the text data to Parquet under myDir in HDFS
parquetFile <- file.path(myDir, "AirlineDemoSmallParquet")
parquetData <- RxParquetData(
  file = parquetFile,
  fileSystem = hdfsFileSystem
)
rxDataStep(inData = textData, outFile = parquetData, overwrite = TRUE)
# Export the text data to ORC under myDir in HDFS
orcFile <- file.path(myDir, "AirlineDemoSmallOrc")
orcData <- RxOrcData(
  file = orcFile,
  fileSystem = hdfsFileSystem
)
rxDataStep(inData = textData, outFile = orcData, overwrite = TRUE)
# Export the text data to the Hive table "AirlineDemoSmall"
hiveData <- RxHiveData(table = "AirlineDemoSmall")
rxDataStep(inData = textData, outFile = hiveData, overwrite = TRUE)
# Close the Spark connection held in cc
rxSparkDisconnect(cc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment