Last active
April 27, 2017 00:01
-
-
Save kenthzhang/1388bd3d7bfdad71216663c4affef4bb to your computer and use it in GitHub Desktop.
RevoScaleR rxExecBy Guide
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create myDir in HDFS and copy the sample CSV file into it.
# NOTE: the variable `source` shares its name with base::source(); calls to
# source() still resolve to the function, but avoid reusing the name elsewhere.
source <- system.file("SampleData/AirlineDemoSmall.csv", package = "RevoScaleR")
myDir <- "/share/AirlineDemoSmall"
rxHadoopMakeDir(myDir)
rxHadoopCopyFromLocal(source, myDir)

# Open the Spark connection; the returned handle `cc` is what gets passed to
# rxSparkDisconnect() at the end of the script.
cc <- rxSparkConnect(reset = TRUE)
# Summary of data
# Point a text data source at the CSV copied into HDFS. "M" is declared as the
# missing-value marker and string columns are read as factors.
hdfsFileSystem <- RxHdfsFileSystem()
csvFile <- file.path(myDir, "AirlineDemoSmall.csv")
textData <- RxTextData(
  file = csvFile,
  missingValueString = "M",
  stringsAsFactors = TRUE,
  fileSystem = hdfsFileSystem
)
rxSummary(~., textData)

# Recorded output of the rxSummary() call above:
#Call:
#rxSummary(formula = ~., data = textData)
#Summary Statistics Results for: ~.
#Data: textData (RxTextData Data Source)
#File name: /share/AirlineDemoSmall/AirlineDemoSmall.csv
#Number of valid observations: 6e+05
#
# Name Mean StdDev Min Max ValidObs MissingObs
# ArrDelay 11.31794 40.688536 -86.000000 1490.00000 582628 17372
# CRSDepTime 13.48227 4.697566 0.016667 23.98333 600000 0
#
#Category Counts for DayOfWeek
#Number of categories: 7
#Number of valid observations: 6e+05
#Number of missing observations: 0
#
# DayOfWeek Counts
# Monday 97975
# Tuesday 77725
# Wednesday 78875
# Thursday 81304
# Friday 82987
# Saturday 86159
# Sunday 94975
# Create XDF data
# Write the CSV-backed data source out as an XDF data set under myDir in HDFS,
# replacing any previous copy.
xdfFile <- file.path(myDir, "AirlineDemoSmallXDF")
xdfData <- RxXdfData(
  file = xdfFile,
  createCompositeSet = TRUE,
  fileSystem = hdfsFileSystem
)
rxDataStep(textData, xdfData, overwrite = TRUE)
# Remove factor conversion from textData
# Same CSV and missing-value marker as before, but without
# stringsAsFactors = TRUE; the conversions below read from this definition.
textData <- RxTextData(
  file = csvFile,
  missingValueString = "M",
  fileSystem = hdfsFileSystem
)
# Create Parquet data
# Convert the CSV-backed data source into a Parquet data source under myDir,
# replacing any previous copy.
parquetFile <- file.path(myDir, "AirlineDemoSmallParquet")
parquetData <- RxParquetData(file = parquetFile, fileSystem = hdfsFileSystem)
rxDataStep(textData, parquetData, overwrite = TRUE)
# Create ORC data
# Convert the CSV-backed data source into an ORC data source under myDir,
# replacing any previous copy.
orcFile <- file.path(myDir, "AirlineDemoSmallOrc")
orcData <- RxOrcData(file = orcFile, fileSystem = hdfsFileSystem)
rxDataStep(textData, orcData, overwrite = TRUE)
# Create Hive data
# Load the CSV-backed data source into the Hive table "AirlineDemoSmall",
# replacing any previous contents.
hiveData <- RxHiveData(table = "AirlineDemoSmall")
rxDataStep(textData, hiveData, overwrite = TRUE)

# Close the Spark connection opened with rxSparkConnect().
rxSparkDisconnect(cc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment