Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Create OLAP cube for Saiku from any data frame
GenerateCubeSQLite <- function(DataFrame,PrimaryKey,CubeDestination,DataSourceDestination,NumericAggregators=NA,DateAggregators=NA,MaxTime='2015-01-01',MinTime='2001-01-01') {
# Function to clean names
standardize_name <- function(string) {
capped <- grep("^[^A-Z]*$", string, perl = TRUE)
substr(string[capped], 1, 1) <- toupper(substr(string[capped], 1, 1))
string <- gsub('_',' ',string, fixed=TRUE)
return(string)
}
# --------- Define connection
library(sqldf)
library("RSQLite")
file.remove('__this_is_temporary_db__.db')
sqlite <- dbDriver("SQLite")
tmpdb <- dbConnect(sqlite,"__this_is_temporary_db__.db")
DataFrameName <- deparse(substitute(DataFrame))
TableName <- DataFrameName
dbWriteTable(tmpdb,DataFrameName,DataFrame)
#------ Generate time dimension
ddiff <- as.integer(as.Date(MaxTime)) - as.integer(as.Date(MinTime))
tmp_time <- data.frame(time_date = as.Date(MinTime) + seq(1:ddiff))
tmp_time$month_number <- as.integer(format(tmp_time$time_date, "%m"))
tmp_time$year_number <- as.integer(format(tmp_time$time_date, "%Y"))
tmp_time$quarter_number[tmp_time$month_number %in% 1:3] <- 1
tmp_time$quarter_number[tmp_time$month_number %in% 4:6] <- 2
tmp_time$quarter_number[tmp_time$month_number %in% 7:9] <- 3
tmp_time$quarter_number[tmp_time$month_number %in% 10:12] <- 4
TimeTableName <- 'tmp_time'
dbWriteTable(tmpdb,'tmp_time',tmp_time)
#------ Disconnect
dbDisconnect(tmpdb)
# -------- Get column types from R data frame
TableDesign <- data.frame(type=sapply(DataFrame, typeof), datetype=sapply(names(DataFrame), function(x) inherits(DataFrame[[x]], "Date")))
TableDesign$column_name <- row.names(TableDesign)
row.names(TableDesign) <- NULL
TableDesign <- sqldf("select column_name, case when datetype then 'date' else type end as data_type from TableDesign", drv='SQLite')
# -------- Get lists of dimensions, time dimensions, measures
DimensionVariables <- TableDesign[TableDesign$data_type %in% c('character','integer'),1]
DimensionVariablesNames <- standardize_name(DimensionVariables)
VariablesNumeric <- sort(TableDesign[TableDesign$data_type %in% c('numeric','integer','double'),1])
VariablesNumeric <- VariablesNumeric[!(VariablesNumeric %in% PrimaryKey)]
VariablesDate <- TableDesign[TableDesign$data_type %in% c('date'),1]
VariablesDateNames <- standardize_name(VariablesDate)
#----------- Define aggregators for different types of variables
Aggregators <- c('sum','count','min','max','avg','distinct-count')
AggregatorsNames <- c('Sum','Cnt','Min','Max','Avg','Dcnt')
# Numeric Aggregators
if (!all(is.na(NumericAggregators))) {
NumericAggregators <- Aggregators[Aggregators %in% NumericAggregators]
NumericAggregatorsNames <- AggregatorsNames[Aggregators %in% NumericAggregators]
} else {
NumericAggregators <- Aggregators
NumericAggregatorsNames <- AggregatorsNames
}
# Date Aggregators
if (!all(is.na(DateAggregators))) {
DateAggregators <- Aggregators[Aggregators %in% DateAggregators]
DateAggregatorsNames <- AggregatorsNames[Aggregators %in% DateAggregators]
} else {
DateAggregators <- 'count'
DateAggregatorsNames <- '#'
}
#---------- Define Dimensions
# Initialize empty vectors
DimensionsDate <- character(0)
DimensionsVarchar <- character(0)
# Time dimension
if (length(VariablesDate) > 0) {
DimensionsDate <- paste0('
<Dimension type="TimeDimension" visible="true" foreignKey="',VariablesDate,'" highCardinality="false" name="',VariablesDateNames,'">
<Hierarchy name="Time Hierarchy" visible="true" hasAll="true" primaryKey="time_date">
<Table name="',TimeTableName,'"></Table>
<Level name="Year" visible="true" column="year_number" ordinalColumn="year_number" type="Integer" internalType="int" uniqueMembers="false" levelType="TimeYears" hideMemberIf="Never">
</Level>
<Level name="Quarter" visible="true" column="quarter_number" ordinalColumn="quarter_number" type="Integer" uniqueMembers="false" levelType="TimeQuarters" hideMemberIf="Never">
</Level>
<Level name="Month" visible="true" column="month_number" ordinalColumn="month_number" type="Integer" uniqueMembers="false" levelType="TimeMonths" hideMemberIf="Never">
</Level>
</Hierarchy>
</Dimension>',collapse='')
}
# Other dimensions
if(length(DimensionVariables) > 0) {
DimensionsVarchar <- paste('
<Dimension type="StandardDimension" visible="true" foreignKey="',PrimaryKey,'" highCardinality="false" name="',DimensionVariablesNames,'">
<Hierarchy name="',DimensionVariablesNames,'" visible="true" hasAll="true" primaryKey="',PrimaryKey,'">
<Table name="',TableName,'">
</Table>
<Level name="',DimensionVariablesNames,'" visible="true" column="',DimensionVariables,'" type="String" uniqueMembers="false" levelType="Regular" hideMemberIf="Never">
</Level>
</Hierarchy>
</Dimension>',sep='', collapse='')
}
# Merge All dimensions togetgher
Dimensions <- paste0(DimensionsDate, DimensionsVarchar)
#------------- Generate Measures
# Generate default measure - count of primary column
MeasurePrimaryKey <- paste(' <Measure name="','Count (PK)','" column="',PrimaryKey,'" aggregator="count" visible="true"></Measure>',sep='')
# Initialize empty vectors
NumericMeasures <- character(0)
DateMeasures <- character(0)
# Generate measures for numeric variables
if(length(VariablesNumeric) > 0) {
for (i in seq_along(NumericAggregators)) {
NumericMeasures <- c(NumericMeasures,paste0(' <Measure name="',standardize_name(VariablesNumeric),'-',NumericAggregatorsNames[i]
,'" column="',VariablesNumeric,'" aggregator="',NumericAggregators[i],'" visible="true"></Measure>'))
}
}
# Generate measures for date variables
if(length(VariablesDate) > 0) {
for (i in seq_along(DateAggregators)) {
DateMeasures <- c(DateMeasures,paste0(' <Measure name="',standardize_name(VariablesDate),' ',DateAggregatorsNames[i]
,'" column="',VariablesDate,'" aggregator="',DateAggregators[i],'" visible="true"></Measure>'))
}
}
# Paste measures together in desired order
Measures <- paste0( MeasurePrimaryKey,'\n', paste0(sort(c(NumericMeasures, DateMeasures)),collapse='\n'),collapse='\n')
# ---------- Define Header and Footer for cube
CubeHeader <- paste0(c(
paste0('<Schema name="R Generated Cube">'),
paste0(' <Cube name="',TableName,'" visible="true" cache="true" enabled="true">'),
paste0(' <Table name="',TableName,'"></Table>'))
,collapse='\n')
CubeFooter <- '</Cube></Schema>'
# --------- Generate connection file
ConnectionFile <- paste0('type=OLAP
name=RData
driver=mondrian.olap4j.MondrianOlap4jDriver
location=jdbc:mondrian:Jdbc=jdbc:sqlite:',getwd(),'/__this_is_temporary_db__.db; \\
Catalog=',CubeDestination,';JdbcDrivers=org.sqlite.JDBC
')
# ---------- Write results to file
# Write Cube
writeLines(paste0(CubeHeader,'\n',Dimensions,'\n\n',Measures,'\n',CubeFooter),con=CubeDestination)
# Write Data source
writeLines(ConnectionFile,con=DataSourceDestination)
}
# Example 1
mtcars2 <- mtcars
mtcars2$id <- seq(1:nrow(mtcars2))
mtcars2$vs <- as.integer(mtcars2$vs)
mtcars2$am <- as.integer(mtcars2$am)
mtcars2$gear <- as.integer(mtcars2$gear)
mtcars2$carb <- as.integer(mtcars2$carb)
GenerateCubeSQLite(mtcars2,
PrimaryKey='id',
CubeDestination='/home/tgr/Applications/Saiku/saiku-server/tomcat/webapps/saiku/WEB-INF/classes/foodmart/test.xml',
DataSourceDestination='/home/tgr/Applications/Saiku/saiku-server/tomcat/webapps/saiku/WEB-INF/classes/saiku-datasources/test')
# Example 2
big_portfolio <- read.csv(file='big_portfolio.csv',header=TRUE,sep=';',stringsAsFactors=FALSE)
big_portfolio$origination_date <- as.Date(big_portfolio$origination_date)
big_portfolio$repayment_date <- as.Date(big_portfolio$repayment_date,format='%Y-%m-%d')
str(big_portfolio)
GenerateCubeSQLite(big_portfolio,
PrimaryKey='id',
CubeDestination='/home/tgr/Applications/Saiku/saiku-server/tomcat/webapps/saiku/WEB-INF/classes/foodmart/test.xml',
DataSourceDestination='/home/tgr/Applications/Saiku/saiku-server/tomcat/webapps/saiku/WEB-INF/classes/saiku-datasources/test')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment