Joel Gombin joelgombin

## parquet.md

      
              1 file
            
          
              0 forks
            
          
              1 comment
            
          
              1 star
            
          
                hrbrmstr
                / parquet.md
            
            
              Created
              December 4, 2017 11:50
            
          
    This thread http://mail-archives.apache.org/mod_mbox/drill-user/201707.mbox/%3cCAHfzKEoYeE08GXtF3pJsCfExTJgJPj7nx9bjTnW3a=hgMJAxhQ@mail.gmail.com%3e from the Drill mailing list has a good discussion about that, as does this (open) JIRA issue: https://issues.apache.org/jira/browse/DRILL-3534.
You can kind of do this with Spark (http://aseigneurin.github.io/2017/03/14/incrementally-loaded-parquet-files.html), but I actually like the filesystem-based manual partitions better (but that's just me).
What I tend to do is have data-source- and date-based partitions for Parquet files. So, say, for one of our Sonar internet studies (I'll use our 3 SMTP scans for this example), I do something like:

/data/sonar/smtp/port25/yyyy/mm/dd/port25.parquet
/data/sonar/smtp/port465/yyyy/mm/dd/port465.parquet
/data/sonar/smtp/port587/yyyy/mm/dd/port587.parquet


## cartodougenik.R
library(tmap)
library(sp)
library(rgeos)
library(maptools)


# Cartogram
# algorithm from Dougenik, Chrisman, Niemeyer (1985): An Algorithm To Construct Continuous Area Cartograms. In: Professional Geographer, 37(1), 75-81.
cartogram <- function(shp, weight, itermax=15, maxSizeError=1.0001) {

## server.r
#install.packages(c("devtools","svSockets")) # if not already installed
#devtools::install_github("gosocket","analytixware") # if not already installed
#devtools::install_github("shinysky","analytixware") # if not already installed

library(shiny)
library(shinysky)
library(gosocket)

# Define server logic required to generate and plot a random distribution
shinyServer(function(input, output,session) {

## BibTeXKeyOnly.js
{
	"translatorID": "12345",
	"label": "BibTeX CiteKey-only Exporter",
	"creator": "Simon Kornblith and Richard Karnesky with tweaks by Tan",
	"target": "bib",
	"minVersion": "2.1.9",
	"maxVersion": "",
	"priority": 200,
	"inRepository": false,
	"translatorType": 3,

## SmoothCoefficientPlot.R
SmoothCoefficientPlot <- function(models, modelnames = "", removeintercept = FALSE){
  # models must be a list()

  Alphas <- seq(1, 99, 2) / 100

  Multiplier <- qnorm(1 - Alphas / 2)
  zzTransparency <<- 1/(length(Multiplier)/4)
  CoefficientTables <- lapply(models, function(x){summary(x)$coef})
  TableRows <- unlist(lapply(CoefficientTables, nrow))

## CoefficientPlot.R
CoefficientPlot <- function(models, alpha = 0.05, modelnames = ""){
  # models must be a list()

  Multiplier <- qnorm(1 - alpha / 2)
  CoefficientTables <- lapply(models, function(x){summary(x)$coef})
  TableRows <- unlist(lapply(CoefficientTables, nrow))

  if(modelnames[1] == ""){
    ModelNameLabels <- rep(paste("Model", 1:length(TableRows)), TableRows)
    } else {
	library(tmap)
	library(sp)
	library(rgeos)
	library(maptools)


	# Cartogram
	# algorithm from Dougenik, Chrisman, Niemeyer (1985): An Algorithm To Construct Continuous Area Cartograms. In: Professional Geographer, 37(1), 75-81.
	cartogram <- function(shp, weight, itermax=15, maxSizeError=1.0001) {
	#install.packages(c("devtools","svSockets")) # if not already installed
	#devtools::install_github("gosocket","analytixware") # if not already installed
	#devtools::install_github("shinysky","analytixware") # if not already installed

	library(shiny)
	library(shinysky)
	library(gosocket)

	# Define server logic required to generate and plot a random distribution
	shinyServer(function(input, output,session) {
	{
	"translatorID": "12345",
	"label": "BibTeX CiteKey-only Exporter",
	"creator": "Simon Kornblith and Richard Karnesky with tweaks by Tan",
	"target": "bib",
	"minVersion": "2.1.9",
	"maxVersion": "",
	"priority": 200,
	"inRepository": false,
	"translatorType": 3,
	SmoothCoefficientPlot <- function(models, modelnames = "", removeintercept = FALSE){
	# models must be a list()

	Alphas <- seq(1, 99, 2) / 100

	Multiplier <- qnorm(1 - Alphas / 2)
	zzTransparency <<- 1/(length(Multiplier)/4)
	CoefficientTables <- lapply(models, function(x){summary(x)$coef})
	TableRows <- unlist(lapply(CoefficientTables, nrow))
	CoefficientPlot <- function(models, alpha = 0.05, modelnames = ""){
	# models must be a list()

	Multiplier <- qnorm(1 - alpha / 2)
	CoefficientTables <- lapply(models, function(x){summary(x)$coef})
	TableRows <- unlist(lapply(CoefficientTables, nrow))

	if(modelnames[1] == ""){
	ModelNameLabels <- rep(paste("Model", 1:length(TableRows)), TableRows)
	} else {