- Install tools to build:
sudo apt-get update
sudo apt-get install kernel-package fakeroot wget bzip2
- Linux-2.6.39.1-linode34 is same as regular 2.6.39
/* | |
* MIT License | |
Copyright (c) 2020-2021 | |
Authors: Sacheendra Talluri, Giulia Frascaria, and Animesh Trivedi | |
This code is part of the Storage System Course at VU Amsterdam | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal |
" Don't try to be vi compatible | |
set nocompatible | |
" Helps force plugins to load correctly when it is turned back on below | |
filetype off | |
" TODO: Load plugins here (pathogen or vundle) | |
" Turn on syntax highlighting | |
syntax on |
This patch makes the following changes: | |
* moves two common functions, "getNullCount" and "splitAndTransferValidityBuffer", to the top-level BaseValueVector. This change requires moving "validityBuffer" to the BaseValueVector class (as recommended in this TODO: https://github.com/apache/arrow/blob/master/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java#L89) | |
* optimize the implementation of loadValidityBuffer (in the BaseValueVector) to just pass the reference for the validity buffer read from the storage | |
* optimize for the common boundary condition when all variables are valid (as done in the C++ code: https://github.com/apache/arrow/blob/master/cpp/src/arrow/array.h#L290) | |
The optimization delivers a measurable performance improvement. | |
Tests: Read 50M integers from a single Int column (2GB). |
// Author: Animesh Trivedi | |
// atr@zurich.ibm.com | |
import org.apache.spark.sql.{SaveMode, SparkSession} | |
import scala.collection.mutable.ListBuffer | |
import scala.util.Random | |
private def generateTSRecord(key: Array[Byte], recBuf:Array[Byte], rand: Random): Unit = { | |
val fixed = 10 |
# Command to launch TPCDS: | |
# ./bin/spark-submit -v --master local[2] --class com.ibm.crail.spark.tools.ParquetGenerator ~/jars/parquet-generator-1.0.jar -c tpcds -o crail://localhost:9060/F1/tpcds/ -p 4 -t 4 -tsf 1 -tdsd /home/atr/zrl/external/github/databricks/tpcds-kit/tools/ -tdd 1 | |
# And you need to put core-site.xml from crail into the conf folder. | |
# Licensed to the Apache Software Foundation (ASF) under one or more | |
# contributor license agreements. See the NOTICE file distributed with | |
# this work for additional information regarding copyright ownership. | |
# The ASF licenses this file to You under the Apache License, Version 2.0 | |
# (the "License"); you may not use this file except in compliance with | |
# the License. You may obtain a copy of the License at |
crail.blocksize 4096 | |
crail.buffersize 4096 | |
#crail.buffersize 1048576 | |
#crail.buffersize 8192 | |
#crail.slicesize 8192 | |
crail.regionsize 1073741824 | |
crail.cachelimit 1073741824 |
/* This code snippet is a part of the blog at | |
https://github.com/animeshtrivedi/blog/blob/master/post/2017-12-26-arrow.md | |
*/ | |
import com.google.common.collect.ImmutableList; | |
import org.apache.arrow.memory.RootAllocator; | |
import org.apache.arrow.vector.*; | |
import org.apache.arrow.vector.dictionary.DictionaryProvider; | |
import org.apache.arrow.vector.types.FloatingPointPrecision; | |
import org.apache.arrow.vector.types.pojo.ArrowType; |
/* This code snippet is a part of the blog at | |
https://github.com/animeshtrivedi/blog/blob/master/post/2017-12-26-arrow.md | |
*/ | |
import org.apache.hadoop.fs.FSDataInputStream; | |
import java.io.IOException; | |
import java.nio.ByteBuffer; | |
import java.nio.channels.SeekableByteChannel; | |
/** |
/* This code snippet is a part of the blog at | |
https://github.com/animeshtrivedi/blog/blob/master/post/2017-12-26-arrow.md | |
*/ | |
import org.apache.hadoop.fs.FSDataOutputStream; | |
import java.io.IOException; | |
import java.nio.ByteBuffer; | |
import java.nio.channels.WritableByteChannel; |