Skip to content

Instantly share code, notes, and snippets.

@wesm
Created July 29, 2018 23:16
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wesm/04e9a1c11f446887ad40978ddd7f8383 to your computer and use it in GitHub Desktop.
Save wesm/04e9a1c11f446887ad40978ddd7f8383 to your computer and use it in GitHub Desktop.
Wes's Apache Arrow helper bash scripts
#!/usr/bin/env bash

# Compiler selection; the LLVM version tracks the clang version.
ARROW_CLANG_VERSION="6.0"
ARROW_LLVM_VERSION="${ARROW_CLANG_VERSION}"
ARROW_GCC="gcc"
ARROW_GXX="g++"
export ARROW_CLANG_VERSION ARROW_LLVM_VERSION ARROW_GCC ARROW_GXX

# PYARROW_* settings (presumably consumed by the pyarrow build -- confirm).
PYARROW_WITH_ORC=1
PYARROW_WITH_PARQUET=1
PYARROW_WITH_PLASMA=1
PYARROW_BUNDLE_ARROW_CPP=0
PYARROW_BUNDLE_BOOST=0
PYARROW_PARALLEL=4
PYARROW_CMAKE_GENERATOR="Ninja"
export PYARROW_WITH_ORC PYARROW_WITH_PARQUET PYARROW_WITH_PLASMA
export PYARROW_BUNDLE_ARROW_CPP PYARROW_BUNDLE_BOOST
export PYARROW_PARALLEL PYARROW_CMAKE_GENERATOR

# Default Xcode developer directory; DEVELOPER_DIR mirrors XCODE_ROOT.
XCODE_ROOT="/Applications/Xcode.app/Contents/Developer"
DEVELOPER_DIR="${XCODE_ROOT}"
export XCODE_ROOT DEVELOPER_DIR

# Generator flag that is spliced into the *_GCC_OPTIONS strings.
# Swap in the empty definition to fall back to CMake's default generator.
# export USE_NINJA_BUILD=
USE_NINJA_BUILD="-GNinja"
export USE_NINJA_BUILD
# Export the macOS compiler settings: deployment target 10.9, gcc/g++ taken
# from the current $XCODE_ROOT, and the conda environment path.
osx_toolchain() {
  MACOSX_DEPLOYMENT_TARGET="10.9"
  CC="${XCODE_ROOT}/usr/bin/gcc"
  CXX="${XCODE_ROOT}/usr/bin/g++"
  CONDA_ENV_PATH="/Users/wesm/anaconda/envs/arrow-test"
  export MACOSX_DEPLOYMENT_TARGET CC CXX CONDA_ENV_PATH
}
# Export the Linux compiler settings: versioned clang/clang++ (suffix taken
# from $ARROW_CLANG_VERSION) and the two toolchain prefixes under $HOME.
linux_toolchain() {
  CC="clang-${ARROW_CLANG_VERSION}"
  CXX="clang++-${ARROW_CLANG_VERSION}"
  CPP_TOOLCHAIN="${HOME}/cpp-toolchain"
  CPP_RUNTIME_TOOLCHAIN="${HOME}/cpp-runtime-toolchain"
  export CC CXX CPP_TOOLCHAIN CPP_RUNTIME_TOOLCHAIN
}
# Point XCODE_ROOT / DEVELOPER_DIR at the Xcode-6 install, then re-derive the
# macOS compiler settings from it.
xcode64() {
  XCODE_ROOT="/Applications/Xcode-6.app/Contents/Developer"
  DEVELOPER_DIR="${XCODE_ROOT}"
  export XCODE_ROOT DEVELOPER_DIR
  osx_toolchain
}
# Dispatch to the platform toolchain: osx_toolchain when $OSTYPE starts with
# "darwin", linux_toolchain for everything else.
system_toolchain() {
  case "${OSTYPE}" in
    darwin*)
      osx_toolchain
      ;;
    *)
      linux_toolchain
      ;;
  esac
}
# Pick CC/CXX (and related paths) for the current platform.
system_toolchain

# Optional Arrow components; spliced into the cmake invocations as
# -DARROW_GPU / -DARROW_TENSORFLOW.
export ARROW_BUILD_GPU=ON
export ARROW_BUILD_TENSORFLOW=ON

# Path of llvm-symbolizer as found on PATH; empty when it is not installed.
ASAN_SYMBOLIZER_PATH="$(type -p llvm-symbolizer)"
export ASAN_SYMBOLIZER_PATH

export TOOLCHAIN_CUDA_VERSION=8.0

# Install prefixes for debug and release builds; debug is the default.
DEBUG_TP_DIR="${HOME}/local"
RELEASE_TP_DIR="${HOME}/local-release"
export TP_DIR="${DEBUG_TP_DIR}"
export TOOLCHAIN_BUILD_TYPE=debug

export PARQUET_ROOT="${HOME}/code/parquet-cpp"
export PARQUET_TEST_DATA="${PARQUET_ROOT}/data"

# Snapshots that set_build_env uses to rebuild PATH / LD_LIBRARY_PATH.
# LD_LIBRARY_PATH_BAK keeps its first value when this file is sourced again.
export LD_LIBRARY_PATH_BAK="${LD_LIBRARY_PATH_BAK:=${LD_LIBRARY_PATH}}"
# NOTE(review): unlike LD_LIBRARY_PATH_BAK, PATH_BAK is re-captured on every
# source, so it can already contain entries added by an earlier run -- confirm
# this asymmetry is intended.
export PATH_BAK="${PATH}"

export ARROW_LIBHDFS3_DIR="${HOME}/anaconda3/lib"

export CUDA_HOME="/usr/local/cuda-${TOOLCHAIN_CUDA_VERSION}"
export CUDA_TOOLKIT_ROOT_DIR="${CUDA_HOME}"
# Add the ':' separator only when LD_LIBRARY_PATH already has content. A
# trailing ':' is an empty entry, which the dynamic loader interprets as the
# current working directory.
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PATH="${CUDA_HOME}/bin:${PATH}"

# Use local ruby
export PATH="${HOME}/ruby/bin:${PATH}"
#######################################
# Export the environment for an Arrow / Parquet build rooted at $TP_DIR and
# print a summary of the result.
# Globals read:
#   TP_DIR, CPP_TOOLCHAIN, CPP_RUNTIME_TOOLCHAIN, PATH_BAK,
#   LD_LIBRARY_PATH_BAK, PKG_CONFIG_PATH, TOOLCHAIN_BUILD_TYPE,
#   USE_NINJA_BUILD, ARROW_BUILD_GPU, EXTRA_ARROW_FLAGS, EXTRA_PARQUET_FLAGS,
#   CC, CXX, ARROW_CXXFLAGS
# Globals written (exported):
#   ARROW_BUILD_TOOLCHAIN, PARQUET_BUILD_TOOLCHAIN, BOOST_ROOT, PROTOBUF_HOME,
#   ARROW_HOME, PARQUET_HOME, PATH, LD_LIBRARY_PATH, PKG_CONFIG_PATH,
#   PYARROW_BUILD_TYPE, ARROW_GCC_OPTIONS, PARQUET_CXXFLAGS, PYARROW_CXXFLAGS,
#   PARQUET_GCC_OPTIONS
# Outputs: the summary lines on stdout.
#######################################
function set_build_env() {
  echo "Thirdparty dir: $TP_DIR"

  export ARROW_BUILD_TOOLCHAIN="${CPP_TOOLCHAIN}"
  export PARQUET_BUILD_TOOLCHAIN="${CPP_TOOLCHAIN}"
  export BOOST_ROOT="${CPP_TOOLCHAIN}"
  # libprotobuf used by Orc EP build
  export PROTOBUF_HOME="${CPP_TOOLCHAIN}"
  export ARROW_HOME="${TP_DIR}"
  export PARQUET_HOME="${TP_DIR}"

  # PATH and LD_LIBRARY_PATH are rebuilt from their backups so that switching
  # between debug and release does not accumulate entries.
  export PATH="${CPP_TOOLCHAIN}/bin:${PATH_BAK}"
  # Prepend the backup only when it is non-empty: a leading ':' would be an
  # empty entry, which the dynamic loader treats as the current directory.
  export LD_LIBRARY_PATH="${LD_LIBRARY_PATH_BAK:+${LD_LIBRARY_PATH_BAK}:}${TP_DIR}/lib"
  export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CPP_RUNTIME_TOOLCHAIN}/lib"
  # export GTEST_HOME=$CPP_TOOLCHAIN
  export PKG_CONFIG_PATH="${PKG_CONFIG_PATH:+${PKG_CONFIG_PATH}:}${TP_DIR}/lib/pkgconfig"
  export PYARROW_BUILD_TYPE="${TOOLCHAIN_BUILD_TYPE}"
  # export CXXFLAGS="-Werror -Wall -fno-omit-frame-pointer"

  # The option strings are single space-separated values; callers expand them
  # unquoted so that cmake receives one argument per flag.
  # -DARROW_ORC=on \
  ARROW_GCC_OPTIONS="${USE_NINJA_BUILD}"
  ARROW_GCC_OPTIONS+=" -DCMAKE_INSTALL_PREFIX=${TP_DIR}"
  ARROW_GCC_OPTIONS+=" -DCMAKE_BUILD_TYPE=${TOOLCHAIN_BUILD_TYPE}"
  ARROW_GCC_OPTIONS+=" -DCMAKE_CXX_FLAGS='-D_GLIBCXX_USE_CXX11_ABI=0'"
  ARROW_GCC_OPTIONS+=" -DARROW_VERBOSE_THIRDPARTY_BUILD=off"
  ARROW_GCC_OPTIONS+=" -DARROW_NO_DEPRECATED_API=on"
  ARROW_GCC_OPTIONS+=" -DARROW_EXTRA_ERROR_CONTEXT=on"
  ARROW_GCC_OPTIONS+=" -DARROW_BOOST_USE_SHARED=on"
  ARROW_GCC_OPTIONS+=" -DARROW_BUILD_BENCHMARKS=on"
  ARROW_GCC_OPTIONS+=" -DARROW_BUILD_TESTS=on"
  ARROW_GCC_OPTIONS+=" -DARROW_HDFS=on"
  ARROW_GCC_OPTIONS+=" -DARROW_ORC=on"
  ARROW_GCC_OPTIONS+=" -DARROW_PYTHON=on"
  ARROW_GCC_OPTIONS+=" -DARROW_GPU=${ARROW_BUILD_GPU}"
  ARROW_GCC_OPTIONS+=" ${EXTRA_ARROW_FLAGS}"
  export ARROW_GCC_OPTIONS

  export PARQUET_CXXFLAGS="-DARROW_NO_DEPRECATED_API"
  export PYARROW_CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0"

  PARQUET_GCC_OPTIONS="${USE_NINJA_BUILD}"
  PARQUET_GCC_OPTIONS+=" -DCMAKE_INSTALL_PREFIX=${TP_DIR}"
  PARQUET_GCC_OPTIONS+=" -DCMAKE_BUILD_TYPE=${TOOLCHAIN_BUILD_TYPE}"
  PARQUET_GCC_OPTIONS+=" -DCMAKE_CXX_FLAGS='-D_GLIBCXX_USE_CXX11_ABI=0'"
  PARQUET_GCC_OPTIONS+=" -DPARQUET_BOOST_USE_SHARED=on"
  PARQUET_GCC_OPTIONS+=" -DPARQUET_BUILD_BENCHMARKS=on"
  PARQUET_GCC_OPTIONS+=" -DPARQUET_THRIFT_USE_BOOST=off"
  PARQUET_GCC_OPTIONS+=" ${EXTRA_PARQUET_FLAGS}"
  export PARQUET_GCC_OPTIONS

  echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
  echo "CC: $CC"
  echo "CXX: $CXX"
  echo "ARROW_CXXFLAGS: $ARROW_CXXFLAGS"
  echo "ARROW_OPTIONS: $ARROW_GCC_OPTIONS"
  echo "PARQUET_OPTIONS: $PARQUET_GCC_OPTIONS"
  echo "PARQUET_CXXFLAGS: $PARQUET_CXXFLAGS"
  # Report the variable named by the label (this line used to print
  # $ARROW_CXXFLAGS).
  echo "PYARROW_CXXFLAGS: $PYARROW_CXXFLAGS"
}
# Switch to the debug configuration: debug install prefix and build type,
# ASAN off, CHECKIN warning level for Arrow and Parquet, then refresh the
# build environment via set_build_env.
debug() {
  TP_DIR="${DEBUG_TP_DIR}"
  TOOLCHAIN_BUILD_TYPE="debug"
  ASAN_IF_ENABLED="OFF"
  # The toolchain flags and the warning-level flag are separated by a newline.
  EXTRA_ARROW_FLAGS="${ARROW_TOOLCHAIN_FLAGS}"$'\n'"-DBUILD_WARNING_LEVEL=CHECKIN"
  EXTRA_PARQUET_FLAGS="-DPARQUET_BUILD_WARNING_LEVEL=CHECKIN -Werror"
  export TP_DIR TOOLCHAIN_BUILD_TYPE ASAN_IF_ENABLED
  export EXTRA_ARROW_FLAGS EXTRA_PARQUET_FLAGS
  set_build_env
}
# Switch to the release configuration: release install prefix and build type,
# ASAN off, no extra Arrow/Parquet flags, then refresh the build environment
# via set_build_env.
release() {
  export TP_DIR="${RELEASE_TP_DIR}" TOOLCHAIN_BUILD_TYPE="release"
  export ASAN_IF_ENABLED="OFF"
  export EXTRA_ARROW_FLAGS="" EXTRA_PARQUET_FLAGS=""
  set_build_env
}
#######################################
# Re-apply the configuration matching $TOOLCHAIN_BUILD_TYPE: `release` when it
# is exactly "release", `debug` for any other value (including unset/empty).
# Globals: TOOLCHAIN_BUILD_TYPE (read)
# Returns: the status of the release/debug call.
#######################################
function set_build_type_flags() {
  # [[ ]] with a quoted operand: an unset, empty or multi-word build type
  # falls through to debug without raising a `[` syntax error.
  if [[ "${TOOLCHAIN_BUILD_TYPE:-}" == "release" ]]; then
    release
  else
    debug
  fi
}
#######################################
# Select versioned clang/clang++ as the compilers, enable fuzzing, and
# re-apply the current debug/release configuration.
# Globals:
#   ARROW_CLANG_VERSION, ASAN_IF_ENABLED (read)
#   CC, CXX, ARROW_TOOLCHAIN_FLAGS (exported)
#######################################
function toolchain_clang {
  # export CC=$CLANG_TOOLS_PATH/clang
  # export CXX=$CLANG_TOOLS_PATH/clang++
  export CC="clang-${ARROW_CLANG_VERSION}"
  export CXX="clang++-${ARROW_CLANG_VERSION}"

  # ASAN_IF_ENABLED is only assigned inside debug/release, which run after
  # this string is built. Default to OFF (the value both of them assign) so
  # the first call does not emit a valueless "-DARROW_USE_ASAN=".
  ARROW_TOOLCHAIN_FLAGS="-DARROW_FUZZING=ON"
  ARROW_TOOLCHAIN_FLAGS+=" -DARROW_TEST_MEMCHECK=off"
  ARROW_TOOLCHAIN_FLAGS+=" -DARROW_USE_ASAN=${ASAN_IF_ENABLED:-OFF}"
  export ARROW_TOOLCHAIN_FLAGS

  set_build_type_flags
}
# Select the configured GCC compilers ($ARROW_GCC / $ARROW_GXX), turn fuzzing,
# memcheck and ASAN off, and re-apply the current debug/release configuration.
toolchain_gcc() {
  CC="${ARROW_GCC}"
  CXX="${ARROW_GXX}"
  ARROW_TOOLCHAIN_FLAGS="-DARROW_FUZZING=OFF"
  ARROW_TOOLCHAIN_FLAGS+=" -DARROW_TEST_MEMCHECK=off"
  ARROW_TOOLCHAIN_FLAGS+=" -DARROW_USE_ASAN=OFF"
  export CC CXX ARROW_TOOLCHAIN_FLAGS
  set_build_type_flags
}
# Select gcc-4.8 / g++-4.8, turn fuzzing, memcheck and ASAN off, and re-apply
# the current debug/release configuration.
toolchain_gcc48() {
  CC="gcc-4.8"
  CXX="g++-4.8"
  ARROW_TOOLCHAIN_FLAGS="-DARROW_FUZZING=OFF"
  ARROW_TOOLCHAIN_FLAGS+=" -DARROW_TEST_MEMCHECK=off"
  ARROW_TOOLCHAIN_FLAGS+=" -DARROW_USE_ASAN=OFF"
  export CC CXX ARROW_TOOLCHAIN_FLAGS
  set_build_type_flags
}
# clang is the default toolchain for a freshly sourced shell.
toolchain_clang

PATH="${CPP_TOOLCHAIN}/bin:${PATH}"
export PATH
export ARROW_USE_CCACHE=1
# export TERM=xterm-color

# Java / Hadoop locations, borrowed from Impala's thirdparty tree (see the
# output of impala-config.sh).
export JAVA_HOME="/usr/lib/jvm/java-8-oracle"
HADOOP_HOME="/home/wesm/code/cloudera/impala/thirdparty/hadoop-2.6.0-cdh5.7.0"
# HADOOP_HOME=/home/wesm/code/cloudera/impala/thirdparty/hadoop-2.6.0-cdh5.7.0-SNAPSHOT
export HADOOP_HOME
# Fall back to the 5.8.0 snapshot when the 5.7.0 directory is absent.
[[ -d "${HADOOP_HOME}" ]] \
  || export HADOOP_HOME="/home/wesm/code/cloudera/impala/thirdparty/hadoop-2.6.0-cdh5.8.0-SNAPSHOT"

# This avoids native-hadoop loading error / warning =( =(
if [[ -d "${HADOOP_HOME}" ]]; then
  CLASSPATH="$("${HADOOP_HOME}/bin/hadoop" classpath --glob)"
  export CLASSPATH
  export HADOOP_OPTS="${HADOOP_OPTS} -Djava.library.path=${HADOOP_HOME}/lib/native"
  export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${HADOOP_HOME}/lib/native/"
fi

# Connection settings for the HDFS tests.
ARROW_HDFS_TEST_HOST="localhost"
ARROW_HDFS_TEST_PORT=20500
ARROW_HDFS_TEST_USER="wesm"
export ARROW_HDFS_TEST_HOST ARROW_HDFS_TEST_PORT ARROW_HDFS_TEST_USER
# Run cmake on the parent directory with the flags from $PARQUET_GCC_OPTIONS
# plus -DPARQUET_CXXFLAGS.
parquet_cmake() {
  # The option string is deliberately expanded unquoted so that each
  # space-separated flag becomes its own cmake argument.
  # shellcheck disable=SC2206
  local -a cmake_args=( ${PARQUET_GCC_OPTIONS} )
  cmake_args+=( "-DPARQUET_CXXFLAGS=${PARQUET_CXXFLAGS}" .. )
  cmake "${cmake_args[@]}"
}
#######################################
# Rebuild parquet-cpp from an emptied ~/code/parquet-cpp/library-build
# directory (tests and executables off) and install it into $TP_DIR.
# Globals:   TP_DIR, TOOLCHAIN_BUILD_TYPE, PARQUET_CXXFLAGS (read)
# Arguments: none
# Returns:   non-zero, without deleting anything, when the build directory
#            cannot be created or entered; otherwise the status of popd.
#######################################
function parquet_cpp_update {
  local build_dir
  build_dir=~/code/parquet-cpp/library-build

  # rm -rf ~/code/parquet-cpp/library-build
  # Stop unless we are really inside the build directory: the wipe below
  # would otherwise delete the contents of the caller's current directory.
  mkdir -p "${build_dir}" || return
  pushd "${build_dir}" || return

  rm -rf -- ./*
  cmake -GNinja \
    "-DCMAKE_INSTALL_PREFIX=${TP_DIR}" \
    "-DCMAKE_BUILD_TYPE=${TOOLCHAIN_BUILD_TYPE}" \
    -DCMAKE_CXX_FLAGS='-D_GLIBCXX_USE_CXX11_ABI=0' \
    -DPARQUET_BUILD_TESTS=OFF \
    -DPARQUET_BUILD_EXECUTABLES=OFF \
    -DPARQUET_BOOST_USE_SHARED=ON \
    "-DPARQUET_CXXFLAGS=${PARQUET_CXXFLAGS}" \
    ..
  ninja clean
  ninja
  ninja install
  popd
}
#######################################
# Rebuild Arrow C++ from an emptied ~/code/arrow/cpp/library-build directory
# (tests and benchmarks off) and install it into $TP_DIR.
# Globals:
#   TP_DIR, TOOLCHAIN_BUILD_TYPE, ARROW_BUILD_GPU, ARROW_BUILD_TENSORFLOW (read)
# Arguments: none
# Returns:   non-zero, without deleting anything, when the build directory
#            cannot be created or entered; otherwise the status of popd.
#######################################
function arrow_cpp_update {
  local build_dir
  build_dir=~/code/arrow/cpp/library-build

  # rm -rf ~/code/arrow/cpp/library-build
  # Stop unless we are really inside the build directory: the wipe below
  # would otherwise delete the contents of the caller's current directory.
  mkdir -p "${build_dir}" || return
  pushd "${build_dir}" || return

  rm -rf -- ./*
  # -DCMAKE_BUILD_TYPE used to be passed twice with the same value; one copy
  # is enough.
  cmake -GNinja \
    "-DCMAKE_INSTALL_PREFIX=${TP_DIR}" \
    "-DCMAKE_BUILD_TYPE=${TOOLCHAIN_BUILD_TYPE}" \
    -DCMAKE_CXX_FLAGS='-D_GLIBCXX_USE_CXX11_ABI=0' \
    -DARROW_EXTRA_ERROR_CONTEXT=ON \
    -DARROW_NO_DEPRECATED_API=OFF \
    -DARROW_BOOST_USE_SHARED=ON \
    -DARROW_BUILD_BENCHMARKS=off \
    "-DARROW_GPU=${ARROW_BUILD_GPU}" \
    -DARROW_ORC=on \
    -DARROW_PLASMA=on \
    -DARROW_PYTHON=on \
    "-DARROW_TENSORFLOW=${ARROW_BUILD_TENSORFLOW}" \
    -DARROW_BUILD_TESTS=off \
    ..
  ninja clean
  ninja
  ninja install
  popd
}
# Run cmake on the parent directory with the flags from $ARROW_GCC_OPTIONS,
# Plasma enabled, -Werror prepended to $ARROW_CXXFLAGS, and the current
# build type.
arrow_cmake() {
  # Both unquoted expansions are intentional: the option string must split
  # into one cmake argument per flag, and the build type is passed through
  # exactly as the shell expands it.
  # shellcheck disable=SC2206
  local -a cmake_args=( ${ARROW_GCC_OPTIONS} )
  # shellcheck disable=SC2206
  cmake_args+=(
    -DARROW_PLASMA=on
    "-DARROW_CXXFLAGS=-Werror ${ARROW_CXXFLAGS}"
    -DCMAKE_BUILD_TYPE=${TOOLCHAIN_BUILD_TYPE}
    ..
  )
  cmake "${cmake_args[@]}"
}
# Switch to the GCC toolchain, then configure Arrow with cmake in the current
# directory.
arrow_gcc() {
  toolchain_gcc
  arrow_cmake
}
# Switch to the clang toolchain, then configure Arrow with cmake in the
# current directory.
arrow_clang() {
  toolchain_clang
  arrow_cmake
}
# Build the pyarrow extension modules in place, with Parquet and Plasma
# support, using setup.py from the current directory.
build_pyarrow() {
  local -a build_ext_flags=(--inplace --with-parquet --with-plasma)
  python setup.py build_ext "${build_ext_flags[@]}"
}
#######################################
# Rebuild Arrow C++, then clean, configure, build, install and test the
# Arrow GLib bindings in $HOME/code/arrow/c_glib.
# Globals:
#   HOME, TP_DIR (read)
#   PKG_CONFIG_PATH, GI_TYPELIB_PATH (exported), GLIB_CXXFLAGS (written)
# Arguments: none
# Returns:   non-zero, before anything is cleaned, when the c_glib directory
#            cannot be entered; otherwise the status of popd.
#######################################
function arrow_glib_test {
  arrow_cpp_update

  # `git clean -fdx .` deletes untracked and ignored files below the current
  # directory, so it must never run unless the pushd succeeded.
  pushd "${HOME}/code/arrow/c_glib" || return
  git clean -fdx .

  export PKG_CONFIG_PATH="${TP_DIR}/lib/pkgconfig:${PKG_CONFIG_PATH}"
  GLIB_CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0"
  ./autogen.sh
  ./configure "CXXFLAGS=${GLIB_CXXFLAGS}" "CFLAGS=${GLIB_CXXFLAGS}" \
    "--prefix=${TP_DIR}" --enable-gtk-doc
  CXXFLAGS="${GLIB_CXXFLAGS}" CFLAGS="${GLIB_CXXFLAGS}" make -j8
  make install
  export GI_TYPELIB_PATH="${TP_DIR}/lib/girepository-1.0"
  NO_MAKE=yes test/run-test.sh
  popd
}
#######################################
# Pre-submission checks: configure Arrow C++ in a dedicated preflight build
# directory and run the `format` and `lint` ninja targets, then run flake8
# (plain and Cython configurations) over the Python package.
# Globals:   HOME (read), ARROW_PREFLIGHT_DIR (written)
# Arguments: none
# Returns:   non-zero as soon as a working directory cannot be created or
#            entered; otherwise the status of the final popd.
#######################################
function arrow_preflight {
  ARROW_PREFLIGHT_DIR="${HOME}/code/arrow/cpp/preflight"

  # Every step below acts on the current directory, so do not continue from
  # the wrong place if either directory change fails.
  mkdir -p "${ARROW_PREFLIGHT_DIR}" || return
  pushd "${ARROW_PREFLIGHT_DIR}" || return
  arrow_cmake
  ninja format
  ninja lint
  popd

  pushd "${HOME}/code/arrow/python" || return
  flake8 --count pyarrow
  flake8 --count --config=.flake8.cython pyarrow
  popd
}
# Switch to the GCC toolchain, then run cmake on the parent directory with
# PANDAS_BUILD_CYTHON turned off.
pandas_gcc() {
  toolchain_gcc
  cmake -DPANDAS_BUILD_CYTHON=off ..
}
# Switch to the clang toolchain, then run cmake on the parent directory with
# PANDAS_BUILD_CYTHON turned off.
pandas_clang() {
  toolchain_clang
  cmake -DPANDAS_BUILD_CYTHON=off ..
}
# Switch to the clang toolchain, then configure parquet-cpp with cmake in the
# current directory.
parquet_clang_cmake() {
  toolchain_clang
  parquet_cmake
}
#######################################
# Download the apache/arrow source archive for a git ref and print its
# SHA-256.
# Arguments: $1 - tag, branch or commit to fetch (<ref>.tar.gz)
# Outputs:   "<sha256> <temporary file name>" on stdout. The file has already
#            been removed; the second field is kept for compatibility with the
#            previous two-field output.
# Returns:   0 on success; 2 when no ref is given; otherwise the failing
#            status of mktemp, wget or sha256sum.
#######################################
function get_arrow_sha256 {
  local ref="${1:-}"
  local tarball checksum_line status

  if [[ -z "${ref}" ]]; then
    printf 'usage: get_arrow_sha256 <git-ref>\n' >&2
    return 2
  fi

  # A private temp file instead of a uuidgen-named file in the caller's
  # current directory.
  tarball="$(mktemp)" || return

  # Hash only after a successful download, so a failed fetch is reported
  # instead of yielding the digest of an empty or partial file.
  wget "https://github.com/apache/arrow/archive/${ref}.tar.gz" -O "${tarball}" \
    && checksum_line="$(sha256sum "${tarball}")"
  status=$?
  rm -f -- "${tarball}"

  if (( status != 0 )); then
    printf 'get_arrow_sha256: could not fetch or hash %s\n' "${ref}" >&2
    return "${status}"
  fi

  # sha256sum prints "<digest>  <file>"; emit the two fields with one space.
  printf '%s %s\n' "${checksum_line%% *}" "${tarball}"
}
#######################################
# Download the apache/parquet-cpp source archive for a git ref and print its
# SHA-256.
# Arguments: $1 - tag, branch or commit to fetch (<ref>.tar.gz)
# Outputs:   "<sha256> <temporary file name>" on stdout. The file has already
#            been removed; the second field is kept for compatibility with the
#            previous two-field output.
# Returns:   0 on success; 2 when no ref is given; otherwise the failing
#            status of mktemp, wget or sha256sum.
#######################################
function get_parquet_sha256 {
  local ref="${1:-}"
  local tarball checksum_line status

  if [[ -z "${ref}" ]]; then
    printf 'usage: get_parquet_sha256 <git-ref>\n' >&2
    return 2
  fi

  # A private temp file instead of a uuidgen-named file in the caller's
  # current directory.
  tarball="$(mktemp)" || return

  # Hash only after a successful download, so a failed fetch is reported
  # instead of yielding the digest of an empty or partial file.
  wget "https://github.com/apache/parquet-cpp/archive/${ref}.tar.gz" -O "${tarball}" \
    && checksum_line="$(sha256sum "${tarball}")"
  status=$?
  rm -f -- "${tarball}"

  if (( status != 0 )); then
    printf 'get_parquet_sha256: could not fetch or hash %s\n' "${ref}" >&2
    return "${status}"
  fi

  # sha256sum prints "<digest>  <file>"; emit the two fields with one space.
  printf '%s %s\n' "${checksum_line%% *}" "${tarball}"
}
# Clear the ccache cache, then rebuild and reinstall Arrow C++ followed by
# parquet-cpp.
update_tp_toolchain() {
  ccache -C
  arrow_cpp_update
  parquet_cpp_update
}
#######################################
# Rebuild the C++ toolchain libraries, then rebuild the pyarrow extensions in
# place from a clean build/ directory.
# Globals:   HOME (read)
# Arguments: none
# Returns:   non-zero, before build/ is removed, when the python directory
#            cannot be entered; otherwise the status of popd.
#######################################
function update_pyarrow {
  update_tp_toolchain

  # `rm -rf build/` is relative to the current directory, so it must only run
  # once we are inside the pyarrow source tree.
  pushd "${HOME}/code/arrow/python" || return
  rm -rf build/
  python setup.py build_ext --inplace
  popd
}
#----------------------------------------------------------------------
# Spark: put the Maven 3.3.9 install under $HOME first on PATH and set the
# Spark checkout location and Maven JVM options.
SPARK_HOME="${HOME}/code/spark"
MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"
PATH="${HOME}/java/maven-3.3.9/bin:${PATH}"
export PATH SPARK_HOME MAVEN_OPTS

# ----------------------------------------------------------------------
# OCaml: source opam's init script; its output is discarded and a missing or
# failing script is ignored.
. /home/wesm/.opam/opam-init/init.sh >/dev/null 2>&1 || true
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment