Skip to content

Instantly share code, notes, and snippets.

@prateek
Created August 14, 2014 19:55
Show Gist options
  • Save prateek/a5afdc33d8892e6ca42f to your computer and use it in GitHub Desktop.
Save prateek/a5afdc33d8892e6ca42f to your computer and use it in GitHub Desktop.

Install Steps for RMR

These steps have been tested with

  • Oracle Linux 6.4
  • RHEL 6.5
  • CDH 5.1

Note: I wish this were parceled up.

Step 0 (all nodes)

sudo bash
cd /tmp

enable epel repo (all nodes)

wget http://download.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm
rpm -ivh epel-release-6-8.noarch.rpm

prereqs (all nodes)

yum install -y xdg-utils bzip2-devel gcc-c++ gcc-gfortran libX11-devel pcre-devel   \
  tcl-devel tk-devel zlib-devel readline-devel libXt-devel libpng-devel cairo-devel \
  pango-devel 'libXmu.so.6()(64bit)' 'libgfortran.so.1()(64bit)' 'perl(File::Copy::Recursive)'

r-install (all-nodes)

# Download and unpack the R 2.15.3 source tarball under /tmp.
cd /tmp/
wget http://cran.rstudio.com/src/base/R-2/R-2.15.3.tar.gz
tar xvfz R-2.15.3.tar.gz
cd R-2.15.3
# --enable-R-shlib builds R as a shared library, --without-x leaves out X11
# support, --with-cairo turns on cairo graphics. Each step runs only if the
# previous one succeeded.
./configure --enable-R-shlib --without-x --with-cairo && make && sudo make install
# Make the /usr/local/bin binaries reachable under /usr/bin as well.
sudo ln -s /usr/local/bin/Rscript /usr/bin/
sudo ln -s /usr/local/bin/R /usr/bin/
cd ..

java install (all-nodes)

# Register the java, javac, jar and javah binaries from
# /usr/java/jdk1.7.0_45-cloudera with the alternatives system, each at
# priority 2000.
alternatives --install /usr/bin/java java /usr/java/jdk1.7.0_45-cloudera/bin/java 2000
alternatives --install /usr/bin/javac javac /usr/java/jdk1.7.0_45-cloudera/bin/javac 2000
alternatives --install /usr/bin/jar jar /usr/java/jdk1.7.0_45-cloudera/bin/jar 2000
alternatives --install /usr/bin/javah javah /usr/java/jdk1.7.0_45-cloudera/bin/javah 2000
# verify the java version
# java -version
# javac -version

rimpala rhadoop (all nodes)

# Java settings exported before running `R CMD javareconf` and building rJava.
export JAVA_HOME=/usr/java/jdk1.7.0_45-cloudera/
# Include directories for the JDK headers.
export JAVA_CPPFLAGS="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux"
# Runtime search path for the JDK shared libraries.
export JAVA_LD_LIBRARY_PATH="$JAVA_HOME/jre/lib/amd64/server:$JAVA_HOME/jre/lib/amd64:$JAVA_HOME/jre/../lib/amd64:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib"
# Same value as JAVA_LD_LIBRARY_PATH; derived from it so the two cannot drift.
export LD_LIBRARY_PATH="$JAVA_LD_LIBRARY_PATH"
# Link-time search path: one -L per JAVA_LD_LIBRARY_PATH entry, then -ljvm.
export JAVA_LIBS="-L$JAVA_HOME/jre/lib/amd64/server -L$JAVA_HOME/jre/lib/amd64 -L$JAVA_HOME/jre/../lib/amd64 -L/usr/java/packages/lib/amd64 -L/usr/lib64 -L/lib64 -L/lib -L/usr/lib -ljvm"

configure java for R (all nodes)

R CMD javareconf

rJava install & verify (all nodes)

# Inside R
install.packages(c("rJava"), repos="http://cran.us.r-project.org/")
library(rJava)
# Start the JVM inside this R session.
.jinit()
# Call System.getProperty("java.version") and return it as a string ("S");
# the value should correspond to the JDK configured for R in the earlier steps.
.jcall("java/lang/System","S","getProperty","java.version")

Rcpp (all nodes)

wget http://cran.us.r-project.org/src/contrib/Archive/Rcpp/Rcpp_0.9.15.tar.gz
R CMD INSTALL Rcpp_0.9.15.tar.gz

plyr v1.8 (all nodes)

wget http://cran.revolutionanalytics.com/src/contrib/Archive/plyr/plyr_1.8.tar.gz
R CMD INSTALL plyr_1.8.tar.gz

reshape2 v1.2.2 (all nodes)

wget http://cran.revolutionanalytics.com/src/contrib/Archive/reshape2/reshape2_1.2.2.tar.gz
R CMD INSTALL reshape2_1.2.2.tar.gz

Install other R packages (all nodes)

# Inside R
# "functional" is installed once, byte-compiled, by the third call below,
# so it is not listed in this first call.
install.packages(c("RJSONIO", "bitops", "digest", "RImpala"), repos="http://cran.us.r-project.org/")
install.packages(c('itertools'), repos="http://cran.revolutionanalytics.com", INSTALL_opts=c('--byte-compile') )
install.packages(c('functional', 'stringr'), repos="http://cran.revolutionanalytics.com", INSTALL_opts=c('--byte-compile') )
install.packages(c('randomForest'), repos="http://cran.revolutionanalytics.com" )
install.packages(c("caTools"), repos="http://cran.us.r-project.org/")

install git

yum install -y git

install rmr2 and rhdfs (all nodes)

# Locations of the hadoop launcher and the streaming jar inside the CDH parcel.
export HADOOP_CMD=/usr/bin/hadoop
export HADOOP_STREAMING=/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar

# Clone over https: GitHub no longer serves the unauthenticated git:// protocol.
git clone https://github.com/RevolutionAnalytics/rmr2.git
sudo R CMD INSTALL --byte-compile rmr2/pkg/

git clone https://github.com/RevolutionAnalytics/rhdfs.git
# HADOOP_CMD is passed on the command line so it is set for the command run under sudo.
sudo HADOOP_CMD=/usr/bin/hadoop R CMD INSTALL --byte-compile rhdfs/pkg/

RSTUDIO! (only edge)

wget http://download2.rstudio.org/rstudio-server-0.98.490-x86_64.rpm
yum install -y 'libssl.so.6()(64bit)' shared-mime-info
rpm -i --nodeps rstudio-server-0.98.490-x86_64.rpm

use unix login for rstudio

sudo /usr/sbin/rstudio-server stop
# Give RStudio a PAM profile copied from the system login profile.
sudo cp /etc/pam.d/login /etc/pam.d/rstudio
# rsession-which-r is a server option and is read from rserver.conf;
# append (-a) so any settings already in that file are kept.
echo rsession-which-r=/usr/local/bin/R | sudo tee -a /etc/rstudio/rserver.conf
sudo /usr/sbin/rstudio-server start

Browser

go to "edge-host:8787"

Check if RMR is working - example job

# in rstudio
# Set the hadoop launcher and streaming jar locations before loading rmr2.
Sys.setenv(
  HADOOP_CMD = "/usr/bin/hadoop",
  HADOOP_STREAMING = "/opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar"
)
library(rmr2)

# -D options handed to the hadoop backend: 8192 MB for map and reduce tasks.
memory_opts <- list(
  D = "mapreduce.map.memory.mb=8192",
  D = "mapreduce.reduce.memory.mb=8192"
)

# Map step: bind each value next to its square.
square_pairs <- function(k, v) cbind(v, v^2)

# Write 1..1000 out with to.dfs, run the job (no reduce function is supplied),
# then read the output back.
small.ints <- to.dfs(1:1000)
result <- mapreduce(
  input = small.ints,
  map = square_pairs,
  backend.parameters = list(hadoop = memory_opts)
)
from.dfs(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment