Deploy RStudio with Dataproc

A tibble here is an abstraction of a remote Spark table in sparklyr.
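
A remote tibble can be obtained by referencing an existing Spark table or by copying a local data frame into Spark. A minimal sketch, assuming a sparklyr connection **sc**; the table name "songs" is hypothetical:

library(sparklyr)
library(dplyr)

# reference an existing Spark table as a lazy remote tibble (no data is pulled into R)
tibble <- tbl(sc, "songs")

# or copy a local data frame into Spark and get a remote tibble back
tibble <- copy_to(sc, mtcars, "mtcars_spark", overwrite = TRUE)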

# print a few rows
print(tibble, n=5, width=Inf)

# **str** only inspects the tibble object itself, **not the remote data**
str(tibble)

# to inspect the remote data the way **str** would, use **glimpse**
glimpse(tibble)

# Select and filter data
tibble %>%
  # select columns by name
  select(artist_name, release, title, year) %>%
  # Filter rows
  filter(year >= 1960, year < 1970) %>%
  # Sort rows
  arrange(artist_name, desc(year), title)

# change columns or create new one
tibble %>%
   select(x, y) %>%
   # this mutate replaces x with x * 1.5 and creates a new column y2 equal to y squared
   mutate(
     x = x * 1.5,
     y2 = y * y
   )

# summary data
tibble %>%
  select(x) %>%
  # this will return a new **tibble** with the summarised values
  summarize(x_mean = mean(x))

# select with starts_with and ends_with
tibble %>%
  select(starts_with("name")) %>%
  select(ends_with("code"))

# select columns whose names contain a substring
tibble %>%
  select(contains("ti"))

# select columns whose names match a regex
tibble %>%
  select(matches("ti.?t"))

# distinct rows by one or more columns
tibble %>%
  distinct(x, y, z)

# count rows by one or more columns
tibble %>%
  count(x, y, sort = TRUE) %>%
  # Restrict to top 20
  top_n(20)

# copy from spark to R
df <- results %>%
  collect()

# **compute** materialises the result as a temporary Spark table and returns a new tibble
computed <- tibble %>%
  compute("new_table_name")

# group_by + summarise returns only the grouping columns and the summarised columns
tibble %>%
  group_by(x, y) %>%
  summarise(x_mean = mean(x))

# group_by + mutate keeps all columns in the dataset
tibble %>%
  group_by(x, y) %>%
  mutate(x_mean = mean(x))

# join two tables (tibbles); the named vector maps the key column of A to the key column of B
tibble_c <- left_join(tibble_a, tibble_b, by = c("tibble_a_column" = "tibble_b_column"))

# anti join returns the rows of A that have no match in B
tibble_c <- anti_join(tibble_a, tibble_b, by = "id")

# semi join returns only the rows of A that have a match in B
tibble_c <- semi_join(tibble_a, tibble_b, by = "id")

Executing SQL queries on Spark DataFrames

Run the query

(result <- dbGetQuery(spark_conn, "SELECT * FROM table LIMIT 10"))
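
For a self-contained run, a minimal sketch; the connection and the temporary table registered here (mtcars_spark) are assumptions for illustration only:

library(sparklyr)
library(DBI)

sc <- spark_connect(master = "local", spark_home = "/usr/lib/spark")

# copy_to registers the local data frame as a temporary Spark view that SQL can query
copy_to(sc, mtcars, "mtcars_spark", overwrite = TRUE)

# sparklyr implements the DBI interface, so dbGetQuery works on the connection itself
result <- dbGetQuery(sc, "SELECT cyl, COUNT(*) AS n FROM mtcars_spark GROUP BY cyl")
print(result)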

Create cluster

$ gcloud beta dataproc clusters create [CLUSTER-NAME] \
                              --enable-component-gateway \
                              --bucket bucket-name \
                              --region us-central1 \
                              --subnet default \
                              --zone us-central1-a \
                              --master-machine-type n1-standard-4 \
                              --master-boot-disk-size 500 \
                              --num-workers 2 \
                              --worker-machine-type n1-standard-4 \
                              --worker-boot-disk-size 500 \
                              --image-version 1.4-debian9 \
                              --scopes 'https://www.googleapis.com/auth/cloud-platform' \
                              --project bucket-name \
                              --max-idle 1h \
                              --initialization-actions 'gs://bucket-name/scripts/install_studio_server.sh'

Create an SSH tunnel to make it possible to reach the master node from your local machine

$ gcloud compute ssh --zone us-central1-a allanbatista@[CLUSTER-NAME]-m --ssh-key-file=~/.ssh/id_rsa -- -4 -N -L 8787:[CLUSTER-NAME]-m:8787

Access RStudio Server at

http://localhost:8787
user: guest
pass: guest

Example code 1

# install.packages('sparklyr') # execute only one time
library(sparklyr)

# create connection
sc <- spark_connect(master = "local", spark_home = '/usr/lib/spark')

# read from GCS
df <- spark_read_text(sc, "gs://bucket-name/dados/example.tsv")

# print sample
df %>%
    head(10)
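
As a possible follow-up (a sketch not in the original gist; the output path is hypothetical), results can be written back to GCS and the Spark connection closed:

# write the data back to GCS as CSV
df %>%
    spark_write_csv("gs://bucket-name/dados/example_out", mode = "overwrite")

# close the Spark connection when finished
spark_disconnect(sc)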

install_studio_server.sh (initialization action referenced above)

#!/bin/bash
# Update
apt-get update
# used for devtools
apt-get install libcurl4-openssl-dev \
libssl-dev \
libxml2-dev \
libclang-3.8-dev \
libclang-common-3.8-dev \
libclang-dev \
libclang1-3.8 \
libgc1c2 \
libllvm3.8 \
libobjc-6-dev \
libobjc4 \
r-base \
r-base-dev \
git -y
wget https://download2.rstudio.org/server/debian9/x86_64/rstudio-server-1.2.1335-amd64.deb
dpkg -i rstudio-server-1.2.1335-amd64.deb
# export envvars
echo "HADOOP_CONF_DIR=/etc/hadoop/conf" >> /etc/environment
echo "YARN_CONF_DIR=/etc/hadoop/conf" >> /etc/environment
echo "SPARK_HOME=/usr/lib/spark" >> /etc/environment
# create user
useradd -m -p $(python -c "import crypt; print(crypt.crypt('guest'))") -s /bin/bash guest
useradd -m -p $(python -c "import crypt; print(crypt.crypt('guest'))") -s /bin/bash guest2
# Install additional R packages
R -e "install.packages(c('tidyverse', 'DBI', 'sparklyr', 'caTools', 'bitops'), repos='https://cran.rstudio.com/')"