Deploy RStudio with Dataproc

A tibble here is an abstraction of a remote Spark table in sparklyr.
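
A remote tibble can be obtained by referencing an existing Spark table or by copying a local data frame into Spark. A minimal sketch, assuming a sparklyr connection **sc**; the table name "songs" is hypothetical:

library(sparklyr)
library(dplyr)

# reference an existing Spark table as a lazy remote tibble (no data is pulled into R)
tibble <- tbl(sc, "songs")

# or copy a local data frame into Spark and get a remote tibble back
tibble <- copy_to(sc, mtcars, "mtcars_spark", overwrite = TRUE)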

# print a few rows
print(tibble, n=5, width=Inf)

# **str** only inspects the tibble object itself, **not the remote data**
str(tibble)

# to inspect the remote data the way **str** would, use **glimpse**
glimpse(tibble)

# Select and filter data
tibble %>%
  # select columns by name
  select(artist_name, release, title, year) %>%
  # Filter rows
  filter(year >= 1960, year < 1970) %>%
  # Sort rows
  arrange(artist_name, desc(year), title)

# change columns or create new one
tibble %>%
   select(x, y) %>%
   # this mutate replaces x with x * 1.5 and creates a new column y2 equal to y squared
   mutate(
     x = x * 1.5,
     y2 = y * y
   )

# summary data
tibble %>%
  select(x) %>%
  # this will return a new **tibble** with the summarised values
  summarize(x_mean = mean(x))

# select with starts_with and ends_with
tibble %>%
  select(starts_with("name")) %>%
  select(ends_with("code"))

# select columns whose names contain a substring
tibble %>%
  select(contains("ti"))

# select columns whose names match a regex
tibble %>%
  select(matches("ti.?t"))

# distinct rows by one or more columns
tibble %>%
  distinct(x, y, z)

# count rows by one or more columns
tibble %>%
  count(x, y, sort = TRUE) %>%
  # Restrict to top 20
  top_n(20)

# copy from spark to R
df <- results %>%
  collect()

# **compute** materialises the result as a temporary Spark table and returns a new tibble
computed <- tibble %>%
  compute("new_table_name")

# group_by + summarise returns only the grouping columns and the summarised columns
tibble %>%
  group_by(x, y) %>%
  summarise(x_mean = mean(x))

# group_by + mutate keeps all columns in the dataset
tibble %>%
  group_by(x, y) %>%
  mutate(x_mean = mean(x))

# join two tables (tibbles); the named vector maps the key column of A to the key column of B
tibble_c <- left_join(tibble_a, tibble_b, by = c("tibble_a_column" = "tibble_b_column"))

# anti join returns the rows of A that have no match in B
tibble_c <- anti_join(tibble_a, tibble_b, by = "id")

# semi join returns only the rows of A that have a match in B
tibble_c <- semi_join(tibble_a, tibble_b, by = "id")

Executing SQL queries on Spark DataFrames

Run the query

(result <- dbGetQuery(spark_conn, "SELECT * FROM table LIMIT 10"))
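
For a self-contained run, a minimal sketch; the connection and the temporary table registered here (mtcars_spark) are assumptions for illustration only:

library(sparklyr)
library(DBI)

sc <- spark_connect(master = "local", spark_home = "/usr/lib/spark")

# copy_to registers the local data frame as a temporary Spark view that SQL can query
copy_to(sc, mtcars, "mtcars_spark", overwrite = TRUE)

# sparklyr implements the DBI interface, so dbGetQuery works on the connection itself
result <- dbGetQuery(sc, "SELECT cyl, COUNT(*) AS n FROM mtcars_spark GROUP BY cyl")
print(result)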

Create cluster

$ gcloud beta dataproc clusters create [CLUSTER-NAME] \
                              --enable-component-gateway \
                              --bucket bucket-name \
                              --region us-central1 \
                              --subnet default \
                              --zone us-central1-a \
                              --master-machine-type n1-standard-4 \
                              --master-boot-disk-size 500 \
                              --num-workers 2 \
                              --worker-machine-type n1-standard-4 \
                              --worker-boot-disk-size 500 \
                              --image-version 1.4-debian9 \
                              --scopes 'https://www.googleapis.com/auth/cloud-platform' \
                              --project bucket-name \
                              --max-idle 1h \
                              --initialization-actions 'gs://bucket-name/scripts/install_studio_server.sh'

Create an SSH tunnel to make it possible to reach the master node from your local machine

$ gcloud compute ssh --zone us-central1-a allanbatista@[CLUSTER-NAME]-m --ssh-key-file=~/.ssh/id_rsa -- -4 -N -L 8787:[CLUSTER-NAME]-m:8787

Access RStudio Server at

http://localhost:8787
user: guest
pass: guest

Example code 1

# install.packages('sparklyr') # execute only one time
library(sparklyr)

# create connection
sc <- spark_connect(master = "local", spark_home = '/usr/lib/spark')

# read from GCS
df <- spark_read_text(sc, "gs://bucket-name/dados/example.tsv")

# print sample
df %>%
    head(10)
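
As a possible follow-up (a sketch not in the original gist; the output path is hypothetical), results can be written back to GCS and the Spark connection closed:

# write the data back to GCS as CSV
df %>%
    spark_write_csv("gs://bucket-name/dados/example_out", mode = "overwrite")

# close the Spark connection when finished
spark_disconnect(sc)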

install_studio_server.sh (initialization action referenced above)

#!/bin/bash
# Update
apt-get update
# used for devtools
apt-get install libcurl4-openssl-dev \
libssl-dev \
libxml2-dev \
libclang-3.8-dev \
libclang-common-3.8-dev \
libclang-dev \
libclang1-3.8 \
libgc1c2 \
libllvm3.8 \
libobjc-6-dev \
libobjc4 \
r-base \
r-base-dev \
git -y
wget https://download2.rstudio.org/server/debian9/x86_64/rstudio-server-1.2.1335-amd64.deb
dpkg -i rstudio-server-1.2.1335-amd64.deb
# export envvars
echo "HADOOP_CONF_DIR=/etc/hadoop/conf" >> /etc/environment
echo "YARN_CONF_DIR=/etc/hadoop/conf" >> /etc/environment
echo "SPARK_HOME=/usr/lib/spark" >> /etc/environment
# create user
useradd -m -p $(python -c "import crypt; print(crypt.crypt('guest'))") -s /bin/bash guest
useradd -m -p $(python -c "import crypt; print(crypt.crypt('guest'))") -s /bin/bash guest2
# Install additional R packages
R -e "install.packages(c('tidyverse', 'DBI', 'sparklyr', 'caTools', 'bitops'), repos='https://cran.rstudio.com/')"