# A tbl (tibble) is an abstraction of a remote table in sparklyr
# (the verbs below — glimpse, compute, %>% pipelines — are dplyr/sparklyr
# idiom; the original note said "SparkR", which has a different API)
# Print a few rows (n = 5) and all columns (width = Inf)
print(tibble, n = 5, width = Inf)
# **str** inspects only the local tibble object, **not the remote data**
str(tibble)
# To inspect the remote data the way **str** does locally, use **glimpse**
glimpse(tibble)
# Select, filter, and sort data in one pipeline.
# BUG FIX: the original chain ended at filter() without a %>%, so the
# arrange() call below it was detached from the pipeline and would fail
# (arrange needs a data argument).
tibble %>%
  # Choose columns by name
  select(artist_name, release, title, year) %>%
  # Keep only rows from the 1960s
  filter(year >= 1960, year < 1970) %>%
  # Sort rows: artist ascending, year descending, title ascending
  arrange(artist_name, desc(year), title)
# Transform existing columns or create new ones with mutate():
# assigning to an existing name (x) overwrites that column in place,
# assigning to a new name (y2) adds a column.
tibble %>%
  select(x, y) %>%
  mutate(x = x * 1.5, y2 = y * y)
# Summarise data: this will return a new **tibble** holding only the
# summary column(s).
# (Spelled `summarise` for consistency with the group_by example later
# in this file; dplyr treats summarize/summarise as identical aliases.)
tibble %>%
  select(x) %>%
  summarise(x_mean = mean(x))
# Select with the starts_with() and ends_with() tidyselect helpers:
# first keep columns whose names start with "name", then, from those,
# keep the ones whose names end with "code" (nested form of the same
# two-step select pipeline).
select(select(tibble, starts_with("name")), ends_with("code"))
# Select columns whose names contain a given substring with contains()
tibble %>%
select(contains("ti"))
# Select columns whose names match a regular expression:
# "ti.?t" = literal "ti", then at most one arbitrary character, then "t"
tibble %>%
select(matches("ti.?t"))
# Keep only distinct (unique) rows, judged by the listed columns.
# BUG FIX: `distinct(x[, y, z])` was manual-page pseudo-notation — the
# square brackets meant "y and z are optional" — and is not valid R.
# Pass the columns directly:
tibble %>%
  distinct(x, y, z)
# Count distinct combinations of the listed columns, most frequent first.
# BUG FIX: `count(x, [y], sort=T)` was pseudo-notation ([y] meant "y is
# optional") and is not valid R; also use TRUE, never T (T is an
# ordinary variable that can be reassigned).
tibble %>%
  count(x, y, sort = TRUE) %>%
  # Restrict to the top 20 (ranks on `n`, the count column added above)
  top_n(20)
# Copy data from Spark into a local R data frame with collect()
# (pulls the full result set into local memory — mind the table size)
df <- results %>%
collect()
# compute() materialises the query as a temporary table on Spark under
# the given name and returns a new tibble that points at it
computed <- tibble %>%
compute("new_table_name")
# group_by + summarise returns only the grouping columns (x, y) plus the
# columns defined inside summarise (x_mean) — one row per group
tibble %>%
group_by(x, y) %>%
summarise(x_mean = mean(x))
# group_by + mutate keeps ALL columns in the dataset and adds the
# per-group mean as a new column (one value repeated within each group).
# BUG FIX: the verb was misspelled `murate`, which would fail with
# "could not find function".
tibble %>%
  group_by(x, y) %>%
  mutate(x_mean = mean(x))
# Join two tables (tibbles). When the key column is named differently in
# the two tables, `by` must be a *named* vector: left name = right name.
# BUG FIX: the original unnamed c("tibble_a_column", "tibble_b_column")
# would try to join on TWO separate keys, each expected to exist in BOTH
# tables. Also fixed the `tible_c` typo in the result name.
tibble_c <- left_join(tibble_a, tibble_b,
                      by = c("tibble_a_column" = "tibble_b_column"))
# anti_join is a filtering join: it keeps the rows of A that have NO
# match in B (no columns from B are added)
tibble_c <- anti_join(tibble_A, tibble_B, by = "id")
# semi_join is the complementary filtering join: it keeps only the rows
# of A that DO have a match in B (again, no columns from B are added)
tibble_c <- semi_join(tibble_A, tibble_B, by = "id")
# Run raw SQL against Spark via DBI; the surrounding parentheses make
# the assignment print its value immediately.
# FIX: the result was previously assigned to `data.frame`, shadowing the
# base R constructor of the same name — renamed to avoid that footgun.
(ten_rows <- dbGetQuery(spark_conn, "SELECT * FROM table limit 10"))