This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ============================================================
# Load Data: copy local R data frames into Spark as Spark
# DataFrames via sparklyr::copy_to(). Requires an active
# sparklyr connection `sc` created elsewhere in this project.
# ============================================================
writeLines("=========================== Create Spark Data frame ======================")

library(nycflights13)  # flights dataset
library(Lahman)        # baseball (Batting) dataset

# copy_to() ships a local data frame to the cluster and returns a
# tbl_spark reference; overwrite = TRUE replaces any existing table
# of the same name so the script is re-runnable.
flights_tbl_spark <- copy_to(sc, nycflights13::flights, "flights", overwrite = TRUE)
iris_tbl_spark    <- copy_to(sc, iris, "iris", overwrite = TRUE)
batting_tbl_spark <- copy_to(sc, Lahman::Batting, "batting", overwrite = TRUE)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Libraries needed for the Spark session and downstream wrangling.
library(stringr)   # string helpers
library(arrow)     # Arrow serialization (faster R <-> Spark data transfer)
library(sparklyr)  # Spark connector
library(dplyr)     # data-manipulation verbs

################################
# Create the Spark Session
################################
# Point sparklyr at the local Spark installation directory.
Sys.setenv(SPARK_HOME = "/opt/spark")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Base image: Spark 3.0.2 / Hadoop 3.2 / Java 8 / Scala 2.12 / Python 3.8
FROM gcr.io/datamechanics/spark:3.0.2-hadoop-3.2.0-java-8-scala-2.12-python-3.8-dm13

# Build-time configuration for the R toolchain.
ARG R_VERSION="4.0.4"
ARG LIBARROW_BINARY="true"
# Pin CRAN packages to a fixed RSPM snapshot for reproducible builds.
ARG RSPM_CHECKPOINT=2696074
ARG CRAN="https://packagemanager.rstudio.com/all/__linux__/focal/${RSPM_CHECKPOINT}"

USER root
# Register the CRAN Debian repository (this RUN continues beyond this chunk).
RUN echo "deb http://cloud.r-project.org/bin/linux/debian buster-cran40/" >> /etc/apt/sources.list \
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "spark.kubernetes.driver.reusePersistentVolumeClaim": "true",
  "spark.kubernetes.driver.ownPersistentVolumeClaim": "true",
  "spark.kubernetes.executor.volumes.persistentVolumeClaim.data.options.claimName": "OnDemand",
  "spark.kubernetes.executor.volumes.persistentVolumeClaim.data.options.storageClass": "standard",
  "spark.kubernetes.executor.volumes.persistentVolumeClaim.data.options.sizeLimit": "500Gi",
  "spark.kubernetes.executor.volumes.persistentVolumeClaim.data.mount.path": "/var/data",
  "spark.kubernetes.executor.volumes.persistentVolumeClaim.data.mount.readOnly": "false"
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
image_name := 'pyspark-example:dev'
aws_access_key_id := ''
aws_secret_access_key := ''
postgres_user := ''
postgres_password := ''
postgres_host := ''
postgres_port := ''
postgres_db_name := ''
table_name := ''
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, datetime
from databricks import koalas as ks
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import explode, split, col, sum, lit
from pyspark.sql import SparkSession

def apply_transforms(df: DataFrame) -> DataFrame:
    # split _c0 column as it is a string and we want the population data from it
    split_col = split(df['_c0'], '\t')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Base image: Data Mechanics Spark platform image (Spark 3.1).
FROM gcr.io/datamechanics/spark:platform-3.1-dm14

ENV PYSPARK_MAJOR_PYTHON_VERSION=3

WORKDIR /opt/application/

# Fetch the PostgreSQL JDBC driver and put it on Spark's classpath
# so Spark can read/write Postgres tables via JDBC.
RUN wget https://jdbc.postgresql.org/download/postgresql-42.2.5.jar
RUN mv postgresql-42.2.5.jar /opt/spark/jars

# Install the application's Python dependencies.
COPY requirements.txt .
RUN pip3 install -r requirements.txt