Explain the sample and sampleBy functions in PySpark in Databricks
# Databricks notebook source
# Importing packages
from pyspark.sql import SparkSession

# COMMAND ----------
# Implementing the sample() and sampleBy() functions in PySpark in Databricks
spark = SparkSession.builder \
    .master("local[1]") \
    .appName("sample() and sampleBy() PySpark") \
    .getOrCreate()

dataframe = spark.range(100)
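
# sample() takes (withReplacement, fraction, seed); the fraction is only
# approximate, so the exact number of rows returned can vary between runs.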
# Using sample() with only a fraction
print(dataframe.sample(0.06).collect())

# Using sample() with a fraction and a seed - the same seed returns the same rows,
# while a different seed returns a different set
print(dataframe.sample(0.1, 123).collect())
print(dataframe.sample(0.1, 123).collect())
print(dataframe.sample(0.1, 456).collect())

# Using the withReplacement flag
## With replacement - the sample may contain duplicate rows
print(dataframe.sample(True, 0.3, 123).collect())
## Without replacement (the default) - every sampled row appears at most once
print(dataframe.sample(0.3, 123).collect())

# Using sampleBy() for stratified sampling - each value of the "key" column gets
# its own fraction: 10% of the rows where key == 0, 20% where key == 1, and none
# where key == 2, since keys missing from the fractions dict are treated as 0.0
dataframe2 = dataframe.select((dataframe.id % 3).alias("key"))
print(dataframe2.sampleBy("key", {0: 0.1, 1: 0.2}, 0).collect())
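
# A minimal sketch (assuming the dataframe2 defined above) of one way to sanity-check
# the stratified sample: count the sampled rows per key and compare them against the
# per-stratum fractions passed to sampleBy().
stratified = dataframe2.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0)
stratified.groupBy("key").count().show()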