Skip to content

Instantly share code, notes, and snippets.

View nfarah86's full-sized avatar

nadine farah nfarah86

View GitHub Profile
import org.apache.hudi.client.validator.SparkPreCommitValidator
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Row
class CustomPreCommitValidator extends SparkPreCommitValidator {
override def validateRecordsBeforeAndAfter(before: Dataset[Row],
after: Dataset[Row], partitionsAffected: Set[String]): Unit = {
// Custom validation logic
// Perform data quality checks, apply business rules, etc.
CREATE TABLE demo (
id int,
name string,
email string,
phoneNumber string,
ts timestamp
)
USING hudi
OPTIONS (
primaryKey = "id",
-- INSERT statements
-- Each row supplies three positional values. The first statement inserts a
-- single row; the second inserts two rows at once, the first of which has
-- NULL as its second value.
INSERT INTO demo VALUES (1, 'TestName1', CURRENT_TIMESTAMP);
INSERT INTO demo VALUES (2, NULL, CURRENT_TIMESTAMP), (3, 'TestName1', CURRENT_TIMESTAMP);
-- CREATE TABLE statement
CREATE TABLE demo (
id int,
name string,
ts timestamp
)
USING hudi
OPTIONS (
primaryKey = "id",
preCombineField = "ts",
// Write in Hudi format with two pre-commit validators registered.
// "hoodie.precommit.validators" takes a single comma-separated list of fully
// qualified class names. The list is built from two adjacent literals joined
// with `+` because a plain "..." literal cannot span lines, and a line break
// inside the value would corrupt the second class name.
spark.write.format("hudi").option("hoodie.precommit.validators",
  "org.apache.hudi.client.validator.ValidatorClass1," +
  "org.apache.hudi.client.validator.ValidatorClass2").save("path/to/data")
// Per (marketplace, product_id) in the 'Books' category: review count and
// average star rating, keeping only groups whose average rating is >= 4,
// ordered by count and then average (both descending); shows the first 20 rows.
// <TABLE> is a placeholder to be replaced with the real table name.
// The SQL is assembled from adjacent literals joined with `+` because a plain
// "..." literal cannot span lines; each piece ends with a space so the tokens
// stay separated.
spark.sql("select marketplace, product_id, count(*), " +
  "avg(star_rating) from <TABLE> where product_category = 'Books' " +
  "group by 1,2 having avg(star_rating) >= 4 order by 3 desc,4 desc").show(20)
-- All columns from tableA for customer 'xyz' whose dt lies between
-- '2022-12-01' and '2023-01-31' (BETWEEN includes both endpoints).
select * from tableA where customer='xyz' and dt between '2022-12-01' AND '2023-01-31'
// Per (marketplace, product_id) in the 'Books' category: review count and
// average star rating, keeping only groups whose average rating is >= 4,
// ordered by count and then average (both descending); shows the first 20 rows.
// <TABLE> is a placeholder to be replaced with the real table name.
// The SQL is assembled from adjacent literals joined with `+` because a plain
// "..." literal cannot span lines; each piece ends with a space so the tokens
// stay separated.
spark.sql("select marketplace, product_id, count(*), " +
  "avg(star_rating) from <TABLE> where product_category = 'Books' " +
  "group by 1,2 having avg(star_rating) >= 4 order by 3 desc,4 desc").show(20)
set sql-client.execution.result-mode=tableau;
-- create the datagen table
CREATE TABLE sourceT (
uuid varchar(20),
name varchar(10),
age int,
ts timestamp(3)
) WITH (
'connector' = 'datagen',