Skip to content

Instantly share code, notes, and snippets.

@ad1happy2go
Created May 13, 2024 06:44
Show Gist options
  • Save ad1happy2go/66fdbba4266d531f056c7da914fbb6dc to your computer and use it in GitHub Desktop.
Save ad1happy2go/66fdbba4266d531f056c7da914fbb6dc to your computer and use it in GitHub Desktop.
# Target Hudi table name and the filesystem location it is written to.
tableName = "record_index_deepdive"
basePath = f"/tmp/{tableName}"

# Column names, in the same order as the fields of each seed tuple below.
columns = ["ts", "uuid", "rider", "driver", "fare", "city"]

# Five seed rows spread over three distinct `city` values.
data = [
    (1695159649087, "334e26e9-8355-45cc-97c6-c31daf0df3301", "rider-A", "driver-K", 19.10, "san_francisco"),
    (1695091554788, "e96c4396-3fad-413a-a942-4cb36106d7212", "rider-C", "driver-M", 27.70, "san_francisco"),
    (1695046462179, "9909a8b1-2d15-4d3d-8ec9-efc48c536a003", "rider-D", "driver-L", 33.90, "san_francisco"),
    (1695516137016, "e3cf430c-889d-4015-bc98-59bdce1e530c4", "rider-F", "driver-P", 34.15, "sao_paulo"),
    (1695115999911, "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa5", "rider-J", "driver-T", 17.85, "chennai"),
]

# NOTE(review): `spark` is not defined in this script; presumably it is the
# session object provided by the pyspark shell / notebook -- confirm.
inserts = spark.createDataFrame(data).toDF(*columns)
# Writer options passed to every Hudi write in this script.
# The two record-index tuning entries previously sat *after* the closing
# brace, which is a syntax error and meant they were never part of the
# options; they now live inside the dict.
hudi_options = {
    'hoodie.table.name': tableName,
    'hoodie.datasource.write.partitionpath.field': 'city',
    'hoodie.datasource.write.recordkey.field': 'uuid',
    'hoodie.datasource.write.precombine.field': 'ts',
    'hoodie.metadata.record.index.enable': 'true',
    # NOTE(review): presumably lower/upper sizing knobs for the record-index
    # file groups -- confirm units and semantics against the Hudi config docs.
    'hoodie.metadata.record.index.min.filegroup.count': '3',
    'hoodie.metadata.record.index.max.filegroup.size': '10',
}
# Write the seed rows to the Hudi table at `basePath` in append mode.
(
    inserts.write.format("hudi")
    .options(**hudi_options)
    .mode("append")
    .save(basePath)
)
# Perform nine further single-row appends (i = 1..9), one write per
# iteration. Each row gets a distinct uuid by suffixing the loop counter.
# The loop body was previously unindented (IndentationError); it is now
# nested under the `for` so every iteration builds and writes its own frame.
for i in range(1, 10):
    data = [(1695159649087, "334e26e9-8355-45cc-97c6-c31daf0d" + str(i), "rider-A", "driver-K", 19.10, "san_francisco")]
    inserts = spark.createDataFrame(data).toDF(*columns)
    inserts.write.format("hudi"). \
        options(**hudi_options). \
        mode("append"). \
        save(basePath)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment