Skip to content

Instantly share code, notes, and snippets.

@prodeezy
Last active May 11, 2019 00:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save prodeezy/fe1b447c78c0bc9dc3be66272341d1a7 to your computer and use it in GitHub Desktop.
Save prodeezy/fe1b447c78c0bc9dc3be66272341d1a7 to your computer and use it in GitHub Desktop.
Iceberg is not using column metrics to skip data files
# Test JSON files
bash-3.2$ cat people.json
{"name":"Michael"}
{"name":"Andy", "age":30, "friends": {"Josh": 10, "Biswa": 25} }
{"name":"Justin", "age":19, "friends": {"Kannan": 75, "Sanjay": 100} }
bash-3.2$ cat people2.json
{"name":"Biswa", "age":75, "friends": {"Kannan": 90, "Josh": 10} }
{"name":"Kannan", "age":90, "friends": {"Michael": 10, "Justin": 19} }
spark-shell
import org.apache.spark.sql.types._ ;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil
import org.apache.iceberg.TableScan
import scala.collection.JavaConverters._
import com.netflix.iceberg.expressions._
// Spark-side schema: three nullable columns; "friends" maps a friend's name to an integer.
val schema = StructType(Seq(
  StructField("age", IntegerType),
  StructField("name", StringType),
  StructField("friends", MapType(StringType, IntegerType))
))
// Equivalent Iceberg schema derived from the Spark schema.
val iceSchema = SparkSchemaUtil.convert(schema)

// Create the Iceberg table at a path-based location.
val tableLocation = "iceberg-people"
val tables = new HadoopTables()
val iceTable = tables.create(iceSchema, tableLocation)
// To reuse a table created by an earlier run, load it instead:
// val iceTable = tables.load("iceberg-people")

// Read the two JSON inputs using the explicit schema.
val json = spark.read.schema(schema).json("people.json")
val json2 = spark.read.schema(schema).json("people2.json")

// Append each DataFrame to the same Iceberg table, one write per input file.
Seq(json, json2).foreach { df =>
  df.write.format("iceberg").mode("append").save(tableLocation)
}
// Literal equals test: count the data files planned for the predicate age == 30.
// NOTE(review): `Expressions` resolves through the com.netflix.iceberg import while the
// table API comes from org.apache.iceberg — confirm both refer to the same Iceberg build.
val ageExp = Expressions.equal("age", 30)
val ageScan = iceTable.newScan().filter(ageExp)
// planFiles returns a closeable iterable; close it once the tasks have been counted.
val ageTasks = ageScan.planFiles
val fileCount = try ageTasks.asScala.size finally ageTasks.close()
// SHOULD SCAN ONE FILE ONLY: only people.json contains a row with age = 30.
scala> fileCount
res28: Int = 2
// Column isNull test: count the data files planned for the predicate age IS NULL.
val isNullExp = Expressions.isNull("age")
val isNullScan = iceTable.newScan().filter(isNullExp)
// planFiles returns a closeable iterable; close it once the tasks have been counted.
val isNullTasks = isNullScan.planFiles
val fileCount = try isNullTasks.asScala.size finally isNullTasks.close()
// SHOULD SCAN ONE FILE ONLY: only people.json has a row without an age (Michael).
fileCount
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment