Gautam prodeezy
CREATE TABLE nested_array_data_parquet USING PARQUET AS
SELECT id as row_id,
NAMED_STRUCT( "first_level", ARRAY(NAMED_STRUCT("second_level",
ARRAY(NAMED_STRUCT("level3_id", id+1, "revenue", CAST(RAND(1)*100 AS DOUBLE)),
NAMED_STRUCT("level3_id", id+2, "revenue", CAST(RAND(2)*100 AS DOUBLE)),
NAMED_STRUCT("level3_id", id+3, "revenue", CAST(RAND(3)*100 AS DOUBLE)),
NAMED_STRUCT("level3_id", id+4, "revenue", CAST(RAND(4)*100 AS DOUBLE))),
"level2_id", id+1,
"level_2_other", "should_not_be_read"
),
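The listing truncates the statement above; a hypothetical follow-up query (not from the gist), assuming the outer struct column was aliased "nested", which selects only the deeply nested fields so the unused level_2_other field should not need to be read:
// Not from the gist: a possible read over the nested data. The alias "nested" is an
// assumption, since the column alias is cut off by the listing preview.
spark.sql("""
  SELECT row_id, l3.level3_id, l3.revenue
  FROM nested_array_data_parquet
  LATERAL VIEW explode(nested.first_level) lv1 AS l1
  LATERAL VIEW explode(l1.second_level) lv2 AS l3
""").show()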
@prodeezy
prodeezy / vectorization_test_without_maps
Last active November 22, 2019 02:07
Vectorization Test Without Maps
import org.apache.spark.sql.types._
import org.apache.iceberg.hadoop.HadoopTables
import org.apache.iceberg.Schema
import org.apache.iceberg.spark.SparkSchemaUtil
val schema = new StructType().add("age", IntegerType).add("name", StringType).add("location", new StructType().add("lat", DoubleType).add("lon", DoubleType))
val json = spark.read.schema(schema).json("people_no_maps.json")
json.printSchema
json.show
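The preview stops at json.show; a minimal sketch of how the test might continue, assuming a local HadoopTables path (iceberg-people-no-maps is a made-up location, not from the gist):
// Not part of the gist preview: convert the Spark schema to an Iceberg schema, create an
// unpartitioned Hadoop table, then append and read the rows through the Iceberg source.
val icebergSchema = SparkSchemaUtil.convert(schema)
val tables = new HadoopTables(spark.sparkContext.hadoopConfiguration)
tables.create(icebergSchema, org.apache.iceberg.PartitionSpec.unpartitioned(), "iceberg-people-no-maps")
json.write.format("iceberg").mode("append").save("iceberg-people-no-maps")
spark.read.format("iceberg").load("iceberg-people-no-maps").show()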
@prodeezy
prodeezy / timestamp_day_transform_issue.txt
Last active September 30, 2019 09:20
Timestamp Day Transform Partition Read Example
import org.apache.iceberg.types.Types.NestedField._
import org.apache.iceberg.types._
import org.apache.iceberg._
val timestampSchema = new Schema(optional(1, "timestamp", Types.TimestampType.withoutZone()))
val partitionByDate = PartitionSpec.builderFor(timestampSchema).day("timestamp", "date").build
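A sketch of how the day-partitioned table might then be created and scanned; the table location and the literal timestamp are assumptions, not from the gist:
// Not part of the gist preview: create the table with the day() spec above, then plan a
// scan with a timestamp predicate; files outside the matching day partition should be skipped.
import scala.collection.JavaConverters._
import org.apache.iceberg.expressions.Expressions
import org.apache.iceberg.hadoop.HadoopTables
val tables = new HadoopTables(spark.sparkContext.hadoopConfiguration)
val table = tables.create(timestampSchema, partitionByDate, "iceberg-timestamp-days")
// ... append data, then:
val tasks = table.newScan().filter(Expressions.greaterThanOrEqual("timestamp", "2019-09-30T00:00:00")).planFiles().asScala
tasks.foreach(t => println(t.file.path))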
@prodeezy
prodeezy / iceberg_schema_evolution_ingest.txt
Last active September 26, 2019 08:21
Iceberg Schema Evolution Scenarios
// *** Sample Data
bash-3.2$ cat people_flat.json
{"name":"Michael", "grade": 4.0}
{"name":"Andy", "age":30, "grade": 3.5}
{"name":"Justin", "age":19}
bash-3.2$ cat people_reordered.json
{"age":65, "name":"Biswa", "grade": 4.0}
@prodeezy
prodeezy / Test-Struct_Filtering_on_ Iceberg.txt
Last active August 30, 2019 11:33
Test Struct-based Filter on Iceberg
bash-3.2$ cat people.json
{"name":"Michael"}
{"name":"Andy", "age":30, "friends": {"Josh": 10, "Biswa": 25}, "location": { "lat": 101.123, "lon": 50.324 } }
{"name":"Justin", "age":19, "friends": {"Kannan": 75, "Sanjay": 100}, "location": { "lat": 175.926, "lon": 20.524 } }
spark-shell
import org.apache.spark.sql.types._
import org.apache.iceberg.hadoop.HadoopTables
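The preview ends mid-setup; a hypothetical struct filter for this scenario, assuming the JSON above was already written to an Iceberg table at test/iceberg-people:
// Not part of the gist preview: filter on a nested struct field through the Iceberg source.
val people = spark.read.format("iceberg").load("test/iceberg-people")
people.filter("location.lat > 150.0").select("name", "location.lat", "location.lon").show()
// expected: only Justin's row, whose location.lat is 175.926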
@prodeezy
prodeezy / Test_Iceberg_File_Scan.txt
Last active May 11, 2019 00:02
Iceberg not skipping files using useful Metrics
# Test JSON files
bash-3.2$ cat people.json
{"name":"Michael"}
{"name":"Andy", "age":30, "friends": {"Josh": 10, "Biswa": 25} }
{"name":"Justin", "age":19, "friends": {"Kannan": 75, "Sanjay": 100} }
bash-3.2$ cat people2.json
{"name":"Biswa", "age":75, "friends": {"Kannan": 90, "Josh": 10} }
{"name":"Kannan", "age":90, "friends": {"Michael": 10, "Justin": 19} }
@prodeezy
prodeezy / gist:072dfbc69774652640e36b9ad5f17c68
Last active April 22, 2019 18:04
Test for Complex Predicate over Iceberg not returning rows
root@61b7c92f78a4:/usr/local/spark/test# cat people.json
{"name":"Michael"}
{"name":"Andy", "age":30, "friends": {"Josh": 10, "Biswa": 25} }
{"name":"Justin", "age":19, "friends": {"Kannan": 75, "Sanjay": 100} }
$SPARK_HOME/bin/spark-shell --jars ~/iceberg-runtime-ce457ce.jar
import org.apache.spark.sql.types._
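A hypothetical example of the kind of compound predicate the title refers to (table path assumed, not from the gist):
// Not part of the gist preview: a conjunction over a top-level and a nested column,
// assuming Spark inferred "friends" as a struct when the JSON was ingested.
val people = spark.read.format("iceberg").load("test/iceberg-people")
people.filter("age > 20 AND friends.Josh = 10").show()
// expected: Andy's row, since age 30 > 20 and friends.Josh = 10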
@prodeezy
prodeezy / Test_Fixed_Column_Stats_Iceberg.txt
Last active April 10, 2019 08:47
Test Fix for File pruning on Simple Predicates in Iceberg
# Test JSON files
bash-3.2$ cat people.json
{"name":"Michael"}
{"name":"Andy", "age":30, "friends": {"Josh": 10, "Biswa": 25} }
{"name":"Justin", "age":19, "friends": {"Kannan": 75, "Sanjay": 100} }
bash-3.2$ cat people2.json
{"name":"Biswa", "age":75, "friends": {"Kannan": 90, "Josh": 10} }
{"name":"Kannan", "age":90, "friends": {"Michael": 10, "Justin": 19} }
@prodeezy
prodeezy / Iceberg_table_Relative_path_loading
Last active April 2, 2019 05:13
Test for loading Iceberg table using relative path
bash-3.2$ ls -d test/iceberg-people
test/iceberg-people
spark-shell --jars runtime/build/libs/iceberg-runtime.jar
import org.apache.spark.sql.types._
import org.apache.iceberg.hadoop.HadoopTables
import org.apache.iceberg.Schema
import org.apache.iceberg.spark.SparkSchemaUtil
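The preview stops after the imports; a minimal sketch of the relative-path load itself, reusing the test/iceberg-people path listed above:
// Not part of the gist preview: load the table through HadoopTables and through the
// Spark source using the same relative path.
val table = new HadoopTables(spark.sparkContext.hadoopConfiguration).load("test/iceberg-people")
println(table.schema)
spark.read.format("iceberg").load("test/iceberg-people").show()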
package org.apache.hadoop.hbase.util;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configuration;
import java.io.IOException;
import java.util.ArrayList;