Skip to content

Instantly share code, notes, and snippets.

@HeartSaVioR
Last active June 5, 2018 09:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save HeartSaVioR/556ea7db6740fa2fce7dae72a75d9618 to your computer and use it in GitHub Desktop.
Save HeartSaVioR/556ea7db6740fa2fce7dae72a75d9618 to your computer and use it in GitHub Desktop.
Calculating size for various kinds of UnsafeRow
package net.heartsavior.spark.trial
import java.nio.charset.StandardCharsets
import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.types.StructType
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.SizeEstimator
/**
 * Small experiment that builds `UnsafeRow`s for a range of schemas (empty,
 * fixed-width primitives, one to four large strings, and a nested struct) and
 * prints several size measurements for each of them.
 */
object SparkTrialCalculateUnsafeRowSize {

  /**
   * Prints size information for a single row.
   *
   * Four figures are printed after a header line containing the schema:
   * the length of `row.getBytes`, `row.getSizeInBytes`, and the
   * `SizeEstimator` estimate for both the row itself and `row.copy()`.
   *
   * @param structType schema of the row; only used for the header line
   * @param row        the row to measure
   */
  def printSizeInfoForUnsafeRow(structType: StructType, row: UnsafeRow): Unit = {
    println(s"====== ${structType} ======")
    println(s"Row underlying byte array size: ${row.getBytes.length}")
    println(s"Row sizeInBytes: ${row.getSizeInBytes}")
    println(s"Row estimated size: ${SizeEstimator.estimate(row)}")
    println(s"Row estimated size (copied): ${SizeEstimator.estimate(row.copy())}")
  }

  /** Converts `source` into an `UnsafeRow` using a projection created for `schema`. */
  private def projectToUnsafeRow(schema: StructType, source: SpecificInternalRow): UnsafeRow = {
    val toUnsafeRow = UnsafeProjection.create(schema)
    toUnsafeRow(source)
  }

  /**
   * Projects a freshly constructed (unpopulated) internal row for `schema`,
   * lets `fill` write values directly into the resulting `UnsafeRow`, and
   * prints the size information.
   *
   * The values are deliberately written after the projection, on the
   * `UnsafeRow` itself, to keep the exact sequence of the experiment.
   */
  private def reportFixedWidthRow(schema: StructType)(fill: UnsafeRow => Unit): Unit = {
    val row = projectToUnsafeRow(schema, new SpecificInternalRow(schema))
    fill(row)
    printSizeInfoForUnsafeRow(schema, row)
  }

  /**
   * Builds a schema with one string column per entry of `fieldNames`, stores a
   * separate clone of `payload` in every column, projects the row and prints
   * its size information.
   */
  private def reportStringRow(fieldNames: Seq[String], payload: UTF8String): Unit = {
    val schema = fieldNames.foldLeft(new StructType()) { (acc, name) => acc.add(name, "string") }
    val source = new SpecificInternalRow(schema)
    fieldNames.indices.foreach(i => source.update(i, payload.clone()))
    printSizeInfoForUnsafeRow(schema, projectToUnsafeRow(schema, source))
  }

  /**
   * Builds a row holding a nested struct (int + string), a string and a
   * double, and prints its size information.
   */
  private def reportNestedRow(payload: UTF8String): Unit = {
    val nestedFieldSchema = new StructType()
      .add("nestedIntField", "int")
      .add("nestedStringField", "string")
    val nestedStructSchema = new StructType()
      .add("nested", nestedFieldSchema)
      .add("stringField", "string")
      .add("doubleField", "double")

    // The inner struct is projected to an UnsafeRow first and then stored as
    // the value of column 0 of the outer row.
    val innerSource = new SpecificInternalRow(nestedFieldSchema)
    innerSource.setInt(0, 100)
    innerSource.update(1, payload.clone())
    val innerRow = projectToUnsafeRow(nestedFieldSchema, innerSource)

    val outerSource = new SpecificInternalRow(nestedStructSchema)
    outerSource.update(0, innerRow)
    outerSource.update(1, payload.clone())
    outerSource.setDouble(2, 123.456)

    printSizeInfoForUnsafeRow(nestedStructSchema, projectToUnsafeRow(nestedStructSchema, outerSource))
  }

  /**
   * Runs the experiment: prints information about the dummy string payload,
   * then the size information for every schema in turn.
   *
   * @param args ignored
   */
  def main(args: Array[String]): Unit = {
    // 10 MB of characters. String repetition avoids materialising ten million
    // single-character strings in intermediate collections.
    val dummyStr = "c" * (10 * 1024 * 1024)
    val dummyUtf8Str = UTF8String.fromString(dummyStr)

    println("====== Dummy Data Information ======")
    println(s"INFO: estimated String size: ${SizeEstimator.estimate(dummyStr)}")
    println(s"INFO: String size (length): ${dummyStr.length}")
    println(s"INFO: underlying byte array on String size (length): ${dummyStr.getBytes(StandardCharsets.UTF_8).length}")
    println(s"INFO: estimated UTF8 String size: ${SizeEstimator.estimate(dummyUtf8Str)}")
    println(s"INFO: UTF8 String (byte) size: ${dummyUtf8Str.numBytes()}")
    println(s"INFO: UTF8 String (char) size: ${dummyUtf8Str.numChars()}")

    // Empty schema and fixed-width primitive columns.
    reportFixedWidthRow(new StructType())(_ => ())
    reportFixedWidthRow(new StructType().add("value", "int"))(_.setInt(0, 123))
    reportFixedWidthRow(new StructType().add("value", "long"))(_.setLong(0, 123L))
    reportFixedWidthRow(new StructType().add("value", "float"))(_.setFloat(0, 123.456f))
    reportFixedWidthRow(new StructType().add("value", "double"))(_.setDouble(0, 123.456))
    reportFixedWidthRow(new StructType().add("value", "long").add("value2", "long")) { row =>
      row.setLong(0, 123L)
      row.setLong(1, 789L)
    }
    reportFixedWidthRow(new StructType().add("value", "double").add("value2", "double")) { row =>
      row.setDouble(0, 123.456)
      row.setDouble(1, 789.123)
    }

    // One to four string columns, each holding its own clone of the payload.
    reportStringRow(Seq("value"), dummyUtf8Str)
    reportStringRow(Seq("value", "value2"), dummyUtf8Str)
    reportStringRow(Seq("value", "value2", "value3"), dummyUtf8Str)
    reportStringRow(Seq("value", "value2", "value3", "value4"), dummyUtf8Str)

    // Struct column nested inside the row, next to a string and a double.
    reportNestedRow(dummyUtf8Str)
  }
}
/*
====== Dummy Data Information ======
INFO: estimated String size: 20971560
INFO: String size (length): 10485760
INFO: underlying byte array on String size (length): 10485760
INFO: estimated UTF8 String size: 10485840
INFO: UTF8 String (byte) size: 10485760
INFO: UTF8 String (char) size: 10485760
====== StructType() ======
Row underlying byte array size: 0
Row sizeInBytes: 0
Row estimated size: 56
Row estimated size (copied): 56
====== StructType(StructField(value,IntegerType,true)) ======
Row underlying byte array size: 16
Row sizeInBytes: 16
Row estimated size: 72
Row estimated size (copied): 72
====== StructType(StructField(value,LongType,true)) ======
Row underlying byte array size: 16
Row sizeInBytes: 16
Row estimated size: 72
Row estimated size (copied): 72
====== StructType(StructField(value,FloatType,true)) ======
Row underlying byte array size: 16
Row sizeInBytes: 16
Row estimated size: 72
Row estimated size (copied): 72
====== StructType(StructField(value,DoubleType,true)) ======
Row underlying byte array size: 16
Row sizeInBytes: 16
Row estimated size: 72
Row estimated size (copied): 72
====== StructType(StructField(value,LongType,true), StructField(value2,LongType,true)) ======
Row underlying byte array size: 24
Row sizeInBytes: 24
Row estimated size: 80
Row estimated size (copied): 80
====== StructType(StructField(value,DoubleType,true), StructField(value2,DoubleType,true)) ======
Row underlying byte array size: 24
Row sizeInBytes: 24
Row estimated size: 80
Row estimated size (copied): 80
====== StructType(StructField(value,StringType,true)) ======
Row underlying byte array size: 10485776
Row sizeInBytes: 10485776
Row estimated size: 20971608
Row estimated size (copied): 10485832
====== StructType(StructField(value,StringType,true), StructField(value2,StringType,true)) ======
Row underlying byte array size: 20971544
Row sizeInBytes: 20971544
Row estimated size: 20971624
Row estimated size (copied): 20971600
====== StructType(StructField(value,StringType,true), StructField(value2,StringType,true), StructField(value3,StringType,true)) ======
Row underlying byte array size: 31457312
Row sizeInBytes: 31457312
Row estimated size: 62914680
Row estimated size (copied): 31457368
====== StructType(StructField(value,StringType,true), StructField(value2,StringType,true), StructField(value3,StringType,true), StructField(value4,StringType,true)) ======
Row underlying byte array size: 41943080
Row sizeInBytes: 41943080
Row estimated size: 62914696
Row estimated size (copied): 41943136
====== StructType(StructField(nested,StructType(StructField(nestedIntField,IntegerType,true), StructField(nestedStringField,StringType,true)),true), StructField(stringField,StringType,true), StructField(doubleField,DoubleType,true)) ======
Row underlying byte array size: 20971576
Row sizeInBytes: 20971576
Row estimated size: 20971688
Row estimated size (copied): 20971632
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment