Last active
June 5, 2018 09:30
-
-
Save HeartSaVioR/556ea7db6740fa2fce7dae72a75d9618 to your computer and use it in GitHub Desktop.
Calculating size for various kinds of UnsafeRow
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package net.heartsavior.spark.trial | |
import java.nio.charset.StandardCharsets | |
import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeProjection, UnsafeRow} | |
import org.apache.spark.sql.types.StructType | |
import org.apache.spark.unsafe.types.UTF8String | |
import org.apache.spark.util.SizeEstimator | |
/**
 * Trial program that prints several size measurements for `UnsafeRow`s built from a range of
 * schemas: empty, one or two primitive columns, one to four string columns, and a nested struct.
 */
object SparkTrialCalculateUnsafeRowSize {

  /**
   * Prints the size figures of `row` under a header line showing `structType`.
   *
   * Four figures are printed: the length of `row.getBytes`, `row.getSizeInBytes`, and
   * `SizeEstimator.estimate` applied to the row itself and to `row.copy()`.
   *
   * @param structType schema shown in the header line
   * @param row        row to measure
   */
  def printSizeInfoForUnsafeRow(structType: StructType, row: UnsafeRow): Unit = {
    println(s"====== ${structType} ======")
    println(s"Row underlying byte array size: ${row.getBytes.length}")
    println(s"Row sizeInBytes: ${row.getSizeInBytes}")
    println(s"Row estimated size: ${SizeEstimator.estimate(row)}")
    println(s"Row estimated size (copied): ${SizeEstimator.estimate(row.copy())}")
  }

  /** Projects a freshly constructed `SpecificInternalRow` of `schema` on which nothing was set. */
  private def projectBlankRow(schema: StructType): UnsafeRow = {
    val toUnsafeRow = UnsafeProjection.create(schema)
    toUnsafeRow(new SpecificInternalRow(schema))
  }

  /**
   * Projects a blank row for `schema`, lets `assign` write values directly into the resulting
   * `UnsafeRow`, and prints its size figures.
   */
  private def reportBlankRow(schema: StructType)(assign: UnsafeRow => Unit): Unit = {
    val row = projectBlankRow(schema)
    assign(row)
    printSizeInfoForUnsafeRow(schema, row)
  }

  /** Builds a schema of `columns` string fields named "value", "value2", "value3", ... */
  private def stringSchema(columns: Int): StructType =
    (1 to columns).foldLeft(new StructType()) { (schema, n) =>
      schema.add(if (n == 1) "value" else s"value$n", "string")
    }

  /** Fills every field of `schema` with its own clone of `payload`, projects and prints sizes. */
  private def reportStringRow(schema: StructType, payload: UTF8String): Unit = {
    val toUnsafeRow = UnsafeProjection.create(schema)
    val input = new SpecificInternalRow(schema)
    schema.fields.indices.foreach(ordinal => input.update(ordinal, payload.clone()))
    printSizeInfoForUnsafeRow(schema, toUnsafeRow(input))
  }

  /**
   * Entry point: prints information about the dummy payload, then the size figures of a row
   * for each schema under test.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 10 MB of characters; built by repetition so no 10M-element intermediate collection exists.
    val dummyStr = "c" * (10 * 1024 * 1024)
    val dummyUtf8Str = UTF8String.fromString(dummyStr)

    println("====== Dummy Data Information ======")
    println(s"INFO: estimated String size: ${SizeEstimator.estimate(dummyStr)}")
    println(s"INFO: String size (length): ${dummyStr.length}")
    println(s"INFO: underlying byte array on String size (length): ${dummyStr.getBytes(StandardCharsets.UTF_8).length}")
    println(s"INFO: estimated UTF8 String size: ${SizeEstimator.estimate(dummyUtf8Str)}")
    println(s"INFO: UTF8 String (byte) size: ${dummyUtf8Str.numBytes()}")
    println(s"INFO: UTF8 String (char) size: ${dummyUtf8Str.numChars()}")

    // Empty schema and primitive columns: values are written into the UnsafeRow after projection.
    reportBlankRow(new StructType())(_ => ())
    reportBlankRow(new StructType().add("value", "int"))(_.setInt(0, 123))
    reportBlankRow(new StructType().add("value", "long"))(_.setLong(0, 123L))
    reportBlankRow(new StructType().add("value", "float"))(_.setFloat(0, 123.456f))
    reportBlankRow(new StructType().add("value", "double"))(_.setDouble(0, 123.456))
    reportBlankRow(new StructType().add("value", "long").add("value2", "long")) { row =>
      row.setLong(0, 123L)
      row.setLong(1, 789L)
    }
    reportBlankRow(new StructType().add("value", "double").add("value2", "double")) { row =>
      row.setDouble(0, 123.456)
      row.setDouble(1, 789.123)
    }

    // One to four string columns, each holding its own clone of the 10 MB payload.
    (1 to 4).foreach(columns => reportStringRow(stringSchema(columns), dummyUtf8Str))

    // Nested struct: the inner row is projected first and then stored as field 0 of the outer row.
    val nestedFieldSchema = new StructType()
      .add("nestedIntField", "int")
      .add("nestedStringField", "string")
    val nestedStructSchema = new StructType()
      .add("nested", nestedFieldSchema)
      .add("stringField", "string")
      .add("doubleField", "double")

    val nestedFieldToUnsafeRow = UnsafeProjection.create(nestedFieldSchema)
    val nestedFieldInternalRow = new SpecificInternalRow(nestedFieldSchema)
    nestedFieldInternalRow.setInt(0, 100)
    nestedFieldInternalRow.update(1, dummyUtf8Str.clone())
    val nestedRow = nestedFieldToUnsafeRow(nestedFieldInternalRow)

    val nestedStructToUnsafeRow = UnsafeProjection.create(nestedStructSchema)
    val nestedStructInternalRow = new SpecificInternalRow(nestedStructSchema)
    nestedStructInternalRow.update(0, nestedRow)
    nestedStructInternalRow.update(1, dummyUtf8Str.clone())
    nestedStructInternalRow.setDouble(2, 123.456)
    val nestedOuterRow = nestedStructToUnsafeRow(nestedStructInternalRow)
    printSizeInfoForUnsafeRow(nestedStructSchema, nestedOuterRow)
  }
}
/*
====== Dummy Data Information ======
INFO: estimated String size: 20971560
INFO: String size (length): 10485760
INFO: underlying byte array on String size (length): 10485760
INFO: estimated UTF8 String size: 10485840
INFO: UTF8 String (byte) size: 10485760
INFO: UTF8 String (char) size: 10485760
====== StructType() ======
Row underlying byte array size: 0
Row sizeInBytes: 0
Row estimated size: 56
Row estimated size (copied): 56
====== StructType(StructField(value,IntegerType,true)) ======
Row underlying byte array size: 16
Row sizeInBytes: 16
Row estimated size: 72
Row estimated size (copied): 72
====== StructType(StructField(value,LongType,true)) ======
Row underlying byte array size: 16
Row sizeInBytes: 16
Row estimated size: 72
Row estimated size (copied): 72
====== StructType(StructField(value,FloatType,true)) ======
Row underlying byte array size: 16
Row sizeInBytes: 16
Row estimated size: 72
Row estimated size (copied): 72
====== StructType(StructField(value,DoubleType,true)) ======
Row underlying byte array size: 16
Row sizeInBytes: 16
Row estimated size: 72
Row estimated size (copied): 72
====== StructType(StructField(value,LongType,true), StructField(value2,LongType,true)) ======
Row underlying byte array size: 24
Row sizeInBytes: 24
Row estimated size: 80
Row estimated size (copied): 80
====== StructType(StructField(value,DoubleType,true), StructField(value2,DoubleType,true)) ======
Row underlying byte array size: 24
Row sizeInBytes: 24
Row estimated size: 80
Row estimated size (copied): 80
====== StructType(StructField(value,StringType,true)) ======
Row underlying byte array size: 10485776
Row sizeInBytes: 10485776
Row estimated size: 20971608
Row estimated size (copied): 10485832
====== StructType(StructField(value,StringType,true), StructField(value2,StringType,true)) ======
Row underlying byte array size: 20971544
Row sizeInBytes: 20971544
Row estimated size: 20971624
Row estimated size (copied): 20971600
====== StructType(StructField(value,StringType,true), StructField(value2,StringType,true), StructField(value3,StringType,true)) ======
Row underlying byte array size: 31457312
Row sizeInBytes: 31457312
Row estimated size: 62914680
Row estimated size (copied): 31457368
====== StructType(StructField(value,StringType,true), StructField(value2,StringType,true), StructField(value3,StringType,true), StructField(value4,StringType,true)) ======
Row underlying byte array size: 41943080
Row sizeInBytes: 41943080
Row estimated size: 62914696
Row estimated size (copied): 41943136
====== StructType(StructField(nested,StructType(StructField(nestedIntField,IntegerType,true), StructField(nestedStringField,StringType,true)),true), StructField(stringField,StringType,true), StructField(doubleField,DoubleType,true)) ======
Row underlying byte array size: 20971576
Row sizeInBytes: 20971576
Row estimated size: 20971688
Row estimated size (copied): 20971632
*/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment