Skip to content

Instantly share code, notes, and snippets.

@ebenoist
Created February 5, 2017 21:43
Show Gist options
  • Save ebenoist/80542b95d1c5d0c0aaabaa37ee154922 to your computer and use it in GitHub Desktop.
Save ebenoist/80542b95d1c5d0c0aaabaa37ee154922 to your computer and use it in GitHub Desktop.
Convert a Protobuf Descriptor to a Spark SQL Schema
import com.google.protobuf.Descriptors;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.ArrayList;
import java.util.List;
/**
 * Converts Protocol Buffers message descriptors into Spark SQL schemas.
 *
 * <p>Each protobuf field becomes one {@link StructField} carrying the field's
 * name. Nested messages become nested {@link StructType}s, repeated fields
 * become arrays of their element type, and enums are represented as integers.
 *
 * <p>NOTE: nested messages are expanded recursively with no cycle detection,
 * so a message type that (directly or indirectly) contains itself will recurse
 * until the stack overflows.
 */
public class ProtoToStruct {

    /**
     * Builds the Spark SQL schema for a protobuf message type.
     *
     * @param descriptor descriptor of the message to convert
     * @return a struct type with one field per protobuf field, in declaration order
     */
    public static StructType convert(Descriptors.Descriptor descriptor) {
        return new StructType(schemaFor(descriptor));
    }

    /**
     * Returns the Spark SQL data type used to represent a protobuf field.
     *
     * <p>A repeated field is mapped to an array whose element type is the
     * mapping of the field's underlying type; any other field is mapped to
     * that underlying type directly.
     *
     * @param field the protobuf field to map
     * @return the Spark SQL type for the field
     * @throws IllegalArgumentException if the field's Java type is not recognised
     */
    public static DataType typeFor(Descriptors.FieldDescriptor field) {
        DataType elementType = elementTypeFor(field);
        // A repeated field holds a list of values, so the column must be an
        // array rather than a single scalar/struct.
        return field.isRepeated() ? DataTypes.createArrayType(elementType) : elementType;
    }

    /** Maps a field's protobuf Java type to a Spark SQL type, ignoring repetition. */
    private static DataType elementTypeFor(Descriptors.FieldDescriptor field) {
        Descriptors.FieldDescriptor.JavaType type = field.getJavaType();
        switch (type) {
            case BOOLEAN:
                return DataTypes.BooleanType;
            case BYTE_STRING:
                return DataTypes.BinaryType;
            case DOUBLE:
                return DataTypes.DoubleType;
            case ENUM:
                // Enums are exposed as integers rather than by name.
                return DataTypes.IntegerType;
            case FLOAT:
                return DataTypes.FloatType;
            case INT:
                return DataTypes.IntegerType;
            case LONG:
                return DataTypes.LongType;
            case MESSAGE:
                return new StructType(schemaFor(field.getMessageType()));
            case STRING:
                return DataTypes.StringType;
            default:
                throw new IllegalArgumentException(
                        "Unknown type: " + type + " for field " + field.getName());
        }
    }

    /**
     * Builds the Spark SQL fields for every field of a protobuf message type.
     *
     * <p>A field is marked nullable only when its protobuf Java type is
     * {@code MESSAGE}; all other fields are marked non-nullable.
     *
     * @param descriptor descriptor of the message whose fields are converted
     * @return one struct field per protobuf field, in declaration order
     */
    public static StructField[] schemaFor(Descriptors.Descriptor descriptor) {
        List<Descriptors.FieldDescriptor> protoFields = descriptor.getFields();
        List<StructField> structFields = new ArrayList<>(protoFields.size());
        for (Descriptors.FieldDescriptor fieldDescriptor : protoFields) {
            boolean nullable =
                    fieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.MESSAGE;
            // createStructField attaches empty metadata; passing null metadata
            // to the StructField constructor breaks schema serialisation.
            structFields.add(DataTypes.createStructField(
                    fieldDescriptor.getName(),
                    typeFor(fieldDescriptor),
                    nullable));
        }
        return structFields.toArray(new StructField[0]);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment